No OneTemporary
Actions

Size

4 MB

Referenced Files

None

Subscribers

None

View Options

This file is larger than 256 KB, so syntax highlighting was skipped.

	diff --git a/contrib/llvm-project/clang/include/clang/Basic/TargetOptions.h b/contrib/llvm-project/clang/include/clang/Basic/TargetOptions.h
	index bbe86aebb074..4a5d469b8e54 100644
	--- a/contrib/llvm-project/clang/include/clang/Basic/TargetOptions.h
	+++ b/contrib/llvm-project/clang/include/clang/Basic/TargetOptions.h
	@@ -1,88 +1,92 @@
	//===--- TargetOptions.h ----------------------------------------- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	///
	/// \file
	/// Defines the clang::TargetOptions class.
	///
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CLANG_BASIC_TARGETOPTIONS_H
	#define LLVM_CLANG_BASIC_TARGETOPTIONS_H

	#include "clang/Basic/OpenCLOptions.h"
	#include "llvm/Support/VersionTuple.h"
	#include "llvm/Target/TargetOptions.h"
	#include <string>
	#include <vector>

	namespace clang {

	/// Options for controlling the target.
	class TargetOptions {
	public:
	/// The name of the target triple to compile for.
	std::string Triple;

	/// When compiling for the device side, contains the triple used to compile
	/// for the host.
	std::string HostTriple;

	/// If given, the name of the target CPU to generate code for.
	std::string CPU;

	/// If given, the unit to use for floating point math.
	std::string FPMath;

	/// If given, the name of the target ABI to use.
	std::string ABI;

	/// The EABI version to use.
	// NOTE(review): unlike the bool members below, this member has no default
	// member initializer -- confirm every construction path initializes it.
	llvm::EABI EABIVersion;

	/// If given, the version string of the linker in use.
	std::string LinkerVersion;

	/// The list of target specific features to enable or disable, as written on the command line.
	std::vector<std::string> FeaturesAsWritten;

	/// The list of target specific features to enable or disable -- this should
	/// be a list of strings starting with '+' or '-'.
	std::vector<std::string> Features;

	+ /// The map of which features have been enabled or disabled based on the
	+ /// command line.
	+ llvm::StringMap<bool> FeatureMap;
	+
	/// Supported OpenCL extensions and optional core features.
	OpenCLOptions SupportedOpenCLOptions;

	/// The list of OpenCL extensions to enable or disable, as written on
	/// the command line.
	std::vector<std::string> OpenCLExtensionsAsWritten;

	/// If given, enables support for __int128_t and __uint128_t types.
	bool ForceEnableInt128 = false;

	/// \brief If enabled, use 32-bit pointers for accessing const/local/shared
	/// address space.
	bool NVPTXUseShortPointers = false;

	/// The code model to be used as specified by the user. Corresponds to
	/// CodeModel::Model enum defined in include/llvm/Support/CodeGen.h, plus
	/// "default" for the case when the user has not explicitly specified a
	/// code model.
	std::string CodeModel;

	/// The version of the SDK which was used during the compilation.
	/// The option is used for two different purposes:
	/// * on darwin the version is propagated to LLVM where it's used
	/// to support SDK Version metadata (See D55673).
	/// * CUDA compilation uses it to control parts of CUDA compilation
	/// in clang that depend on specific version of the CUDA SDK.
	llvm::VersionTuple SDKVersion;
	};

	} // end namespace clang

	#endif
	diff --git a/contrib/llvm-project/clang/include/clang/Driver/Options.td b/contrib/llvm-project/clang/include/clang/Driver/Options.td
	index b20b8a288221..f818acb39d51 100644
	--- a/contrib/llvm-project/clang/include/clang/Driver/Options.td
	+++ b/contrib/llvm-project/clang/include/clang/Driver/Options.td
	@@ -1,4850 +1,4850 @@
	//===--- Options.td - Options for clang -----------------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the options accepted by clang.
	//
	//===----------------------------------------------------------------------===//

	// Include the common option parsing interfaces.
	include "llvm/Option/OptParser.td"

	/////////
	// Flags

	// DriverOption - The option is a "driver" option, and should not be forwarded
	// to other tools.
	def DriverOption : OptionFlag;

	// LinkerInput - The option is a linker input.
	def LinkerInput : OptionFlag;

	// NoArgumentUnused - Don't report argument unused warnings for this option; this
	// is useful for options like -static or -dynamic which a user may always end up
	// passing, even if the platform defaults to (or only supports) that option.
	def NoArgumentUnused : OptionFlag;

	// Unsupported - The option is unsupported, and the driver will reject command
	// lines that use it.
	def Unsupported : OptionFlag;

	// Ignored - The option is unsupported, and the driver will silently ignore it.
	def Ignored : OptionFlag;

	// CoreOption - This is considered a "core" Clang option, available in both
	// clang and clang-cl modes.
	def CoreOption : OptionFlag;

	// CLOption - This is a cl.exe compatibility option. Options with this flag
	// are made available when the driver is running in CL compatibility mode.
	def CLOption : OptionFlag;

	// CC1Option - This option should be accepted by clang -cc1.
	def CC1Option : OptionFlag;

	// CC1AsOption - This option should be accepted by clang -cc1as.
	def CC1AsOption : OptionFlag;

	// NoDriverOption - This option should not be accepted by the driver.
	def NoDriverOption : OptionFlag;

	// A short name to show in documentation. The name will be interpreted as rST.
	class DocName<string name> { string DocName = name; }

	// A brief description to show in documentation, interpreted as rST.
	class DocBrief<code descr> { code DocBrief = descr; }

	// Indicates that this group should be flattened into its parent when generating
	// documentation.
	class DocFlatten { bit DocFlatten = 1; }

	// Indicates that this option is ignored, but accepted with a warning for
	// GCC compatibility.
	class IgnoredGCCCompat : Flags<[HelpHidden]> {}

	/////////
	// Groups

	def Action_Group : OptionGroup<"<action group>">, DocName<"Actions">,
	DocBrief<[{The action to perform on the input.}]>;

	// Meta-group for options which are only used for compilation,
	// and not linking etc.
	def CompileOnly_Group : OptionGroup<"<CompileOnly group>">,
	DocName<"Compilation flags">, DocBrief<[{
	Flags controlling the behavior of Clang during compilation. These flags have
	no effect during actions that do not perform compilation.}]>;

	def Preprocessor_Group : OptionGroup<"<Preprocessor group>">,
	Group<CompileOnly_Group>,
	DocName<"Preprocessor flags">, DocBrief<[{
	Flags controlling the behavior of the Clang preprocessor.}]>;

	def IncludePath_Group : OptionGroup<"<I/i group>">, Group<Preprocessor_Group>,
	DocName<"Include path management">,
	DocBrief<[{
	Flags controlling how ``#include``\s are resolved to files.}]>;

	def I_Group : OptionGroup<"<I group>">, Group<IncludePath_Group>, DocFlatten;
	def i_Group : OptionGroup<"<i group>">, Group<IncludePath_Group>, DocFlatten;
	def clang_i_Group : OptionGroup<"<clang i group>">, Group<i_Group>, DocFlatten;

	def M_Group : OptionGroup<"<M group>">, Group<Preprocessor_Group>,
	DocName<"Dependency file generation">, DocBrief<[{
	Flags controlling generation of a dependency file for ``make``-like build
	systems.}]>;

	def d_Group : OptionGroup<"<d group>">, Group<Preprocessor_Group>,
	DocName<"Dumping preprocessor state">, DocBrief<[{
	Flags allowing the state of the preprocessor to be dumped in various ways.}]>;

	def Diag_Group : OptionGroup<"<W/R group>">, Group<CompileOnly_Group>,
	DocName<"Diagnostic flags">, DocBrief<[{
	Flags controlling which warnings, errors, and remarks Clang will generate.
	See the :doc:`full list of warning and remark flags <DiagnosticsReference>`.}]>;

	def R_Group : OptionGroup<"<R group>">, Group<Diag_Group>, DocFlatten;
	def R_value_Group : OptionGroup<"<R (with value) group>">, Group<R_Group>,
	DocFlatten;
	def W_Group : OptionGroup<"<W group>">, Group<Diag_Group>, DocFlatten;
	def W_value_Group : OptionGroup<"<W (with value) group>">, Group<W_Group>,
	DocFlatten;

	def f_Group : OptionGroup<"<f group>">, Group<CompileOnly_Group>,
	DocName<"Target-independent compilation options">;

	def f_clang_Group : OptionGroup<"<f (clang-only) group>">,
	Group<CompileOnly_Group>, DocFlatten;
	def pedantic_Group : OptionGroup<"<pedantic group>">, Group<f_Group>,
	DocFlatten;
	def opencl_Group : OptionGroup<"<opencl group>">, Group<f_Group>,
	DocName<"OpenCL flags">;

	def sycl_Group : OptionGroup<"<SYCL group>">, Group<f_Group>,
	DocName<"SYCL flags">;

	def m_Group : OptionGroup<"<m group>">, Group<CompileOnly_Group>,
	DocName<"Target-dependent compilation options">;

	// Feature groups - these take command line options that correspond directly to
	// target specific features and can be translated directly from command line
	// options.
	def m_aarch64_Features_Group : OptionGroup<"<aarch64 features group>">,
	Group<m_Group>, DocName<"AARCH64">;
	def m_amdgpu_Features_Group : OptionGroup<"<amdgpu features group>">,
	Group<m_Group>, DocName<"AMDGPU">;
	def m_arm_Features_Group : OptionGroup<"<arm features group>">,
	Group<m_Group>, DocName<"ARM">;
	def m_hexagon_Features_Group : OptionGroup<"<hexagon features group>">,
	Group<m_Group>, DocName<"Hexagon">;
	// The features added by this group will not be added to target features.
	// These are explicitly handled.
	// NOTE(review): the group name string below is identical to the one used by
	// m_hexagon_Features_Group ("<hexagon features group>") -- confirm that the
	// shared name is intentional.
	def m_hexagon_Features_HVX_Group : OptionGroup<"<hexagon features group>">,
	Group<m_Group>, DocName<"Hexagon">;
	def m_mips_Features_Group : OptionGroup<"<mips features group>">,
	Group<m_Group>, DocName<"MIPS">;
	def m_ppc_Features_Group : OptionGroup<"<ppc features group>">,
	Group<m_Group>, DocName<"PowerPC">;
	def m_wasm_Features_Group : OptionGroup<"<wasm features group>">,
	Group<m_Group>, DocName<"WebAssembly">;
	// The features added by this group will not be added to target features.
	// These are explicitly handled.
	def m_wasm_Features_Driver_Group : OptionGroup<"<wasm driver features group>">,
	Group<m_Group>, DocName<"WebAssembly Driver">;
	def m_x86_Features_Group : OptionGroup<"<x86 features group>">,
	Group<m_Group>, Flags<[CoreOption]>, DocName<"X86">;
	def m_riscv_Features_Group : OptionGroup<"<riscv features group>">,
	Group<m_Group>, DocName<"RISCV">;

	def m_libc_Group : OptionGroup<"<m libc group>">, Group<m_mips_Features_Group>,
	Flags<[HelpHidden]>;

	def O_Group : OptionGroup<"<O group>">, Group<CompileOnly_Group>,
	DocName<"Optimization level">, DocBrief<[{
	Flags controlling how much optimization should be performed.}]>;

	def DebugInfo_Group : OptionGroup<"<g group>">, Group<CompileOnly_Group>,
	DocName<"Debug information generation">, DocBrief<[{
	Flags controlling how much and what kind of debug information should be
	generated.}]>;

	// NOTE(review): this group's name string "<g group>" is the same as
	// DebugInfo_Group's -- confirm that the shared name is intentional.
	def g_Group : OptionGroup<"<g group>">, Group<DebugInfo_Group>,
	DocName<"Kind and level of debug information">;
	def gN_Group : OptionGroup<"<gN group>">, Group<g_Group>,
	DocName<"Debug level">;
	def ggdbN_Group : OptionGroup<"<ggdbN group>">, Group<gN_Group>, DocFlatten;
	def gTune_Group : OptionGroup<"<gTune group>">, Group<g_Group>,
	DocName<"Debugger to tune debug information for">;
	def g_flags_Group : OptionGroup<"<g flags group>">, Group<DebugInfo_Group>,
	DocName<"Debug information flags">;

	def StaticAnalyzer_Group : OptionGroup<"<Static analyzer group>">,
	DocName<"Static analyzer flags">, DocBrief<[{
	Flags controlling the behavior of the Clang Static Analyzer.}]>;

	// gfortran options that we recognize in the driver and pass along when
	// invoking GCC to compile Fortran code.
	def gfortran_Group : OptionGroup<"<gfortran group>">,
	DocName<"Fortran compilation flags">, DocBrief<[{
	Flags that will be passed onto the ``gfortran`` compiler when Clang is given
	a Fortran input.}]>;

	def Link_Group : OptionGroup<"<T/e/s/t/u group>">, DocName<"Linker flags">,
	DocBrief<[{Flags that are passed on to the linker}]>;
	def T_Group : OptionGroup<"<T group>">, Group<Link_Group>, DocFlatten;
	def u_Group : OptionGroup<"<u group>">, Group<Link_Group>, DocFlatten;

	def reserved_lib_Group : OptionGroup<"<reserved libs group>">,
	Flags<[Unsupported]>;

	// Temporary groups for clang options which we know we don't support,
	// but don't want to verbosely warn the user about.
	def clang_ignored_f_Group : OptionGroup<"<clang ignored f group>">,
	Group<f_Group>, Flags<[Ignored]>;
	def clang_ignored_m_Group : OptionGroup<"<clang ignored m group>">,
	Group<m_Group>, Flags<[Ignored]>;

	// Group for clang options in the process of deprecation.
	// Please include the version that deprecated the flag as a comment to allow
	// easier garbage collection.
	def clang_ignored_legacy_options_Group : OptionGroup<"<clang legacy flags>">,
	Group<f_Group>, Flags<[Ignored]>;

	// Retired with clang-5.0
	def : Flag<["-"], "fslp-vectorize-aggressive">, Group<clang_ignored_legacy_options_Group>;
	def : Flag<["-"], "fno-slp-vectorize-aggressive">, Group<clang_ignored_legacy_options_Group>;

	// Retired with clang-10.0. Previously controlled X86 MPX ISA.
	def mmpx : Flag<["-"], "mmpx">, Group<clang_ignored_legacy_options_Group>;
	def mno_mpx : Flag<["-"], "mno-mpx">, Group<clang_ignored_legacy_options_Group>;

	// Group that ignores all gcc optimizations that won't be implemented
	def clang_ignored_gcc_optimization_f_Group : OptionGroup<
	"<clang_ignored_gcc_optimization_f_Group>">, Group<f_Group>, Flags<[Ignored]>;

	// A boolean option which is opt-in in CC1. The positive option exists in CC1 and
	// Args.hasArg(OPT_ffoo) is used to check that the flag is enabled.
	// This is useful if the option is usually disabled.
	//
	// Defines two records, f<NAME> ("-f<name>") and fno_<NAME> ("-fno-<name>"),
	// both in f_Group. The help text of each is pos_prefix (resp. neg_prefix)
	// concatenated with help; flags is applied to both records.
	multiclass OptInFFlag<string name, string pos_prefix, string neg_prefix="",
	string help="", list<OptionFlag> flags=[]> {
	// Positive form: CC1Option is added on top of the caller-supplied flags.
	def f#NAME : Flag<["-"], "f"#name>, Flags<!listconcat([CC1Option], flags)>,
	Group<f_Group>, HelpText<!strconcat(pos_prefix, help)>;
	// Negative form: only the caller-supplied flags (no CC1Option added).
	def fno_#NAME : Flag<["-"], "fno-"#name>, Flags<flags>,
	Group<f_Group>, HelpText<!strconcat(neg_prefix, help)>;
	}

	// A boolean option which is opt-out in CC1. The negative option exists in CC1 and
	// Args.hasArg(OPT_fno_foo) is used to check that the flag is disabled.
	//
	// Defines two records, f<NAME> ("-f<name>") and fno_<NAME> ("-fno-<name>"),
	// both in f_Group. The help text of each is pos_prefix (resp. neg_prefix)
	// concatenated with help; flags is applied to both records.
	multiclass OptOutFFlag<string name, string pos_prefix, string neg_prefix,
	string help="", list<OptionFlag> flags=[]> {
	// Positive form: only the caller-supplied flags (no CC1Option added).
	def f#NAME : Flag<["-"], "f"#name>, Flags<flags>,
	Group<f_Group>, HelpText<!strconcat(pos_prefix, help)>;
	// Negative form: CC1Option is added on top of the caller-supplied flags.
	def fno_#NAME : Flag<["-"], "fno-"#name>, Flags<!listconcat([CC1Option], flags)>,
	Group<f_Group>, HelpText<!strconcat(neg_prefix, help)>;
	}

	/////////
	// Options

	// The internal option ID must be a valid C++ identifier and results in a
	// clang::driver::options::OPT_XX enum constant for XX.
	//
	// We want to unambiguously be able to refer to options from the driver source
	// code, for this reason the option name is mangled into an ID. This mangling
	// isn't guaranteed to have an inverse, but for practical purposes it does.
	//
	// The mangling scheme is to ignore the leading '-', and perform the following
	// substitutions:
	// _ => __
	// - => _
	// / => _SLASH
	// # => _HASH
	// ? => _QUESTION
	// , => _COMMA
	// = => _EQ
	// C++ => CXX
	// . => _

	// Developer Driver Options

	def internal_Group : OptionGroup<"<clang internal options>">, Flags<[HelpHidden]>;
	def internal_driver_Group : OptionGroup<"<clang driver internal options>">,
	Group<internal_Group>, HelpText<"DRIVER OPTIONS">;
	def internal_debug_Group :
	OptionGroup<"<clang debug/development internal options>">,
	Group<internal_Group>, HelpText<"DEBUG/DEVELOPMENT OPTIONS">;

	class InternalDriverOpt : Group<internal_driver_Group>,
	Flags<[DriverOption, HelpHidden]>;
	def driver_mode : Joined<["--"], "driver-mode=">, Group<internal_driver_Group>,
	Flags<[CoreOption, DriverOption, HelpHidden]>,
	HelpText<"Set the driver mode to either 'gcc', 'g++', 'cpp', or 'cl'">;
	def rsp_quoting : Joined<["--"], "rsp-quoting=">, Group<internal_driver_Group>,
	Flags<[CoreOption, DriverOption, HelpHidden]>,
	HelpText<"Set the rsp quoting to either 'posix', or 'windows'">;
	def ccc_gcc_name : Separate<["-"], "ccc-gcc-name">, InternalDriverOpt,
	HelpText<"Name for native GCC compiler">,
	MetaVarName<"<gcc-path>">;

	class InternalDebugOpt : Group<internal_debug_Group>,
	Flags<[DriverOption, HelpHidden, CoreOption]>;
	def ccc_install_dir : Separate<["-"], "ccc-install-dir">, InternalDebugOpt,
	HelpText<"Simulate installation in the given directory">;
	def ccc_print_phases : Flag<["-"], "ccc-print-phases">, InternalDebugOpt,
	HelpText<"Dump list of actions to perform">;
	def ccc_print_bindings : Flag<["-"], "ccc-print-bindings">, InternalDebugOpt,
	HelpText<"Show bindings of tools to actions">;

	def ccc_arcmt_check : Flag<["-"], "ccc-arcmt-check">, InternalDriverOpt,
	HelpText<"Check for ARC migration issues that need manual handling">;
	def ccc_arcmt_modify : Flag<["-"], "ccc-arcmt-modify">, InternalDriverOpt,
	HelpText<"Apply modifications to files to conform to ARC">;
	def ccc_arcmt_migrate : Separate<["-"], "ccc-arcmt-migrate">, InternalDriverOpt,
	HelpText<"Apply modifications and produces temporary files that conform to ARC">;
	def arcmt_migrate_report_output : Separate<["-"], "arcmt-migrate-report-output">,
	HelpText<"Output path for the plist report">, Flags<[CC1Option]>;
	def arcmt_migrate_emit_arc_errors : Flag<["-"], "arcmt-migrate-emit-errors">,
	HelpText<"Emit ARC errors even if the migrator can fix them">,
	Flags<[CC1Option]>;
	def gen_reproducer: Flag<["-"], "gen-reproducer">, InternalDebugOpt,
	HelpText<"Auto-generates preprocessed source files and a reproduction script">;
	def gen_cdb_fragment_path: Separate<["-"], "gen-cdb-fragment-path">, InternalDebugOpt,
	HelpText<"Emit a compilation database fragment to the specified directory">;

	def _migrate : Flag<["--"], "migrate">, Flags<[DriverOption]>,
	HelpText<"Run the migrator">;
	def ccc_objcmt_migrate : Separate<["-"], "ccc-objcmt-migrate">,
	InternalDriverOpt,
	HelpText<"Apply modifications and produces temporary files to migrate to "
	"modern ObjC syntax">;
	def objcmt_migrate_literals : Flag<["-"], "objcmt-migrate-literals">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC literals">;
	def objcmt_migrate_subscripting : Flag<["-"], "objcmt-migrate-subscripting">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC subscripting">;
	def objcmt_migrate_property : Flag<["-"], "objcmt-migrate-property">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC property">;
	def objcmt_migrate_all : Flag<["-"], "objcmt-migrate-all">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC">;
	def objcmt_migrate_readonly_property : Flag<["-"], "objcmt-migrate-readonly-property">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC readonly property">;
	def objcmt_migrate_readwrite_property : Flag<["-"], "objcmt-migrate-readwrite-property">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC readwrite property">;
	def objcmt_migrate_property_dot_syntax : Flag<["-"], "objcmt-migrate-property-dot-syntax">, Flags<[CC1Option]>,
	HelpText<"Enable migration of setter/getter messages to property-dot syntax">;
	def objcmt_migrate_annotation : Flag<["-"], "objcmt-migrate-annotation">, Flags<[CC1Option]>,
	HelpText<"Enable migration to property and method annotations">;
	def objcmt_migrate_instancetype : Flag<["-"], "objcmt-migrate-instancetype">, Flags<[CC1Option]>,
	HelpText<"Enable migration to infer instancetype for method result type">;
	def objcmt_migrate_nsmacros : Flag<["-"], "objcmt-migrate-ns-macros">, Flags<[CC1Option]>,
	HelpText<"Enable migration to NS_ENUM/NS_OPTIONS macros">;
	def objcmt_migrate_protocol_conformance : Flag<["-"], "objcmt-migrate-protocol-conformance">, Flags<[CC1Option]>,
	HelpText<"Enable migration to add protocol conformance on classes">;
	def objcmt_atomic_property : Flag<["-"], "objcmt-atomic-property">, Flags<[CC1Option]>,
	HelpText<"Make migration to 'atomic' properties">;
	def objcmt_returns_innerpointer_property : Flag<["-"], "objcmt-returns-innerpointer-property">, Flags<[CC1Option]>,
	HelpText<"Enable migration to annotate property with NS_RETURNS_INNER_POINTER">;
	def objcmt_ns_nonatomic_iosonly: Flag<["-"], "objcmt-ns-nonatomic-iosonly">, Flags<[CC1Option]>,
	HelpText<"Enable migration to use NS_NONATOMIC_IOSONLY macro for setting property's 'atomic' attribute">;
	def objcmt_migrate_designated_init : Flag<["-"], "objcmt-migrate-designated-init">, Flags<[CC1Option]>,
	HelpText<"Enable migration to infer NS_DESIGNATED_INITIALIZER for initializer methods">;
	def objcmt_whitelist_dir_path: Joined<["-"], "objcmt-whitelist-dir-path=">, Flags<[CC1Option]>,
	HelpText<"Only modify files with a filename contained in the provided directory path">;
	// The misspelt "white-list" [sic] alias is due for removal.
	def : Joined<["-"], "objcmt-white-list-dir-path=">, Flags<[CC1Option]>,
	Alias<objcmt_whitelist_dir_path>;

	// Make sure all other -ccc- options are rejected.
	def ccc_ : Joined<["-"], "ccc-">, Group<internal_Group>, Flags<[Unsupported]>;

	// Standard Options

	def _HASH_HASH_HASH : Flag<["-"], "###">, Flags<[DriverOption, CoreOption]>,
	HelpText<"Print (but do not run) the commands to run for this compilation">;
	def _DASH_DASH : Option<["--"], "", KIND_REMAINING_ARGS>,
	Flags<[DriverOption, CoreOption]>;
	def A : JoinedOrSeparate<["-"], "A">, Flags<[RenderJoined]>, Group<gfortran_Group>;
	def B : JoinedOrSeparate<["-"], "B">, MetaVarName<"<dir>">,
	HelpText<"Add <dir> to search path for binaries and object files used implicitly">;
	def CC : Flag<["-"], "CC">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
	HelpText<"Include comments from within macros in preprocessed output">;
	def C : Flag<["-"], "C">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
	HelpText<"Include comments in preprocessed output">;
	def D : JoinedOrSeparate<["-"], "D">, Group<Preprocessor_Group>,
	Flags<[CC1Option]>, MetaVarName<"<macro>=<value>">,
	HelpText<"Define <macro> to <value> (or 1 if <value> omitted)">;
	def E : Flag<["-"], "E">, Flags<[DriverOption,CC1Option]>, Group<Action_Group>,
	HelpText<"Only run the preprocessor">;
	def F : JoinedOrSeparate<["-"], "F">, Flags<[RenderJoined,CC1Option]>,
	HelpText<"Add directory to framework include search path">;
	def G : JoinedOrSeparate<["-"], "G">, Flags<[DriverOption]>, Group<m_Group>,
	MetaVarName<"<size>">, HelpText<"Put objects of at most <size> bytes "
	"into small data section (MIPS / Hexagon)">;
	def G_EQ : Joined<["-"], "G=">, Flags<[DriverOption]>, Group<m_Group>, Alias<G>;
	def H : Flag<["-"], "H">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
	HelpText<"Show header includes and nesting depth">;
	def I_ : Flag<["-"], "I-">, Group<I_Group>,
	HelpText<"Restrict all prior -I flags to double-quoted inclusion and "
	"remove current directory from include path">;
	def I : JoinedOrSeparate<["-"], "I">, Group<I_Group>,
	Flags<[CC1Option,CC1AsOption]>, MetaVarName<"<dir>">,
	HelpText<"Add directory to include search path">;
	def L : JoinedOrSeparate<["-"], "L">, Flags<[RenderJoined]>, Group<Link_Group>,
	MetaVarName<"<dir>">, HelpText<"Add directory to library search path">;
	def MD : Flag<["-"], "MD">, Group<M_Group>,
	HelpText<"Write a depfile containing user and system headers">;
	def MMD : Flag<["-"], "MMD">, Group<M_Group>,
	HelpText<"Write a depfile containing user headers">;
	def M : Flag<["-"], "M">, Group<M_Group>,
	HelpText<"Like -MD, but also implies -E and writes to stdout by default">;
	def MM : Flag<["-"], "MM">, Group<M_Group>,
	HelpText<"Like -MMD, but also implies -E and writes to stdout by default">;
	def MF : JoinedOrSeparate<["-"], "MF">, Group<M_Group>,
	HelpText<"Write depfile output from -MMD, -MD, -MM, or -M to <file>">,
	MetaVarName<"<file>">;
	def MG : Flag<["-"], "MG">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Add missing headers to depfile">;
	def MJ : JoinedOrSeparate<["-"], "MJ">, Group<M_Group>,
	HelpText<"Write a compilation database entry per input">;
	def MP : Flag<["-"], "MP">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Create phony target for each dependency (other than main file)">;
	def MQ : JoinedOrSeparate<["-"], "MQ">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Specify name of main file output to quote in depfile">;
	def MT : JoinedOrSeparate<["-"], "MT">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Specify name of main file output in depfile">;
	def MV : Flag<["-"], "MV">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Use NMake/Jom format for the depfile">;
	def Mach : Flag<["-"], "Mach">, Group<Link_Group>;
	def O0 : Flag<["-"], "O0">, Group<O_Group>, Flags<[CC1Option, HelpHidden]>;
	def O4 : Flag<["-"], "O4">, Group<O_Group>, Flags<[CC1Option, HelpHidden]>;
	def ObjCXX : Flag<["-"], "ObjC++">, Flags<[DriverOption]>,
	HelpText<"Treat source input files as Objective-C++ inputs">;
	def ObjC : Flag<["-"], "ObjC">, Flags<[DriverOption]>,
	HelpText<"Treat source input files as Objective-C inputs">;
	def O : Joined<["-"], "O">, Group<O_Group>, Flags<[CC1Option]>;
	def O_flag : Flag<["-"], "O">, Flags<[CC1Option]>, Alias<O>, AliasArgs<["1"]>;
	def Ofast : Joined<["-"], "Ofast">, Group<O_Group>, Flags<[CC1Option]>;
	def P : Flag<["-"], "P">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
	HelpText<"Disable linemarker output in -E mode">;
	def Qy : Flag<["-"], "Qy">, Flags<[CC1Option]>,
	HelpText<"Emit metadata containing compiler name and version">;
	def Qn : Flag<["-"], "Qn">, Flags<[CC1Option]>,
	HelpText<"Do not emit metadata containing compiler name and version">;
	def : Flag<["-"], "fident">, Group<f_Group>, Alias<Qy>, Flags<[CC1Option]>;
	def : Flag<["-"], "fno-ident">, Group<f_Group>, Alias<Qn>, Flags<[CC1Option]>;
	def Qunused_arguments : Flag<["-"], "Qunused-arguments">, Flags<[DriverOption, CoreOption]>,
	HelpText<"Don't emit warning for unused driver arguments">;
	def Q : Flag<["-"], "Q">, IgnoredGCCCompat;
	def Rpass_EQ : Joined<["-"], "Rpass=">, Group<R_value_Group>, Flags<[CC1Option]>,
	HelpText<"Report transformations performed by optimization passes whose "
	"name matches the given POSIX regular expression">;
	def Rpass_missed_EQ : Joined<["-"], "Rpass-missed=">, Group<R_value_Group>,
	Flags<[CC1Option]>,
	HelpText<"Report missed transformations by optimization passes whose "
	"name matches the given POSIX regular expression">;
	def Rpass_analysis_EQ : Joined<["-"], "Rpass-analysis=">, Group<R_value_Group>,
	Flags<[CC1Option]>,
	HelpText<"Report transformation analysis from optimization passes whose "
	"name matches the given POSIX regular expression">;
	def R_Joined : Joined<["-"], "R">, Group<R_Group>, Flags<[CC1Option, CoreOption]>,
	MetaVarName<"<remark>">, HelpText<"Enable the specified remark">;
	def S : Flag<["-"], "S">, Flags<[DriverOption,CC1Option]>, Group<Action_Group>,
	HelpText<"Only run preprocess and compilation steps">;
	def Tbss : JoinedOrSeparate<["-"], "Tbss">, Group<T_Group>,
	MetaVarName<"<addr>">, HelpText<"Set starting address of BSS to <addr>">;
	def Tdata : JoinedOrSeparate<["-"], "Tdata">, Group<T_Group>,
	MetaVarName<"<addr>">, HelpText<"Set starting address of DATA to <addr>">;
	def Ttext : JoinedOrSeparate<["-"], "Ttext">, Group<T_Group>,
	MetaVarName<"<addr>">, HelpText<"Set starting address of TEXT to <addr>">;
	def T : JoinedOrSeparate<["-"], "T">, Group<T_Group>,
	MetaVarName<"<script>">, HelpText<"Specify <script> as linker script">;
	def U : JoinedOrSeparate<["-"], "U">, Group<Preprocessor_Group>,
	Flags<[CC1Option]>, MetaVarName<"<macro>">, HelpText<"Undefine macro <macro>">;
	def V : JoinedOrSeparate<["-"], "V">, Flags<[DriverOption, Unsupported]>;
	def Wa_COMMA : CommaJoined<["-"], "Wa,">,
	HelpText<"Pass the comma separated arguments in <arg> to the assembler">,
	MetaVarName<"<arg>">;
	def Wall : Flag<["-"], "Wall">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
	def WCL4 : Flag<["-"], "WCL4">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
	def Wdeprecated : Flag<["-"], "Wdeprecated">, Group<W_Group>, Flags<[CC1Option]>,
	HelpText<"Enable warnings for deprecated constructs and define __DEPRECATED">;
	def Wno_deprecated : Flag<["-"], "Wno-deprecated">, Group<W_Group>, Flags<[CC1Option]>;
	def Wl_COMMA : CommaJoined<["-"], "Wl,">, Flags<[LinkerInput, RenderAsInput]>,
	HelpText<"Pass the comma separated arguments in <arg> to the linker">,
	MetaVarName<"<arg>">, Group<Link_Group>;
	// FIXME: This is broken; these should not be Joined arguments.
	def Wno_nonportable_cfstrings : Joined<["-"], "Wno-nonportable-cfstrings">, Group<W_Group>,
	Flags<[CC1Option]>;
	def Wnonportable_cfstrings : Joined<["-"], "Wnonportable-cfstrings">, Group<W_Group>,
	Flags<[CC1Option]>;
	def Wp_COMMA : CommaJoined<["-"], "Wp,">,
	HelpText<"Pass the comma separated arguments in <arg> to the preprocessor">,
	MetaVarName<"<arg>">, Group<Preprocessor_Group>;
	def Wundef_prefix_EQ : CommaJoined<["-"], "Wundef-prefix=">, Group<W_value_Group>,
	Flags<[CC1Option, CoreOption, HelpHidden]>, MetaVarName<"<arg>">,
	HelpText<"Enable warnings for undefined macros with a prefix in the comma separated list <arg>">;
	def Wwrite_strings : Flag<["-"], "Wwrite-strings">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
	def Wno_write_strings : Flag<["-"], "Wno-write-strings">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
	def W_Joined : Joined<["-"], "W">, Group<W_Group>, Flags<[CC1Option, CoreOption]>,
	MetaVarName<"<warning>">, HelpText<"Enable the specified warning">;
	def Xanalyzer : Separate<["-"], "Xanalyzer">,
	HelpText<"Pass <arg> to the static analyzer">, MetaVarName<"<arg>">,
	Group<StaticAnalyzer_Group>;
	def Xarch__ : JoinedAndSeparate<["-"], "Xarch_">, Flags<[DriverOption]>;
	def Xarch_host : Separate<["-"], "Xarch_host">, Flags<[DriverOption]>,
	HelpText<"Pass <arg> to the CUDA/HIP host compilation">, MetaVarName<"<arg>">;
	def Xarch_device : Separate<["-"], "Xarch_device">, Flags<[DriverOption]>,
	HelpText<"Pass <arg> to the CUDA/HIP device compilation">, MetaVarName<"<arg>">;
	def Xassembler : Separate<["-"], "Xassembler">,
	HelpText<"Pass <arg> to the assembler">, MetaVarName<"<arg>">,
	Group<CompileOnly_Group>;
	def Xclang : Separate<["-"], "Xclang">,
	HelpText<"Pass <arg> to the clang compiler">, MetaVarName<"<arg>">,
	Flags<[DriverOption, CoreOption]>, Group<CompileOnly_Group>;
	def Xcuda_fatbinary : Separate<["-"], "Xcuda-fatbinary">,
	HelpText<"Pass <arg> to fatbinary invocation">, MetaVarName<"<arg>">;
	def Xcuda_ptxas : Separate<["-"], "Xcuda-ptxas">,
	HelpText<"Pass <arg> to the ptxas assembler">, MetaVarName<"<arg>">;
	def Xopenmp_target : Separate<["-"], "Xopenmp-target">,
	HelpText<"Pass <arg> to the target offloading toolchain.">, MetaVarName<"<arg>">;
	def Xopenmp_target_EQ : JoinedAndSeparate<["-"], "Xopenmp-target=">,
	HelpText<"Pass <arg> to the target offloading toolchain identified by <triple>.">,
	MetaVarName<"<triple> <arg>">;
	def z : Separate<["-"], "z">, Flags<[LinkerInput, RenderAsInput]>,
	HelpText<"Pass -z <arg> to the linker">, MetaVarName<"<arg>">,
	Group<Link_Group>;
	def Xlinker : Separate<["-"], "Xlinker">, Flags<[LinkerInput, RenderAsInput]>,
	HelpText<"Pass <arg> to the linker">, MetaVarName<"<arg>">,
	Group<Link_Group>;
	def Xpreprocessor : Separate<["-"], "Xpreprocessor">, Group<Preprocessor_Group>,
	HelpText<"Pass <arg> to the preprocessor">, MetaVarName<"<arg>">;
	// Bare (-X, -Z) and joined (-X<...>, -Z<...>) spellings. The bare flags are
	// in Link_Group; the joined -X form is marked IgnoredGCCCompat.
	def X_Flag : Flag<["-"], "X">, Group<Link_Group>;
	def X_Joined : Joined<["-"], "X">, IgnoredGCCCompat;
	def Z_Flag : Flag<["-"], "Z">, Group<Link_Group>;
	// FIXME: All we do with this is reject it. Remove.
	def Z_Joined : Joined<["-"], "Z">;
	// Alphabetical run of options (all_load .. b) declared without help text or
	// group. A double underscore in the record name stands for a single '_' in
	// the spelled option.
	// NOTE(review): several of these look like linker-style options forwarded by
	// the driver -- confirm against the toolchain code before relying on that.
	def all__load : Flag<["-"], "all_load">;
	def allowable__client : Separate<["-"], "allowable_client">;
	def ansi : Flag<["-", "--"], "ansi">;
	def arch__errors__fatal : Flag<["-"], "arch_errors_fatal">;
	def arch : Separate<["-"], "arch">, Flags<[DriverOption]>;
	def arch__only : Separate<["-"], "arch_only">;
	def a : Joined<["-"], "a">;
	def autocomplete : Joined<["--"], "autocomplete=">;
	def bind__at__load : Flag<["-"], "bind_at_load">;
	def bundle__loader : Separate<["-"], "bundle_loader">;
	def bundle : Flag<["-"], "bundle">;
	def b : JoinedOrSeparate<["-"], "b">, Flags<[Unsupported]>;
	// OpenCL options (-cl-*). Every record is in opencl_Group; all but
	// cl_denorms_are_zero also carry Flags<[CC1Option]>.
	def cl_opt_disable : Flag<["-"], "cl-opt-disable">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. This option disables all optimizations. By default optimizations are enabled.">;
	def cl_strict_aliasing : Flag<["-"], "cl-strict-aliasing">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. This option is added for compatibility with OpenCL 1.0.">;
	def cl_single_precision_constant : Flag<["-"], "cl-single-precision-constant">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Treat double precision floating-point constant as single precision constant.">;
	def cl_finite_math_only : Flag<["-"], "cl-finite-math-only">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Allow floating-point optimizations that assume arguments and results are not NaNs or +-Inf.">;
	def cl_kernel_arg_info : Flag<["-"], "cl-kernel-arg-info">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Generate kernel argument metadata.">;
	def cl_unsafe_math_optimizations : Flag<["-"], "cl-unsafe-math-optimizations">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Allow unsafe floating-point optimizations. Also implies -cl-no-signed-zeros and -cl-mad-enable.">;
	def cl_fast_relaxed_math : Flag<["-"], "cl-fast-relaxed-math">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Sets -cl-finite-math-only and -cl-unsafe-math-optimizations, and defines __FAST_RELAXED_MATH__.">;
	def cl_mad_enable : Flag<["-"], "cl-mad-enable">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Allow use of less precise MAD computations in the generated binary.">;
	def cl_no_signed_zeros : Flag<["-"], "cl-no-signed-zeros">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Allow use of less precise no signed zeros computations in the generated binary.">;
	// Values<> lists the accepted standard names in both lower and upper case.
	def cl_std_EQ : Joined<["-"], "cl-std=">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL language standard to compile for.">, Values<"cl,CL,cl1.1,CL1.1,cl1.2,CL1.2,cl2.0,CL2.0,clc++,CLC++">;
	// NOTE(review): the only -cl-* record here without Flags<[CC1Option]>;
	// presumably handled by the driver -- confirm this is intentional.
	def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group<opencl_Group>,
	HelpText<"OpenCL only. Allow denormals to be flushed to zero.">;
	def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-rounded-divide-sqrt">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.">;
	def cl_uniform_work_group_size : Flag<["-"], "cl-uniform-work-group-size">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Defines that the global work-size be a multiple of the work-group size specified to clEnqueueNDRangeKernel">;
	// Alphabetical run (client_name .. fconvergent-functions): version/name
	// options, configuration-file options, and the -c action.
	def client__name : JoinedOrSeparate<["-"], "client_name">;
	def combine : Flag<["-", "--"], "combine">, Flags<[DriverOption, Unsupported]>;
	def compatibility__version : JoinedOrSeparate<["-"], "compatibility_version">;
	// Configuration-file options; the two directory options are HelpHidden.
	def config : Separate<["--"], "config">, Flags<[DriverOption]>,
	HelpText<"Specifies configuration file">;
	def config_system_dir_EQ : Joined<["--"], "config-system-dir=">, Flags<[DriverOption, HelpHidden]>,
	HelpText<"System directory for configuration files">;
	def config_user_dir_EQ : Joined<["--"], "config-user-dir=">, Flags<[DriverOption, HelpHidden]>,
	HelpText<"User directory for configuration files">;
	def coverage : Flag<["-", "--"], "coverage">, Flags<[CoreOption]>;
	// Placed in clang_ignored_f_Group.
	def cpp_precomp : Flag<["-"], "cpp-precomp">, Group<clang_ignored_f_Group>;
	def current__version : JoinedOrSeparate<["-"], "current_version">;
	def cxx_isystem : JoinedOrSeparate<["-"], "cxx-isystem">, Group<clang_i_Group>,
	HelpText<"Add directory to the C++ SYSTEM include search path">, Flags<[CC1Option]>,
	MetaVarName<"<directory>">;
	def c : Flag<["-"], "c">, Flags<[DriverOption]>, Group<Action_Group>,
	HelpText<"Only run preprocess, compile, and assemble steps">;
	def fconvergent_functions : Flag<["-"], "fconvergent-functions">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Assume functions may be convergent">;

	// CUDA / GPU offloading options. Most use the "--" prefix; the -fcuda-* and
	// -fgpu-* ones use "-".
	def gpu_use_aux_triple_only : Flag<["--"], "gpu-use-aux-triple-only">,
	InternalDriverOpt, HelpText<"Prepare '-aux-triple' only without populating "
	"'-aux-target-cpu' and '-aux-target-feature'.">;
	def cuda_device_only : Flag<["--"], "cuda-device-only">,
	HelpText<"Compile CUDA code for device only">;
	def cuda_host_only : Flag<["--"], "cuda-host-only">,
	HelpText<"Compile CUDA code for host only. Has no effect on non-CUDA "
	"compilations.">;
	def cuda_compile_host_device : Flag<["--"], "cuda-compile-host-device">,
	HelpText<"Compile CUDA code for both host and device (default). Has no "
	"effect on non-CUDA compilations.">;
	def cuda_include_ptx_EQ : Joined<["--"], "cuda-include-ptx=">, Flags<[DriverOption]>,
	HelpText<"Include PTX for the following GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.">;
	def no_cuda_include_ptx_EQ : Joined<["--"], "no-cuda-include-ptx=">, Flags<[DriverOption]>,
	HelpText<"Do not include PTX for the following GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.">;
	// --offload-arch= is the primary spelling; --cuda-gpu-arch= aliases it.
	def offload_arch_EQ : Joined<["--"], "offload-arch=">, Flags<[DriverOption]>,
	HelpText<"CUDA/HIP offloading device architecture (e.g. sm_35, gfx906). May be specified more than once.">;
	def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">, Flags<[DriverOption]>,
	Alias<offload_arch_EQ>;
	def hip_link : Flag<["--"], "hip-link">,
	HelpText<"Link clang-offload-bundler bundles for HIP">;
	// --no-offload-arch= is the primary spelling; --no-cuda-gpu-arch= (below)
	// aliases it.
	def no_offload_arch_EQ : Joined<["--"], "no-offload-arch=">, Flags<[DriverOption]>,
	HelpText<"Remove CUDA/HIP offloading device architecture (e.g. sm_35, gfx906) from the list of devices to compile for. "
	"'all' resets the list to its default value.">;
	def emit_static_lib : Flag<["--"], "emit-static-lib">,
	HelpText<"Enable linker job to emit a static library.">;
	def no_cuda_gpu_arch_EQ : Joined<["--"], "no-cuda-gpu-arch=">, Flags<[DriverOption]>,
	Alias<no_offload_arch_EQ>;
	def cuda_noopt_device_debug : Flag<["--"], "cuda-noopt-device-debug">,
	HelpText<"Enable device-side debug info generation. Disables ptxas optimizations.">;
	def no_cuda_version_check : Flag<["--"], "no-cuda-version-check">,
	HelpText<"Don't error out if the detected version of the CUDA install is "
	"too low for the requested CUDA gpu architecture.">;
	def no_cuda_noopt_device_debug : Flag<["--"], "no-cuda-noopt-device-debug">;
	// Installation / tool path options, all in i_Group.
	def cuda_path_EQ : Joined<["--"], "cuda-path=">, Group<i_Group>,
	HelpText<"CUDA installation path">;
	def cuda_path_ignore_env : Flag<["--"], "cuda-path-ignore-env">, Group<i_Group>,
	HelpText<"Ignore environment variables to detect CUDA installation">;
	def ptxas_path_EQ : Joined<["--"], "ptxas-path=">, Group<i_Group>,
	HelpText<"Path to ptxas (used for compiling CUDA code)">;
	def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">,
	HelpText<"Flush denormal floating point values to zero in CUDA device mode.">;
	def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">;
	defm cuda_approx_transcendentals : OptInFFlag<"cuda-approx-transcendentals", "Use", "Don't use",
	" approximate transcendental functions">;
	// -f[no-]gpu-rdc comes from the defm; the -f[no-]cuda-rdc spellings are
	// anonymous aliases for the generated fgpu_rdc / fno_gpu_rdc records.
	defm gpu_rdc : OptInFFlag<"gpu-rdc",
	"Generate relocatable device code, also known as separate compilation mode", "", "">;
	def : Flag<["-"], "fcuda-rdc">, Alias<fgpu_rdc>;
	def : Flag<["-"], "fno-cuda-rdc">, Alias<fno_gpu_rdc>;
	defm cuda_short_ptr : OptInFFlag<"cuda-short-ptr",
	"Use 32-bit pointers for accessing const/local/shared address spaces">;
	// ROCm / HIP options, followed by the libomptarget NVPTX library path.
	def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group<i_Group>,
	HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">;
	// --hip-device-lib-path= (anonymous record below) aliases this option.
	def rocm_device_lib_path_EQ : Joined<["--"], "rocm-device-lib-path=">, Group<Link_Group>,
	HelpText<"ROCm device library path. Alternative to rocm-path.">;
	def : Joined<["--"], "hip-device-lib-path=">, Alias<rocm_device_lib_path_EQ>;
	def hip_device_lib_EQ : Joined<["--"], "hip-device-lib=">, Group<Link_Group>,
	HelpText<"HIP device library">;
	def hip_version_EQ : Joined<["--"], "hip-version=">,
	HelpText<"HIP version in the format of major.minor.patch">;
	// Hidden from help and flagged NoArgumentUnused.
	def fhip_dump_offload_linker_script : Flag<["-"], "fhip-dump-offload-linker-script">,
	Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>;
	defm hip_new_launch_api : OptInFFlag<"hip-new-launch-api",
	"Use", "Don't use", " new kernel launching API for HIP">;
	defm gpu_allow_device_init : OptInFFlag<"gpu-allow-device-init",
	"Allow", "Don't allow", " device side init function in HIP">;
	def gpu_max_threads_per_block_EQ : Joined<["--"], "gpu-max-threads-per-block=">,
	Flags<[CC1Option]>,
	HelpText<"Default max threads per block for kernel launch bounds for HIP">;
	def libomptarget_nvptx_path_EQ : Joined<["--"], "libomptarget-nvptx-path=">, Group<i_Group>,
	HelpText<"Path to libomptarget-nvptx libraries">;
	// -d* preprocessor dump options (d_Group), dependency-output options, the
	// -dump* queries, and an alphabetical run of dylib/dynamic options.
	def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
	HelpText<"Print macro definitions in -E mode in addition to normal output">;
	def dI : Flag<["-"], "dI">, Group<d_Group>, Flags<[CC1Option]>,
	HelpText<"Print include directives in -E mode in addition to normal output">;
	def dM : Flag<["-"], "dM">, Group<d_Group>, Flags<[CC1Option]>,
	HelpText<"Print macro definitions in -E mode instead of normal output">;
	def dead__strip : Flag<["-"], "dead_strip">;
	def dependency_file : Separate<["-"], "dependency-file">, Flags<[CC1Option]>,
	HelpText<"Filename (or -) to write dependency output to">;
	def dependency_dot : Separate<["-"], "dependency-dot">, Flags<[CC1Option]>,
	HelpText<"Filename to write DOT-formatted header dependencies to">;
	def module_dependency_dir : Separate<["-"], "module-dependency-dir">,
	Flags<[CC1Option]>, HelpText<"Directory to dump module dependencies to">;
	def dumpmachine : Flag<["-"], "dumpmachine">;
	def dumpspecs : Flag<["-"], "dumpspecs">, Flags<[Unsupported]>;
	def dumpversion : Flag<["-"], "dumpversion">;
	def dylib__file : Separate<["-"], "dylib_file">;
	def dylinker__install__name : JoinedOrSeparate<["-"], "dylinker_install_name">;
	def dylinker : Flag<["-"], "dylinker">;
	def dynamiclib : Flag<["-"], "dynamiclib">;
	def dynamic : Flag<["-"], "dynamic">, Flags<[NoArgumentUnused]>;
	// Bare -d and joined -d<...> forms, both in d_Group.
	def d_Flag : Flag<["-"], "d">, Group<d_Group>;
	def d_Joined : Joined<["-"], "d">, Group<d_Group>;
	// -emit-ast / -emit-llvm: select the kind of output produced; only
	// -emit-llvm is in Action_Group and flagged CC1Option.
	def emit_ast : Flag<["-"], "emit-ast">,
	HelpText<"Emit Clang AST files for source inputs">;
	def emit_llvm : Flag<["-"], "emit-llvm">, Flags<[CC1Option]>, Group<Action_Group>,
	HelpText<"Use the LLVM representation for assembler and object files">;
	// -emit-interface-stubs: Action_Group option flagged CC1Option that requests
	// interface stub file output (see HelpText).
	def emit_interface_stubs : Flag<["-"], "emit-interface-stubs">, Flags<[CC1Option]>, Group<Action_Group>,
	HelpText<"Generate Interface Stub Files.">;
	// Remaining interface-stub options, then exported_symbols_list and -e.
	def emit_merged_ifs : Flag<["-"], "emit-merged-ifs">,
	Flags<[CC1Option]>, Group<Action_Group>,
	HelpText<"Generate Interface Stub Files, emit merged text not binary.">;
	def interface_stub_version_EQ : JoinedOrSeparate<["-"], "interface-stub-version=">, Flags<[CC1Option]>;
	def exported__symbols__list : Separate<["-"], "exported_symbols_list">;
	def e : JoinedOrSeparate<["-"], "e">, Group<Link_Group>;
	// -fmax-tokens= and the upper-case PIC/PIE flags with their -fno- forms
	// (all in f_Group).
	def fmax_tokens_EQ : Joined<["-"], "fmax-tokens=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Max total number of preprocessed tokens for -Wmax-tokens.">;
	def fPIC : Flag<["-"], "fPIC">, Group<f_Group>;
	def fno_PIC : Flag<["-"], "fno-PIC">, Group<f_Group>;
	def fPIE : Flag<["-"], "fPIE">, Group<f_Group>;
	def fno_PIE : Flag<["-"], "fno-PIE">, Group<f_Group>;
	// Pass the base name "access-control", matching the other OptOutFFlag users
	// in this file (e.g. "autolink", "gnu-inline-asm", "jump-tables"). Assuming
	// the multiclass prepends "f" / "fno-" to the name as it does for those, a
	// leading "no-" here would spell the negative form -fno-no-access-control.
	defm access_control : OptOutFFlag<"access-control", "", "Disable C++ access control">;
	// -falign-functions forms, editor placeholders, Apple-specific flags, and
	// the sanitizer-runtime linkage switches.
	def falign_functions : Flag<["-"], "falign-functions">, Group<f_Group>;
	def falign_functions_EQ : Joined<["-"], "falign-functions=">, Group<f_Group>;
	def fno_align_functions: Flag<["-"], "fno-align-functions">, Group<f_Group>;
	defm allow_editor_placeholders : OptInFFlag<"allow-editor-placeholders", "Treat editor placeholders as valid source code">;
	def fallow_unsupported : Flag<["-"], "fallow-unsupported">, Group<f_Group>;
	def fapple_kext : Flag<["-"], "fapple-kext">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Use Apple's kernel extensions ABI">;
	def fapple_pragma_pack : Flag<["-"], "fapple-pragma-pack">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable Apple gcc-compatible #pragma pack handling">;
	def shared_libsan : Flag<["-"], "shared-libsan">,
	HelpText<"Dynamically link the sanitizer runtime">;
	def static_libsan : Flag<["-"], "static-libsan">,
	HelpText<"Statically link the sanitizer runtime">;
	// -shared-libasan is an alternate spelling that aliases -shared-libsan.
	def : Flag<["-"], "shared-libasan">, Alias<shared_libsan>;
	def fasm : Flag<["-"], "fasm">, Group<f_Group>;

	// -f[no-]asm-blocks, declared with an empty help string.
	defm asm_blocks : OptInFFlag<"asm-blocks", "">;

	def fassume_sane_operator_new : Flag<["-"], "fassume-sane-operator-new">, Group<f_Group>;
	def fastcp : Flag<["-"], "fastcp">, Group<f_Group>;
	def fastf : Flag<["-"], "fastf">, Group<f_Group>;
	def fast : Flag<["-"], "fast">, Group<f_Group>;
	def fasynchronous_unwind_tables : Flag<["-"], "fasynchronous-unwind-tables">, Group<f_Group>;

	// Explicit positive/negative pair, each with its own help text.
	def fdouble_square_bracket_attributes : Flag<[ "-" ], "fdouble-square-bracket-attributes">,
	Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	HelpText<"Enable '[[]]' attributes in all C and C++ language modes">;
	def fno_double_square_bracket_attributes : Flag<[ "-" ], "fno-double-square-bracket-attributes">,
	Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	HelpText<"Disable '[[]]' attributes in all C and C++ language modes">;

	defm autolink : OptOutFFlag<"autolink", "", "Disable generation of linker directives for automatic library linking">;

	// C++ Coroutines TS
	defm coroutines_ts : OptInFFlag<"coroutines-ts", "Enable support for the C++ Coroutines TS">;

	// -fembed-bitcode=<option> is the primary record; the two bare flags below
	// alias it with a fixed argument ("all" and "marker" respectively).
	def fembed_bitcode_EQ : Joined<["-"], "fembed-bitcode=">,
	Group<f_Group>, Flags<[DriverOption, CC1Option, CC1AsOption]>, MetaVarName<"<option>">,
	HelpText<"Embed LLVM bitcode (option: off, all, bitcode, marker)">;
	def fembed_bitcode : Flag<["-"], "fembed-bitcode">, Group<f_Group>,
	Alias<fembed_bitcode_EQ>, AliasArgs<["all"]>,
	HelpText<"Embed LLVM IR bitcode as data">;
	def fembed_bitcode_marker : Flag<["-"], "fembed-bitcode-marker">,
	Alias<fembed_bitcode_EQ>, AliasArgs<["marker"]>,
	HelpText<"Embed placeholder LLVM IR data as a marker">;
	defm gnu_inline_asm : OptOutFFlag<"gnu-inline-asm", "", "Disable GNU style inline asm">;

	// Sample-based profile options. The -fprofile-sample-* records are primary;
	// each -fauto-profile* spelling below is an Alias<> of the matching one.
	def fprofile_sample_use : Flag<["-"], "fprofile-sample-use">, Group<f_Group>,
	Flags<[CoreOption]>;
	def fno_profile_sample_use : Flag<["-"], "fno-profile-sample-use">, Group<f_Group>,
	Flags<[CoreOption]>;
	def fprofile_sample_use_EQ : Joined<["-"], "fprofile-sample-use=">,
	Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	HelpText<"Enable sample-based profile guided optimizations">;
	// DocBrief supplies a longer description in addition to the short HelpText.
	def fprofile_sample_accurate : Flag<["-"], "fprofile-sample-accurate">,
	Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	HelpText<"Specifies that the sample profile is accurate">,
	DocBrief<[{Specifies that the sample profile is accurate. If the sample
	profile is accurate, callsites without profile samples are marked
	as cold. Otherwise, treat callsites without profile samples as if
	we have no profile}]>;
	def fno_profile_sample_accurate : Flag<["-"], "fno-profile-sample-accurate">,
	Group<f_Group>, Flags<[DriverOption]>;
	def fauto_profile : Flag<["-"], "fauto-profile">, Group<f_Group>,
	Alias<fprofile_sample_use>;
	def fno_auto_profile : Flag<["-"], "fno-auto-profile">, Group<f_Group>,
	Alias<fno_profile_sample_use>;
	def fauto_profile_EQ : Joined<["-"], "fauto-profile=">,
	Alias<fprofile_sample_use_EQ>;
	def fauto_profile_accurate : Flag<["-"], "fauto-profile-accurate">,
	Group<f_Group>, Alias<fprofile_sample_accurate>;
	def fno_auto_profile_accurate : Flag<["-"], "fno-auto-profile-accurate">,
	Group<f_Group>, Alias<fno_profile_sample_accurate>;
	// -fdebug-compilation-dir <dir> is the primary (Separate) form; the joined
	// "=" spelling aliases it. Followed by -f[no-]debug-info-for-profiling.
	def fdebug_compilation_dir : Separate<["-"], "fdebug-compilation-dir">,
	Group<f_Group>, Flags<[CC1Option, CC1AsOption, CoreOption]>,
	HelpText<"The compilation directory to embed in the debug info.">;
	def fdebug_compilation_dir_EQ : Joined<["-"], "fdebug-compilation-dir=">,
	Group<f_Group>, Flags<[CC1Option, CC1AsOption, CoreOption]>,
	Alias<fdebug_compilation_dir>;
	defm debug_info_for_profiling : OptInFFlag<"debug-info-for-profiling",
	"Emit extra debug info to make sample profile more accurate">;
	// Instrumentation-based profiling and coverage options (generate / use /
	// filter), all in f_Group. Most are flagged CoreOption.
	def fprofile_instr_generate : Flag<["-"], "fprofile-instr-generate">,
	Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Generate instrumented code to collect execution counts into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)">;
	def fprofile_instr_generate_EQ : Joined<["-"], "fprofile-instr-generate=">,
	Group<f_Group>, Flags<[CoreOption]>, MetaVarName<"<file>">,
	HelpText<"Generate instrumented code to collect execution counts into <file> (overridden by LLVM_PROFILE_FILE env var)">;
	def fprofile_instr_use : Flag<["-"], "fprofile-instr-use">, Group<f_Group>,
	Flags<[CoreOption]>;
	def fprofile_instr_use_EQ : Joined<["-"], "fprofile-instr-use=">,
	Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Use instrumentation data for profile-guided optimization">;
	// The joined "=" form is primary here; the Separate form aliases it.
	def fprofile_remapping_file_EQ : Joined<["-"], "fprofile-remapping-file=">,
	Group<f_Group>, Flags<[CC1Option, CoreOption]>, MetaVarName<"<file>">,
	HelpText<"Use the remappings described in <file> to match the profile data against names in the program">;
	def fprofile_remapping_file : Separate<["-"], "fprofile-remapping-file">,
	Group<f_Group>, Flags<[CoreOption]>, Alias<fprofile_remapping_file_EQ>;
	defm coverage_mapping : OptInFFlag<"coverage-mapping",
	"Generate coverage mapping to enable code coverage analysis", "Disable code coverage analysis", "",
	[CoreOption]>;
	def fprofile_generate : Flag<["-"], "fprofile-generate">,
	Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Generate instrumented code to collect execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
	def fprofile_generate_EQ : Joined<["-"], "fprofile-generate=">,
	Group<f_Group>, Flags<[CoreOption]>, MetaVarName<"<directory>">,
	HelpText<"Generate instrumented code to collect execution counts into <directory>/default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
	// Context-sensitive variants of -fprofile-generate.
	def fcs_profile_generate : Flag<["-"], "fcs-profile-generate">,
	Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Generate instrumented code to collect context sensitive execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
	def fcs_profile_generate_EQ : Joined<["-"], "fcs-profile-generate=">,
	Group<f_Group>, Flags<[CoreOption]>, MetaVarName<"<directory>">,
	HelpText<"Generate instrumented code to collect context sensitive execution counts into <directory>/default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
	// Bare -fprofile-use aliases -fprofile-instr-use; the "=" form is its own
	// record (not an alias) and accepts either a directory or a file.
	def fprofile_use : Flag<["-"], "fprofile-use">, Group<f_Group>,
	Alias<fprofile_instr_use>;
	def fprofile_use_EQ : Joined<["-"], "fprofile-use=">,
	Group<f_Group>, Flags<[DriverOption]>, MetaVarName<"<pathname>">,
	HelpText<"Use instrumentation data for profile-guided optimization. If pathname is a directory, it reads from <pathname>/default.profdata. Otherwise, it reads from file <pathname>.">;
	def fno_profile_instr_generate : Flag<["-"], "fno-profile-instr-generate">,
	Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Disable generation of profile instrumentation.">;
	def fno_profile_generate : Flag<["-"], "fno-profile-generate">,
	Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Disable generation of profile instrumentation.">;
	def fno_profile_instr_use : Flag<["-"], "fno-profile-instr-use">,
	Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Disable using instrumentation data for profile-guided optimization">;
	def fno_profile_use : Flag<["-"], "fno-profile-use">,
	Alias<fno_profile_instr_use>;
	// File filters for instrumentation; the argument is a semi-colon separated
	// list of regexes (per the help texts).
	def fprofile_filter_files_EQ : Joined<["-"], "fprofile-filter-files=">,
	Group<f_Group>, Flags<[CC1Option, CoreOption]>,
	HelpText<"Instrument only functions from files where names match any regex separated by a semi-colon">;
	def fprofile_exclude_files_EQ : Joined<["-"], "fprofile-exclude-files=">,
	Group<f_Group>, Flags<[CC1Option, CoreOption]>,
	HelpText<"Instrument only functions from files where names don't match all the regexes separated by a semi-colon">;
	def forder_file_instrumentation : Flag<["-"], "forder-file-instrumentation">,
	Group<f_Group>, Flags<[CC1Option, CoreOption]>,
	HelpText<"Generate instrumented code to collect order file into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)">;

	// Alphabetical run of -f options (addrsig .. fcreate-profile): language
	// extensions, diagnostics colouring, comment parsing, ABI compatibility and
	// constexpr limits.
	defm addrsig : OptInFFlag<"addrsig", "Emit", "Don't emit", " an address-significance table", [CoreOption]>;
	defm blocks : OptInFFlag<"blocks", "Enable the 'blocks' language feature", "", "", [CoreOption]>;
	def fbootclasspath_EQ : Joined<["-"], "fbootclasspath=">, Group<f_Group>;
	def fborland_extensions : Flag<["-"], "fborland-extensions">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Accept non-standard constructs supported by the Borland compiler">;
	def fbuiltin : Flag<["-"], "fbuiltin">, Group<f_Group>, Flags<[CoreOption]>;
	def fbuiltin_module_map : Flag <["-"], "fbuiltin-module-map">, Group<f_Group>,
	Flags<[DriverOption]>, HelpText<"Load the clang builtins module map file.">;
	defm caret_diagnostics : OptOutFFlag<"caret-diagnostics", "", "">;
	// Values<> documents the accepted forms: "<major>.<minor>" or "latest".
	def fclang_abi_compat_EQ : Joined<["-"], "fclang-abi-compat=">, Group<f_clang_Group>,
	Flags<[CC1Option]>, MetaVarName<"<version>">, Values<"<major>.<minor>,latest">,
	HelpText<"Attempt to match the ABI of Clang <version>">;
	def fclasspath_EQ : Joined<["-"], "fclasspath=">, Group<f_Group>;
	// Two spellings exist for diagnostic colouring: -f[no-]color-diagnostics
	// (defm) and -fdiagnostics-color[=...] (separate records).
	defm color_diagnostics : OptInFFlag<"color-diagnostics", "Enable", "Disable", " colors in diagnostics", [CoreOption]>;
	def fdiagnostics_color : Flag<["-"], "fdiagnostics-color">, Group<f_Group>,
	Flags<[CoreOption, DriverOption]>;
	def fdiagnostics_color_EQ : Joined<["-"], "fdiagnostics-color=">, Group<f_Group>;
	// The only record in this run with MarshallingInfoFlag, which names
	// DiagnosticOpts->UseANSIEscapeCodes with default "false".
	def fansi_escape_codes : Flag<["-"], "fansi-escape-codes">, Group<f_Group>,
	Flags<[CoreOption, CC1Option]>, HelpText<"Use ANSI escape codes for diagnostics">,
	MarshallingInfoFlag<"DiagnosticOpts->UseANSIEscapeCodes", "false">;
	def fcomment_block_commands : CommaJoined<["-"], "fcomment-block-commands=">, Group<f_clang_Group>, Flags<[CC1Option]>,
	HelpText<"Treat each comma separated argument in <arg> as a documentation comment block command">,
	MetaVarName<"<arg>">;
	def fparse_all_comments : Flag<["-"], "fparse-all-comments">, Group<f_clang_Group>, Flags<[CC1Option]>;
	// -f[no-]record-gcc-switches are anonymous aliases for the
	// -f[no-]record-command-line records.
	def frecord_command_line : Flag<["-"], "frecord-command-line">,
	Group<f_clang_Group>;
	def fno_record_command_line : Flag<["-"], "fno-record-command-line">,
	Group<f_clang_Group>;
	def : Flag<["-"], "frecord-gcc-switches">, Alias<frecord_command_line>;
	def : Flag<["-"], "fno-record-gcc-switches">, Alias<fno_record_command_line>;
	def fcommon : Flag<["-"], "fcommon">, Group<f_Group>,
	Flags<[CoreOption, CC1Option]>, HelpText<"Place uninitialized global variables in a common block">;
	def fcompile_resource_EQ : Joined<["-"], "fcompile-resource=">, Group<f_Group>;
	def fcomplete_member_pointers : Flag<["-"], "fcomplete-member-pointers">, Group<f_clang_Group>,
	Flags<[CoreOption, CC1Option]>,
	HelpText<"Require member pointer base types to be complete if they would be significant under the Microsoft ABI">;
	def fno_complete_member_pointers : Flag<["-"], "fno-complete-member-pointers">, Group<f_clang_Group>,
	Flags<[CoreOption]>,
	HelpText<"Do not require member pointer base types to be complete if they would be significant under the Microsoft ABI">;
	def fcf_runtime_abi_EQ : Joined<["-"], "fcf-runtime-abi=">, Group<f_Group>,
	Flags<[CC1Option]>;
	def fconstant_cfstrings : Flag<["-"], "fconstant-cfstrings">, Group<f_Group>;
	def fconstant_string_class_EQ : Joined<["-"], "fconstant-string-class=">, Group<f_Group>;
	def fconstexpr_depth_EQ : Joined<["-"], "fconstexpr-depth=">, Group<f_Group>;
	def fconstexpr_steps_EQ : Joined<["-"], "fconstexpr-steps=">, Group<f_Group>;
	def fexperimental_new_constant_interpreter : Flag<["-"], "fexperimental-new-constant-interpreter">, Group<f_Group>,
	HelpText<"Enable the experimental new constant interpreter">, Flags<[CC1Option]>;
	def fconstexpr_backtrace_limit_EQ : Joined<["-"], "fconstexpr-backtrace-limit=">,
	Group<f_Group>;
	// Crash-report controls; both are flagged NoArgumentUnused.
	def fno_crash_diagnostics : Flag<["-"], "fno-crash-diagnostics">, Group<f_clang_Group>, Flags<[NoArgumentUnused, CoreOption]>,
	HelpText<"Disable auto-generation of preprocessed source files and a script for reproduction during a clang crash">;
	def fcrash_diagnostics_dir : Joined<["-"], "fcrash-diagnostics-dir=">, Group<f_clang_Group>, Flags<[NoArgumentUnused, CoreOption]>;
	def fcreate_profile : Flag<["-"], "fcreate-profile">, Group<f_Group>;
	// Alphabetical run (cxx-exceptions .. fdollars-in-identifiers), dominated by
	// the -fdiagnostics-* family that controls how diagnostics are printed.
	defm cxx_exceptions: OptInFFlag<"cxx-exceptions", "Enable C++ exceptions">;
	def fcxx_modules : Flag <["-"], "fcxx-modules">, Group<f_Group>,
	Flags<[DriverOption]>;
	def fdebug_pass_arguments : Flag<["-"], "fdebug-pass-arguments">, Group<f_Group>;
	def fdebug_pass_structure : Flag<["-"], "fdebug-pass-structure">, Group<f_Group>;
	def fdepfile_entry : Joined<["-"], "fdepfile-entry=">,
	Group<f_clang_Group>, Flags<[CC1Option]>;
	def fdiagnostics_fixit_info : Flag<["-"], "fdiagnostics-fixit-info">, Group<f_clang_Group>;
	def fdiagnostics_parseable_fixits : Flag<["-"], "fdiagnostics-parseable-fixits">, Group<f_clang_Group>,
	Flags<[CoreOption, CC1Option]>, HelpText<"Print fix-its in machine parseable form">;
	def fdiagnostics_print_source_range_info : Flag<["-"], "fdiagnostics-print-source-range-info">,
	Group<f_clang_Group>, Flags<[CC1Option]>,
	HelpText<"Print source range spans in numeric form">;
	def fdiagnostics_show_hotness : Flag<["-"], "fdiagnostics-show-hotness">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Enable profile hotness information in diagnostic line">;
	def fdiagnostics_hotness_threshold_EQ : Joined<["-"], "fdiagnostics-hotness-threshold=">,
	Group<f_Group>, Flags<[CC1Option]>, MetaVarName<"<number>">,
	HelpText<"Prevent optimization remarks from being output if they do not have at least this profile count">;
	def fdiagnostics_show_option : Flag<["-"], "fdiagnostics-show-option">, Group<f_Group>,
	HelpText<"Print option name with mappable diagnostics">;
	def fdiagnostics_show_note_include_stack : Flag<["-"], "fdiagnostics-show-note-include-stack">,
	Group<f_Group>, Flags<[CC1Option]>, HelpText<"Display include stacks for diagnostic notes">;
	def fdiagnostics_format_EQ : Joined<["-"], "fdiagnostics-format=">, Group<f_clang_Group>;
	def fdiagnostics_show_category_EQ : Joined<["-"], "fdiagnostics-show-category=">, Group<f_clang_Group>;
	def fdiagnostics_show_template_tree : Flag<["-"], "fdiagnostics-show-template-tree">,
	Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Print a template comparison tree for differing templates">;
	def fdeclspec : Flag<["-"], "fdeclspec">, Group<f_clang_Group>,
	HelpText<"Allow __declspec as a keyword">, Flags<[CC1Option]>;
	// Explicit positive/negative pair; both are flagged DriverOption only.
	def fdiscard_value_names : Flag<["-"], "fdiscard-value-names">, Group<f_clang_Group>,
	HelpText<"Discard value names in LLVM IR">, Flags<[DriverOption]>;
	def fno_discard_value_names : Flag<["-"], "fno-discard-value-names">, Group<f_clang_Group>,
	HelpText<"Do not discard value names in LLVM IR">, Flags<[DriverOption]>;
	def fdollars_in_identifiers : Flag<["-"], "fdollars-in-identifiers">, Group<f_Group>,
	HelpText<"Allow '$' in identifiers">, Flags<[CC1Option]>;
	// Alphabetical run (fdwarf2-cfi-asm .. fsymbol-partition=): exception-model
	// selection, floating-point behaviour, and a number of anonymous records
	// placed in the clang_ignored_* groups.
	def fdwarf2_cfi_asm : Flag<["-"], "fdwarf2-cfi-asm">, Group<clang_ignored_f_Group>;
	def fno_dwarf2_cfi_asm : Flag<["-"], "fno-dwarf2-cfi-asm">, Group<clang_ignored_f_Group>;
	defm dwarf_directory_asm : OptOutFFlag<"dwarf-directory-asm", "", "">;
	def felide_constructors : Flag<["-"], "felide-constructors">, Group<f_Group>;
	def fno_elide_type : Flag<["-"], "fno-elide-type">, Group<f_Group>,
	Flags<[CC1Option]>,
	HelpText<"Do not elide types when printing diagnostics">;
	def feliminate_unused_debug_symbols : Flag<["-"], "feliminate-unused-debug-symbols">, Group<f_Group>;
	def femit_all_decls : Flag<["-"], "femit-all-decls">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Emit all declarations, even if unused">;
	def femulated_tls : Flag<["-"], "femulated-tls">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Use emutls functions to access thread_local variables">;
	def fno_emulated_tls : Flag<["-"], "fno-emulated-tls">, Group<f_Group>, Flags<[CC1Option]>;
	def fencoding_EQ : Joined<["-"], "fencoding=">, Group<f_Group>;
	def ferror_limit_EQ : Joined<["-"], "ferror-limit=">, Group<f_Group>, Flags<[CoreOption]>;
	defm exceptions : OptInFFlag<"exceptions", "Enable", "Disable", " support for exception handling">;
	// One flag per exception-handling style (DWARF, SjLj, SEH, WebAssembly).
	def fdwarf_exceptions : Flag<["-"], "fdwarf-exceptions">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Use DWARF style exceptions">;
	def fsjlj_exceptions : Flag<["-"], "fsjlj-exceptions">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Use SjLj style exceptions">;
	def fseh_exceptions : Flag<["-"], "fseh-exceptions">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Use SEH style exceptions">;
	def fwasm_exceptions : Flag<["-"], "fwasm-exceptions">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Use WebAssembly style exceptions">;
	def fignore_exceptions : Flag<["-"], "fignore-exceptions">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable support for ignoring exception handling constructs">;
	def fexcess_precision_EQ : Joined<["-"], "fexcess-precision=">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	def : Flag<["-"], "fexpensive-optimizations">, Group<clang_ignored_gcc_optimization_f_Group>;
	def : Flag<["-"], "fno-expensive-optimizations">, Group<clang_ignored_gcc_optimization_f_Group>;
	def fextdirs_EQ : Joined<["-"], "fextdirs=">, Group<f_Group>;
	def : Flag<["-"], "fdefer-pop">, Group<clang_ignored_gcc_optimization_f_Group>;
	def : Flag<["-"], "fno-defer-pop">, Group<clang_ignored_gcc_optimization_f_Group>;
	// The positive form sits in clang_ignored_f_Group, while the negative form
	// is flagged Unsupported.
	def : Flag<["-"], "fextended-identifiers">, Group<clang_ignored_f_Group>;
	def : Flag<["-"], "fno-extended-identifiers">, Group<f_Group>, Flags<[Unsupported]>;
	def fhosted : Flag<["-"], "fhosted">, Group<f_Group>;
	// Floating-point behaviour options.
	def fdenormal_fp_math_EQ : Joined<["-"], "fdenormal-fp-math=">, Group<f_Group>, Flags<[CC1Option]>;
	def ffp_model_EQ : Joined<["-"], "ffp-model=">, Group<f_Group>, Flags<[DriverOption]>,
	HelpText<"Controls the semantics of floating-point calculations.">;
	def ffp_exception_behavior_EQ : Joined<["-"], "ffp-exception-behavior=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Specifies the exception behavior of floating-point operations.">;
	defm fast_math : OptInFFlag<"fast-math", "Allow aggressive, lossy floating-point optimizations">;
	defm math_errno : OptInFFlag<"math-errno", "Require math functions to indicate errors by setting errno">;
	def fbracket_depth_EQ : Joined<["-"], "fbracket-depth=">, Group<f_Group>, Flags<[CoreOption]>;
	def fsignaling_math : Flag<["-"], "fsignaling-math">, Group<f_Group>;
	def fno_signaling_math : Flag<["-"], "fno-signaling-math">, Group<f_Group>;
	defm jump_tables : OptOutFFlag<"jump-tables", "Use", "Do not use", " jump tables for lowering switches">;
	defm force_enable_int128 : OptInFFlag<"force-enable-int128", "Enable", "Disable", " support for int128_t type">;
	defm keep_static_consts : OptInFFlag<"keep-static-consts", "Keep", "Don't keep", " static const variables if unused", [DriverOption]>;
	defm fixed_point : OptInFFlag<"fixed-point", "Enable", "Disable", " fixed point types">;
	// The record is named cxx_static_destructors but the spelled option uses
	// "c++-static-destructors".
	defm cxx_static_destructors : OptOutFFlag<"c++-static-destructors", "",
	"Disable C++ static destructor registration">;
	def fsymbol_partition_EQ : Joined<["-"], "fsymbol-partition=">, Group<f_Group>,
	Flags<[CC1Option]>;

	// Begin sanitizer flags. These should all be core options exposed in all driver
	// modes.
	let Flags = [CC1Option, CoreOption] in {

	// Sanitizer selection, special-case-list files and coverage controls.
	// These records live inside the enclosing `let Flags = [...] in { }` scope.
	def fsanitize_EQ : CommaJoined<["-"], "fsanitize=">,
	  MetaVarName<"<check>">, Group<f_clang_Group>,
	  HelpText<"Turn on runtime checks for various forms of undefined "
	  "or suspicious behavior. See user manual for available checks">;
	def fno_sanitize_EQ : CommaJoined<["-"], "fno-sanitize=">,
	  Flags<[CoreOption, DriverOption]>, Group<f_clang_Group>;

	// Special-case-list ("blacklist") files.
	def fsanitize_blacklist : Joined<["-"], "fsanitize-blacklist=">,
	  HelpText<"Path to blacklist file for sanitizers">,
	  Group<f_clang_Group>;
	def fsanitize_system_blacklist : Joined<["-"], "fsanitize-system-blacklist=">,
	  Flags<[CC1Option]>,
	  HelpText<"Path to system blacklist file for sanitizers">;
	def fno_sanitize_blacklist : Flag<["-"], "fno-sanitize-blacklist">,
	  HelpText<"Don't use blacklist file for sanitizers">,
	  Group<f_clang_Group>;

	// Sanitizer coverage instrumentation.
	def fsanitize_coverage : CommaJoined<["-"], "fsanitize-coverage=">,
	  HelpText<"Specify the type of coverage instrumentation for Sanitizers">,
	  Group<f_clang_Group>;
	def fno_sanitize_coverage : CommaJoined<["-"], "fno-sanitize-coverage=">,
	  Flags<[CoreOption, DriverOption]>, Group<f_clang_Group>,
	  Values<"func,bb,edge,indirect-calls,trace-bb,trace-cmp,trace-div,trace-gep,8bit-counters,trace-pc,trace-pc-guard,no-prune,inline-8bit-counters,inline-bool-flag">,
	  HelpText<"Disable specified features of coverage instrumentation for "
	  "Sanitizers">;
	def fsanitize_coverage_allowlist : Joined<["-"], "fsanitize-coverage-allowlist=">,
	  Flags<[CoreOption, DriverOption]>, Group<f_clang_Group>,
	  HelpText<"Restrict sanitizer coverage instrumentation exclusively to modules and functions that match the provided special case list, except the blocked ones">;
	// Older spelling, forwarded to the allowlist option via Alias<>.
	def : Joined<["-"], "fsanitize-coverage-whitelist=">,
	  Alias<fsanitize_coverage_allowlist>,
	  Flags<[CoreOption, HelpHidden]>, Group<f_clang_Group>,
	  HelpText<"Deprecated, use -fsanitize-coverage-allowlist= instead">;
	def fsanitize_coverage_blocklist : Joined<["-"], "fsanitize-coverage-blocklist=">,
	  Flags<[CoreOption, DriverOption]>, Group<f_clang_Group>,
	  HelpText<"Disable sanitizer coverage instrumentation for modules and functions that match the provided special case list, even the allowed ones">;
	// Older spelling, forwarded to the blocklist option via Alias<>.
	def : Joined<["-"], "fsanitize-coverage-blacklist=">,
	  Alias<fsanitize_coverage_blocklist>,
	  Flags<[CoreOption, HelpHidden]>, Group<f_clang_Group>,
	  HelpText<"Deprecated, use -fsanitize-coverage-blocklist= instead">;
	// MemorySanitizer tuning options.
	def fsanitize_memory_track_origins_EQ
	  : Joined<["-"], "fsanitize-memory-track-origins=">,
	    HelpText<"Enable origins tracking in MemorySanitizer">,
	    Group<f_clang_Group>;
	def fsanitize_memory_track_origins
	  : Flag<["-"], "fsanitize-memory-track-origins">,
	    HelpText<"Enable origins tracking in MemorySanitizer">,
	    Group<f_clang_Group>;
	def fno_sanitize_memory_track_origins
	  : Flag<["-"], "fno-sanitize-memory-track-origins">,
	    Flags<[CoreOption, DriverOption]>,
	    Group<f_clang_Group>,
	    HelpText<"Disable origins tracking in MemorySanitizer">;
	def fsanitize_memory_use_after_dtor
	  : Flag<["-"], "fsanitize-memory-use-after-dtor">,
	    HelpText<"Enable use-after-destroy detection in MemorySanitizer">,
	    Group<f_clang_Group>;
	def fno_sanitize_memory_use_after_dtor
	  : Flag<["-"], "fno-sanitize-memory-use-after-dtor">,
	    HelpText<"Disable use-after-destroy detection in MemorySanitizer">,
	    Group<f_clang_Group>;

	// AddressSanitizer tuning options.
	def fsanitize_address_field_padding
	  : Joined<["-"], "fsanitize-address-field-padding=">,
	    HelpText<"Level of field padding for AddressSanitizer">,
	    Group<f_clang_Group>;
	def fsanitize_address_use_after_scope
	  : Flag<["-"], "fsanitize-address-use-after-scope">,
	    HelpText<"Enable use-after-scope detection in AddressSanitizer">,
	    Group<f_clang_Group>;
	def fno_sanitize_address_use_after_scope
	  : Flag<["-"], "fno-sanitize-address-use-after-scope">,
	    Flags<[CoreOption, DriverOption]>,
	    Group<f_clang_Group>,
	    HelpText<"Disable use-after-scope detection in AddressSanitizer">;
	def fsanitize_address_poison_custom_array_cookie
	  : Flag<["-"], "fsanitize-address-poison-custom-array-cookie">,
	    HelpText<"Enable poisoning array cookies when using custom operator new[] in AddressSanitizer">,
	    Group<f_clang_Group>;
	def fno_sanitize_address_poison_custom_array_cookie
	  : Flag<["-"], "fno-sanitize-address-poison-custom-array-cookie">,
	    HelpText<"Disable poisoning array cookies when using custom operator new[] in AddressSanitizer">,
	    Group<f_clang_Group>;
	def fsanitize_address_globals_dead_stripping
	  : Flag<["-"], "fsanitize-address-globals-dead-stripping">,
	    HelpText<"Enable linker dead stripping of globals in AddressSanitizer">,
	    Group<f_clang_Group>;
	def fsanitize_address_use_odr_indicator
	  : Flag<["-"], "fsanitize-address-use-odr-indicator">,
	    HelpText<"Enable ODR indicator globals to avoid false ODR violation reports in partially sanitized programs at the cost of an increase in binary size">,
	    Group<f_clang_Group>;
	def fno_sanitize_address_use_odr_indicator
	  : Flag<["-"], "fno-sanitize-address-use-odr-indicator">,
	    HelpText<"Disable ODR indicator globals">,
	    Group<f_clang_Group>;

	// HWAddressSanitizer ABI selection.
	// This flag dates from a time when the ABI had to be known for correct
	// codegen. That distinction is no longer required, yet the flag stays:
	// both ABIs now behave the same, so existing scripts and code keep working,
	// and the flag can be given a meaning again later without disruption.
	def fsanitize_hwaddress_abi_EQ
	  : Joined<["-"], "fsanitize-hwaddress-abi=">,
	    HelpText<"Select the HWAddressSanitizer ABI to target (interceptor or platform, default interceptor). This option is currently unused.">,
	    Group<f_clang_Group>;
	// Per-sanitizer recovery control. The argument-less spellings forward to
	// the `=` forms with the argument "all".
	def fsanitize_recover_EQ : CommaJoined<["-"], "fsanitize-recover=">,
	  HelpText<"Enable recovery for specified sanitizers">,
	  Group<f_clang_Group>;
	def fno_sanitize_recover_EQ : CommaJoined<["-"], "fno-sanitize-recover=">,
	  Flags<[CoreOption, DriverOption]>, Group<f_clang_Group>,
	  HelpText<"Disable recovery for specified sanitizers">;
	def fsanitize_recover : Flag<["-"], "fsanitize-recover">,
	  Alias<fsanitize_recover_EQ>, AliasArgs<["all"]>,
	  Group<f_clang_Group>;
	def fno_sanitize_recover : Flag<["-"], "fno-sanitize-recover">,
	  Alias<fno_sanitize_recover_EQ>, AliasArgs<["all"]>,
	  Group<f_clang_Group>, Flags<[CoreOption, DriverOption]>;

	// Per-sanitizer trapping control, with the same "all" aliases.
	def fsanitize_trap_EQ : CommaJoined<["-"], "fsanitize-trap=">,
	  HelpText<"Enable trapping for specified sanitizers">,
	  Group<f_clang_Group>;
	def fno_sanitize_trap_EQ : CommaJoined<["-"], "fno-sanitize-trap=">,
	  Flags<[CoreOption, DriverOption]>, Group<f_clang_Group>,
	  HelpText<"Disable trapping for specified sanitizers">;
	def fsanitize_trap : Flag<["-"], "fsanitize-trap">,
	  Alias<fsanitize_trap_EQ>, AliasArgs<["all"]>,
	  Group<f_clang_Group>,
	  HelpText<"Enable trapping for all sanitizers">;
	def fno_sanitize_trap : Flag<["-"], "fno-sanitize-trap">,
	  Alias<fno_sanitize_trap_EQ>, AliasArgs<["all"]>,
	  Group<f_clang_Group>, Flags<[CoreOption, DriverOption]>,
	  HelpText<"Disable trapping for all sanitizers">;

	// These spellings forward to the trap options with the argument "undefined".
	def fsanitize_undefined_trap_on_error
	  : Flag<["-"], "fsanitize-undefined-trap-on-error">,
	    Alias<fsanitize_trap_EQ>, AliasArgs<["undefined"]>,
	    Group<f_clang_Group>;
	def fno_sanitize_undefined_trap_on_error
	  : Flag<["-"], "fno-sanitize-undefined-trap-on-error">,
	    Alias<fno_sanitize_trap_EQ>, AliasArgs<["undefined"]>,
	    Group<f_clang_Group>;
	// Sanitizer runtime selection / linking toggles (no help text attached).
	def fsanitize_minimal_runtime
	  : Flag<["-"], "fsanitize-minimal-runtime">, Group<f_clang_Group>;
	def fno_sanitize_minimal_runtime
	  : Flag<["-"], "fno-sanitize-minimal-runtime">, Group<f_clang_Group>;
	def fsanitize_link_runtime
	  : Flag<["-"], "fsanitize-link-runtime">, Group<f_clang_Group>;
	def fno_sanitize_link_runtime
	  : Flag<["-"], "fno-sanitize-link-runtime">, Group<f_clang_Group>;
	def fsanitize_link_cxx_runtime
	  : Flag<["-"], "fsanitize-link-c++-runtime">, Group<f_clang_Group>;
	def fno_sanitize_link_cxx_runtime
	  : Flag<["-"], "fno-sanitize-link-c++-runtime">, Group<f_clang_Group>;

	// Control flow integrity (CFI) options.
	def fsanitize_cfi_cross_dso
	  : Flag<["-"], "fsanitize-cfi-cross-dso">,
	    HelpText<"Enable control flow integrity (CFI) checks for cross-DSO calls.">,
	    Group<f_clang_Group>;
	def fno_sanitize_cfi_cross_dso
	  : Flag<["-"], "fno-sanitize-cfi-cross-dso">,
	    Group<f_clang_Group>,
	    Flags<[CoreOption, DriverOption]>,
	    HelpText<"Disable control flow integrity (CFI) checks for cross-DSO calls.">;
	def fsanitize_cfi_icall_generalize_pointers
	  : Flag<["-"], "fsanitize-cfi-icall-generalize-pointers">,
	    HelpText<"Generalize pointers in CFI indirect call type signature checks">,
	    Group<f_clang_Group>;
	def fsanitize_cfi_canonical_jump_tables
	  : Flag<["-"], "fsanitize-cfi-canonical-jump-tables">,
	    HelpText<"Make the jump table addresses canonical in the symbol table">,
	    Group<f_clang_Group>;
	def fno_sanitize_cfi_canonical_jump_tables
	  : Flag<["-"], "fno-sanitize-cfi-canonical-jump-tables">,
	    Flags<[CoreOption, DriverOption]>,
	    Group<f_clang_Group>,
	    HelpText<"Do not make the jump table addresses canonical in the symbol table">;

	// Sanitizer statistics.
	def fsanitize_stats
	  : Flag<["-"], "fsanitize-stats">,
	    HelpText<"Enable sanitizer statistics gathering.">,
	    Group<f_clang_Group>;
	def fno_sanitize_stats
	  : Flag<["-"], "fno-sanitize-stats">,
	    Flags<[CoreOption, DriverOption]>,
	    Group<f_clang_Group>,
	    HelpText<"Disable sanitizer statistics gathering.">;
	// ThreadSanitizer instrumentation categories, each with an enable/disable pair.
	def fsanitize_thread_memory_access
	  : Flag<["-"], "fsanitize-thread-memory-access">,
	    HelpText<"Enable memory access instrumentation in ThreadSanitizer (default)">,
	    Group<f_clang_Group>;
	def fno_sanitize_thread_memory_access
	  : Flag<["-"], "fno-sanitize-thread-memory-access">,
	    Flags<[CoreOption, DriverOption]>,
	    Group<f_clang_Group>,
	    HelpText<"Disable memory access instrumentation in ThreadSanitizer">;
	def fsanitize_thread_func_entry_exit
	  : Flag<["-"], "fsanitize-thread-func-entry-exit">,
	    HelpText<"Enable function entry/exit instrumentation in ThreadSanitizer (default)">,
	    Group<f_clang_Group>;
	def fno_sanitize_thread_func_entry_exit
	  : Flag<["-"], "fno-sanitize-thread-func-entry-exit">,
	    Flags<[CoreOption, DriverOption]>,
	    Group<f_clang_Group>,
	    HelpText<"Disable function entry/exit instrumentation in ThreadSanitizer">;
	def fsanitize_thread_atomics
	  : Flag<["-"], "fsanitize-thread-atomics">,
	    HelpText<"Enable atomic operations instrumentation in ThreadSanitizer (default)">,
	    Group<f_clang_Group>;
	def fno_sanitize_thread_atomics
	  : Flag<["-"], "fno-sanitize-thread-atomics">,
	    Flags<[CoreOption, DriverOption]>,
	    Group<f_clang_Group>,
	    HelpText<"Disable atomic operations instrumentation in ThreadSanitizer">;

	// Path trimming applied to emitted check metadata.
	def fsanitize_undefined_strip_path_components_EQ
	  : Joined<["-"], "fsanitize-undefined-strip-path-components=">,
	    MetaVarName<"<number>">, Group<f_clang_Group>,
	    HelpText<"Strip (or keep only, if negative) a given number of path components "
	    "when emitting check metadata.">;

	} // end -f[no-]sanitize* flags

	// Fine-grained floating-point math flags, declared as explicit
	// positive/negative pairs. Only some of them carry CC1Option.
	def funsafe_math_optimizations
	  : Flag<["-"], "funsafe-math-optimizations">, Group<f_Group>;
	def fno_unsafe_math_optimizations
	  : Flag<["-"], "fno-unsafe-math-optimizations">, Group<f_Group>;
	def fassociative_math
	  : Flag<["-"], "fassociative-math">, Group<f_Group>;
	def fno_associative_math
	  : Flag<["-"], "fno-associative-math">, Group<f_Group>;
	def freciprocal_math
	  : Flag<["-"], "freciprocal-math">, Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"Allow division operations to be reassociated">;
	def fno_reciprocal_math
	  : Flag<["-"], "fno-reciprocal-math">, Group<f_Group>;
	def ffinite_math_only
	  : Flag<["-"], "ffinite-math-only">, Flags<[CC1Option]>, Group<f_Group>;
	def fno_finite_math_only
	  : Flag<["-"], "fno-finite-math-only">, Group<f_Group>;
	def fsigned_zeros
	  : Flag<["-"], "fsigned-zeros">, Group<f_Group>;
	def fno_signed_zeros
	  : Flag<["-"], "fno-signed-zeros">, Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"Allow optimizations that ignore the sign of floating point zeros">;
	def fhonor_nans
	  : Flag<["-"], "fhonor-nans">, Group<f_Group>;
	def fno_honor_nans
	  : Flag<["-"], "fno-honor-nans">, Group<f_Group>;
	def fhonor_infinities
	  : Flag<["-"], "fhonor-infinities">, Group<f_Group>;
	def fno_honor_infinities
	  : Flag<["-"], "fno-honor-infinities">, Group<f_Group>;
	// The original spelling of this option was the misspelt "infinites" [sic];
	// it remains accepted through these aliases.
	def : Flag<["-"], "fhonor-infinites">,
	  Alias<fhonor_infinities>;
	def : Flag<["-"], "fno-honor-infinites">,
	  Alias<fno_honor_infinities>;
	def frounding_math
	  : Flag<["-"], "frounding-math">, Flags<[CC1Option]>, Group<f_Group>;
	def fno_rounding_math
	  : Flag<["-"], "fno-rounding-math">, Flags<[CC1Option]>, Group<f_Group>;
	def ftrapping_math
	  : Flag<["-"], "ftrapping-math">, Flags<[CC1Option]>, Group<f_Group>;
	def fno_trapping_math
	  : Flag<["-"], "fno-trapping-math">, Flags<[CC1Option]>, Group<f_Group>;
	// -ffp-contract=<value>: chooses when fused floating-point operations may
	// be formed. The accepted values are enumerated in Values<>.
	// The '|' separators in the help text are written literally: TableGen
	// string literals have no `\|` escape, so a backslash before the bar is
	// rejected by the lexer.
	def ffp_contract : Joined<["-"], "ffp-contract=">, Group<f_Group>,
	  Flags<[CC1Option]>, HelpText<"Form fused FP ops (e.g. FMAs): fast (everywhere)"
	  " | on (according to FP_CONTRACT pragma) | off (never fuse). Default"
	  " is 'fast' for CUDA/HIP and 'on' otherwise.">, Values<"fast,on,off">;

	// Float-to-int cast overflow semantics (opt-out pair).
	defm strict_float_cast_overflow
	  : OptOutFFlag<"strict-float-cast-overflow",
	      "Assume that overflowing float-to-int casts are undefined (default)",
	      "Relax language rules and try to match the behavior of the target's native float-to-int conversion instructions">;

	def ffor_scope : Flag<["-"], "ffor-scope">,
	  Group<f_Group>;
	def fno_for_scope : Flag<["-"], "fno-for-scope">,
	  Group<f_Group>;

	// Preprocessor rewriting pairs; both are declared with empty help text.
	defm rewrite_imports
	  : OptInFFlag<"rewrite-imports", "">;
	defm rewrite_includes
	  : OptInFFlag<"rewrite-includes", "">;

	// Null-pointer-check deletion (opt-out pair).
	defm delete_null_pointer_checks
	  : OptOutFFlag<"delete-null-pointer-checks",
	      "Treat usage of null pointers as undefined behavior (default)",
	      "Do not treat usage of null pointers as undefined behavior">;

	// Rewrite map file, in separate-argument and joined `=` spellings.
	def frewrite_map_file : Separate<["-"], "frewrite-map-file">,
	  Flags<[DriverOption, CC1Option]>,
	  Group<f_Group>;
	def frewrite_map_file_EQ : Joined<["-"], "frewrite-map-file=">,
	  Flags<[DriverOption]>,
	  Group<f_Group>;

	defm use_line_directives
	  : OptInFFlag<"use-line-directives", "Use #line in preprocessed output">;

	// Language environment and GNU compatibility options.
	def ffreestanding : Flag<["-"], "ffreestanding">,
	  Flags<[CC1Option]>, Group<f_Group>,
	  HelpText<"Assert that the compilation takes place in a freestanding environment">;
	def fgnuc_version_EQ : Joined<["-"], "fgnuc-version=">,
	  Flags<[CC1Option, CoreOption]>, Group<f_Group>,
	  HelpText<"Sets various macros to claim compatibility with the given GCC version (default is 4.2.1)">;
	def fgnu_keywords : Flag<["-"], "fgnu-keywords">,
	  Flags<[CC1Option]>, Group<f_Group>,
	  HelpText<"Allow GNU-extension keywords regardless of language standard">;
	defm gnu89_inline
	  : OptInFFlag<"gnu89-inline", "Use the gnu89 inline semantics">;
	def fgnu_runtime : Flag<["-"], "fgnu-runtime">,
	  Group<f_Group>,
	  HelpText<"Generate output compatible with the standard GNU Objective-C runtime">;
	def fheinous_gnu_extensions : Flag<["-"], "fheinous-gnu-extensions">,
	  Flags<[CC1Option]>;
	def filelist : Separate<["-"], "filelist">,
	  Group<Link_Group>, Flags<[LinkerInput]>;
	// Forwarded to fapple_kext via Alias<>.
	def : Flag<["-"], "findirect-virtual-calls">,
	  Alias<fapple_kext>;

	// Inlining controls.
	def finline_functions : Flag<["-"], "finline-functions">,
	  Flags<[CC1Option]>, Group<f_clang_Group>,
	  HelpText<"Inline suitable functions">;
	def finline_hint_functions : Flag<["-"], "finline-hint-functions">,
	  Flags<[CC1Option]>, Group<f_clang_Group>,
	  HelpText<"Inline functions which are (explicitly or implicitly) marked inline">;
	def finline : Flag<["-"], "finline">,
	  Group<clang_ignored_f_Group>;

	// Instruction selector / pass manager / strict FP switches.
	def fglobal_isel : Flag<["-"], "fglobal-isel">,
	  Group<f_clang_Group>,
	  HelpText<"Enables the global instruction selector">;
	def fexperimental_isel : Flag<["-"], "fexperimental-isel">,
	  Alias<fglobal_isel>,
	  Group<f_clang_Group>;
	def fexperimental_new_pass_manager
	  : Flag<["-"], "fexperimental-new-pass-manager">,
	    Flags<[CC1Option]>, Group<f_clang_Group>,
	    HelpText<"Enables an experimental new pass manager in LLVM.">;
	def fexperimental_strict_floating_point
	  : Flag<["-"], "fexperimental-strict-floating-point">,
	    Flags<[CC1Option]>, Group<f_clang_Group>,
	    HelpText<"Enables experimental strict floating point in LLVM.">;

	// Character set options.
	def finput_charset_EQ : Joined<["-"], "finput-charset=">,
	  Group<f_Group>;
	def fexec_charset_EQ : Joined<["-"], "fexec-charset=">,
	  Group<f_Group>;

	// Function entry/exit instrumentation.
	def finstrument_functions : Flag<["-"], "finstrument-functions">,
	  Flags<[CC1Option]>, Group<f_Group>,
	  HelpText<"Generate calls to instrument function entry and exit">;
	def finstrument_functions_after_inlining
	  : Flag<["-"], "finstrument-functions-after-inlining">,
	    Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"Like -finstrument-functions, but insert the calls after inlining">;
	def finstrument_function_entry_bare
	  : Flag<["-"], "finstrument-function-entry-bare">,
	    Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"Instrument function entry only, after inlining, without arguments to the instrumentation call">;

	// Control-flow protection; the bare flag forwards to the `=` form with "full".
	def fcf_protection_EQ : Joined<["-"], "fcf-protection=">,
	  Group<f_Group>, Flags<[CoreOption, CC1Option]>,
	  Values<"return,branch,full,none">,
	  HelpText<"Instrument control-flow architecture protection. Options: return, branch, full, none.">;
	def fcf_protection : Flag<["-"], "fcf-protection">,
	  Alias<fcf_protection_EQ>, AliasArgs<["full"]>,
	  Flags<[CoreOption, CC1Option]>, Group<f_Group>,
	  HelpText<"Enable cf-protection in 'full' mode">;

	// XRay instrumentation options.
	defm xray_instrument
	  : OptInFFlag<"xray-instrument",
	      "Generate XRay instrumentation sleds on function entry and exit">;

	// Instruction threshold, declared both with and without a trailing '='.
	def fxray_instruction_threshold_EQ
	  : JoinedOrSeparate<["-"], "fxray-instruction-threshold=">,
	    Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"Sets the minimum function size to instrument with XRay">;
	def fxray_instruction_threshold_
	  : JoinedOrSeparate<["-"], "fxray-instruction-threshold">,
	    Flags<[CC1Option]>, Group<f_Group>;

	// Attribute list files; the always/never variants are marked DEPRECATED
	// in their help text.
	def fxray_always_instrument
	  : JoinedOrSeparate<["-"], "fxray-always-instrument=">,
	    Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"DEPRECATED: Filename defining the whitelist for imbuing the 'always instrument' XRay attribute.">;
	def fxray_never_instrument
	  : JoinedOrSeparate<["-"], "fxray-never-instrument=">,
	    Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"DEPRECATED: Filename defining the whitelist for imbuing the 'never instrument' XRay attribute.">;
	def fxray_attr_list
	  : JoinedOrSeparate<["-"], "fxray-attr-list=">,
	    Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"Filename defining the list of functions/types for imbuing XRay attributes.">;
	def fxray_modes
	  : JoinedOrSeparate<["-"], "fxray-modes=">,
	    Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"List of modes to link in by default into XRay instrumented binaries.">;

	// Multiclass-generated XRay flag pairs.
	defm xray_always_emit_customevents
	  : OptInFFlag<"xray-always-emit-customevents",
	      "Always emit __xray_customevent(...) calls even if the containing function is not always instrumented">;

	defm xray_always_emit_typedevents
	  : OptInFFlag<"xray-always-emit-typedevents",
	      "Always emit __xray_typedevent(...) calls even if the containing function is not always instrumented">;

	defm xray_ignore_loops
	  : OptInFFlag<"xray-ignore-loops",
	      "Don't instrument functions with loops unless they also meet the minimum function size">;
	defm xray_function_index
	  : OptOutFFlag<"xray-function-index", "",
	      "Omit function index section at the expense of single-function patching performance">;

	// Controls whether clang adds the link dependencies for XRay.
	def fxray_link_deps : Flag<["-"], "fxray-link-deps">, Group<f_Group>,
	  Flags<[CC1Option]>,
	  HelpText<"Tells clang to add the link dependencies for XRay.">;
	// Negative form. Its spelling has no '-' after "fno", unlike the other
	// -fno-* options in this file; the record name and spelling are kept
	// unchanged so existing users and references keep working.
	def fnoxray_link_deps : Flag<["-"], "fnoxray-link-deps">, Group<f_Group>,
	  Flags<[CC1Option]>;
	// Conventionally spelled negative form, forwarded to the record above.
	def : Flag<["-"], "fno-xray-link-deps">, Group<f_Group>,
	  Alias<fnoxray_link_deps>;

	// Selection of the XRay instrumentation points to emit.
	def fxray_instrumentation_bundle
	  : JoinedOrSeparate<["-"], "fxray-instrumentation-bundle=">,
	    Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"Select which XRay instrumentation points to emit. Options: all, none, function-entry, function-exit, function, custom. Default is 'all'. 'function' includes both 'function-entry' and 'function-exit'.">;

	// Bitfield access strategy (explicit positive/negative pair).
	def ffine_grained_bitfield_accesses
	  : Flag<["-"], "ffine-grained-bitfield-accesses">,
	    Flags<[CC1Option]>, Group<f_clang_Group>,
	    HelpText<"Use separate accesses for consecutive bitfield runs with legal widths and alignments.">;
	def fno_fine_grained_bitfield_accesses
	  : Flag<["-"], "fno-fine-grained-bitfield-accesses">,
	    Flags<[CC1Option]>, Group<f_clang_Group>,
	    HelpText<"Use large-integer access for consecutive bitfield runs.">;

	// Experimental relative C++ ABI vtables (explicit positive/negative pair).
	def fexperimental_relative_cxx_abi_vtables
	  : Flag<["-"], "fexperimental-relative-c++-abi-vtables">,
	    Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"Use the experimental C++ class ABI for classes with virtual tables">;
	def fno_experimental_relative_cxx_abi_vtables
	  : Flag<["-"], "fno-experimental-relative-c++-abi-vtables">,
	    Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"Do not use the experimental C++ class ABI for classes with virtual tables">;

	// Declared without any group or flags.
	def flat__namespace : Flag<["-"], "flat_namespace">;

	// Implicit vector bit-cast control; the bare flag forwards to the `=`
	// form with the argument "integer".
	def flax_vector_conversions_EQ : Joined<["-"], "flax-vector-conversions=">,
	  Flags<[CC1Option]>, Group<f_Group>,
	  Values<"none,integer,all">,
	  HelpText<"Enable implicit vector bit-casts">;
	def flax_vector_conversions : Flag<["-"], "flax-vector-conversions">,
	  Alias<flax_vector_conversions_EQ>, AliasArgs<["integer"]>,
	  Group<f_Group>;
	def flimited_precision_EQ : Joined<["-"], "flimited-precision=">,
	  Group<f_Group>;
	def fapple_link_rtlib : Flag<["-"], "fapple-link-rtlib">,
	  Group<f_Group>,
	  HelpText<"Force linking the clang builtins runtime library">;

	// Link-time optimization options.
	def flto_EQ : Joined<["-"], "flto=">,
	  Group<f_Group>, Flags<[CoreOption, CC1Option]>,
	  Values<"thin,full">,
	  HelpText<"Set LTO mode to either 'full' or 'thin'">;
	def flto : Flag<["-"], "flto">,
	  Group<f_Group>, Flags<[CoreOption, CC1Option]>,
	  HelpText<"Enable LTO in 'full' mode">;
	def fno_lto : Flag<["-"], "fno-lto">,
	  Group<f_Group>,
	  HelpText<"Disable LTO mode (default)">;
	def flto_jobs_EQ : Joined<["-"], "flto-jobs=">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Controls the backend parallelism of -flto=thin (default "
	  "of 0 means the number of threads will be derived from "
	  "the number of CPUs detected)">;
	def fthinlto_index_EQ : Joined<["-"], "fthinlto-index=">,
	  Group<f_Group>, Flags<[CoreOption, CC1Option]>,
	  HelpText<"Perform ThinLTO importing using provided function summary index">;
	def fthin_link_bitcode_EQ : Joined<["-"], "fthin-link-bitcode=">,
	  Group<f_Group>, Flags<[CoreOption, CC1Option]>,
	  HelpText<"Write minimized bitcode to <file> for the ThinLTO thin link only">;
	// Diagnostics formatting and constant merging.
	def fmacro_backtrace_limit_EQ : Joined<["-"], "fmacro-backtrace-limit=">,
	  Flags<[DriverOption, CoreOption]>, Group<f_Group>;
	def fmerge_all_constants : Flag<["-"], "fmerge-all-constants">,
	  Flags<[CC1Option, CoreOption]>, Group<f_Group>,
	  HelpText<"Allow merging of constants">;
	def fmessage_length_EQ : Joined<["-"], "fmessage-length=">,
	  Flags<[CC1Option]>, Group<f_Group>,
	  HelpText<"Format message diagnostics so that they fit within N columns">;

	// Microsoft compatibility options.
	def fms_extensions : Flag<["-"], "fms-extensions">,
	  Flags<[CC1Option, CoreOption]>, Group<f_Group>,
	  HelpText<"Accept some non-standard constructs supported by the Microsoft compiler">;
	def fms_compatibility : Flag<["-"], "fms-compatibility">,
	  Flags<[CC1Option, CoreOption]>, Group<f_Group>,
	  HelpText<"Enable full Microsoft Visual C++ compatibility">;
	def fms_volatile : Flag<["-"], "fms-volatile">,
	  Flags<[CC1Option]>, Group<f_Group>;
	def fmsc_version : Joined<["-"], "fmsc-version=">,
	  Flags<[DriverOption, CoreOption]>, Group<f_Group>,
	  HelpText<"Microsoft compiler version number to report in _MSC_VER (0 = don't define it (default))">;
	def fms_compatibility_version : Joined<["-"], "fms-compatibility-version=">,
	  Flags<[CC1Option, CoreOption]>, Group<f_Group>,
	  HelpText<"Dot-separated value representing the Microsoft compiler "
	  "version number to report in _MSC_VER (0 = don't define it "
	  "(default))">;
	def fdelayed_template_parsing : Flag<["-"], "fdelayed-template-parsing">,
	  Flags<[CC1Option, CoreOption]>, Group<f_Group>,
	  HelpText<"Parse templated function definitions at the end of the "
	  "translation unit">;
	def fms_memptr_rep_EQ : Joined<["-"], "fms-memptr-rep=">,
	  Flags<[CC1Option]>, Group<f_Group>;
	// Module cache locations and pruning.
	def fmodules_cache_path : Joined<["-"], "fmodules-cache-path=">,
	  MetaVarName<"<directory>">,
	  Flags<[DriverOption, CC1Option]>, Group<i_Group>,
	  HelpText<"Specify the module cache path">;
	def fmodules_user_build_path : Separate<["-"], "fmodules-user-build-path">,
	  MetaVarName<"<directory>">,
	  Flags<[DriverOption, CC1Option]>, Group<i_Group>,
	  HelpText<"Specify the module user build path">;
	def fprebuilt_module_path : Joined<["-"], "fprebuilt-module-path=">,
	  MetaVarName<"<directory>">,
	  Flags<[DriverOption, CC1Option]>, Group<i_Group>,
	  HelpText<"Specify the prebuilt module path">;
	def fmodules_prune_interval : Joined<["-"], "fmodules-prune-interval=">,
	  MetaVarName<"<seconds>">,
	  Flags<[CC1Option]>, Group<i_Group>,
	  HelpText<"Specify the interval (in seconds) between attempts to prune the module cache">;
	def fmodules_prune_after : Joined<["-"], "fmodules-prune-after=">,
	  MetaVarName<"<seconds>">,
	  Flags<[CC1Option]>, Group<i_Group>,
	  HelpText<"Specify the interval (in seconds) after which a module file will be considered unused">;
	def fmodules_search_all : Flag<["-"], "fmodules-search-all">,
	  Flags<[DriverOption, CC1Option]>, Group<f_Group>,
	  HelpText<"Search even non-imported modules to resolve references">;

	// Build session bookkeeping.
	def fbuild_session_timestamp : Joined<["-"], "fbuild-session-timestamp=">,
	  MetaVarName<"<time since Epoch in seconds>">,
	  Flags<[CC1Option]>, Group<i_Group>,
	  HelpText<"Time when the current build session started">;
	def fbuild_session_file : Joined<["-"], "fbuild-session-file=">,
	  MetaVarName<"<file>">, Group<i_Group>,
	  HelpText<"Use the last modification time of <file> as the build session timestamp">;

	// Module validation behaviour.
	def fmodules_validate_once_per_build_session
	  : Flag<["-"], "fmodules-validate-once-per-build-session">,
	    Flags<[CC1Option]>, Group<i_Group>,
	    HelpText<"Don't verify input files for the modules if the module has been "
	    "successfully validated or loaded during this build session">;
	def fmodules_disable_diagnostic_validation
	  : Flag<["-"], "fmodules-disable-diagnostic-validation">,
	    Flags<[CC1Option]>, Group<i_Group>,
	    HelpText<"Disable validation of the diagnostic options when loading the module">;
	def fmodules_validate_system_headers
	  : Flag<["-"], "fmodules-validate-system-headers">,
	    Flags<[CC1Option]>, Group<i_Group>,
	    HelpText<"Validate the system headers that a module depends on when loading the module">;
	def fno_modules_validate_system_headers
	  : Flag<["-"], "fno-modules-validate-system-headers">,
	    Flags<[DriverOption]>, Group<i_Group>;

	// Content-based validation of AST / PCM input files.
	def fvalidate_ast_input_files_content
	  : Flag<["-"], "fvalidate-ast-input-files-content">,
	    Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"Compute and store the hash of input files used to build an AST."
	    " Files with mismatching mtime's are considered valid"
	    " if both contents is identical">;
	def fmodules_validate_input_files_content
	  : Flag<["-"], "fmodules-validate-input-files-content">,
	    Flags<[DriverOption]>, Group<f_Group>,
	    HelpText<"Validate PCM input files based on content if mtime differs">;
	// Negative counterpart of -fmodules-validate-input-files-content. The
	// spelling uses '-' after "fno", matching the other -fno-* options.
	def fno_modules_validate_input_files_content:
	  Flag <["-"], "fno-modules-validate-input-files-content">,
	  Group<f_Group>, Flags<[DriverOption]>;
	// The option was originally spelled with an underscore after "fno"
	// ("fno_modules-..."); that spelling stays accepted as a hidden alias so
	// existing command lines continue to parse.
	def : Flag <["-"], "fno_modules-validate-input-files-content">,
	  Group<f_Group>, Flags<[DriverOption, HelpHidden]>,
	  Alias<fno_modules_validate_input_files_content>;
	// Content-based validation of PCH input files.
	def fpch_validate_input_files_content
	  : Flag<["-"], "fpch-validate-input-files-content">,
	    Flags<[DriverOption]>, Group<f_Group>,
	    HelpText<"Validate PCH input files based on content if mtime differs">;
	// Negative counterpart of -fpch-validate-input-files-content. The
	// spelling uses '-' after "fno", matching the other -fno-* options.
	def fno_pch_validate_input_files_content:
	  Flag <["-"], "fno-pch-validate-input-files-content">,
	  Group<f_Group>, Flags<[DriverOption]>;
	// The option was originally spelled with an underscore after "fno"
	// ("fno_pch-..."); that spelling stays accepted as a hidden alias so
	// existing command lines continue to parse.
	def : Flag <["-"], "fno_pch-validate-input-files-content">,
	  Group<f_Group>, Flags<[DriverOption, HelpHidden]>,
	  Alias<fno_pch_validate_input_files_content>;
	// PCH template instantiation (explicit positive/negative pair).
	def fpch_instantiate_templates
	  : Flag<["-"], "fpch-instantiate-templates">,
	    Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"Instantiate templates already while building a PCH">;
	def fno_pch_instantiate_templates
	  : Flag<["-"], "fno-pch-instantiate-templates">,
	    Flags<[CC1Option]>, Group<f_Group>;
	// PCH codegen / debug-info pairs generated by OptInFFlag.
	defm pch_codegen
	  : OptInFFlag<"pch-codegen",
	      "Generate ", "Do not generate ",
	      "code for uses of this PCH that assumes an explicit object file will be built for the PCH">;
	defm pch_debuginfo
	  : OptInFFlag<"pch-debuginfo",
	      "Generate ", "Do not generate ",
	      "debug info for types in an object file built from this PCH and do not generate them elsewhere">;

	// Modules language feature and module map discovery.
	def fmodules : Flag<["-"], "fmodules">,
	  Flags<[DriverOption, CC1Option]>, Group<f_Group>,
	  HelpText<"Enable the 'modules' language feature">;
	def fimplicit_module_maps : Flag<["-"], "fimplicit-module-maps">,
	  Flags<[DriverOption, CC1Option]>, Group<f_Group>,
	  HelpText<"Implicitly search the file system for module map files.">;
	def fmodules_ts : Flag<["-"], "fmodules-ts">,
	  Flags<[CC1Option]>, Group<f_Group>,
	  HelpText<"Enable support for the C++ Modules TS">;
	def fmodule_maps : Flag<["-"], "fmodule-maps">,
	  Alias<fimplicit_module_maps>;

	// Module naming; the two Separate<> spellings forward to the `=` form.
	def fmodule_name_EQ : Joined<["-"], "fmodule-name=">,
	  MetaVarName<"<name>">,
	  Flags<[DriverOption,CC1Option]>, Group<f_Group>,
	  HelpText<"Specify the name of the module to build">;
	def fmodule_name : Separate<["-"], "fmodule-name">,
	  Alias<fmodule_name_EQ>;
	def fmodule_implementation_of : Separate<["-"], "fmodule-implementation-of">,
	  Alias<fmodule_name_EQ>, Flags<[CC1Option]>;
	def fsystem_module : Flag<["-"], "fsystem-module">,
	  Flags<[CC1Option]>,
	  HelpText<"Build this module as a system module. Only used with -emit-module">;

	// Module map and prebuilt module files.
	def fmodule_map_file : Joined<["-"], "fmodule-map-file=">,
	  MetaVarName<"<file>">,
	  Flags<[DriverOption,CC1Option]>, Group<f_Group>,
	  HelpText<"Load this module map file">;
	def fmodule_file : Joined<["-"], "fmodule-file=">,
	  MetaVarName<"[<name>=]<file>">,
	  Flags<[DriverOption,CC1Option]>, Group<i_Group>,
	  HelpText<"Specify the mapping of module name to precompiled module file, or load a module file if name is omitted.">;
	def fmodules_ignore_macro : Joined<["-"], "fmodules-ignore-macro=">,
	  Flags<[CC1Option]>, Group<f_Group>,
	  HelpText<"Ignore the definition of the given macro when building and loading modules">;

	// Declared-use checking.
	def fmodules_decluse : Flag<["-"], "fmodules-decluse">,
	  Flags<[DriverOption,CC1Option]>, Group<f_Group>,
	  HelpText<"Require declaration of modules used within a module">;
	def fmodules_strict_decluse : Flag<["-"], "fmodules-strict-decluse">,
	  Flags<[DriverOption,CC1Option]>, Group<f_Group>,
	  HelpText<"Like -fmodules-decluse but requires all headers to be in modules">;
	def fno_modules_search_all : Flag<["-"], "fno-modules-search-all">,
	  Flags<[DriverOption, CC1Option]>, Group<f_Group>;
	def fno_implicit_modules : Flag<["-"], "fno-implicit-modules">,
	  Flags<[DriverOption, CC1Option]>, Group<f_Group>;
	def fretain_comments_from_system_headers
	  : Flag<["-"], "fretain-comments-from-system-headers">,
	    Flags<[CC1Option]>, Group<f_Group>;

	// Miscellaneous flags that only carry a group.
	def fmudflapth : Flag<["-"], "fmudflapth">,
	  Group<f_Group>;
	def fmudflap : Flag<["-"], "fmudflap">,
	  Group<f_Group>;
	def fnested_functions : Flag<["-"], "fnested-functions">,
	  Group<f_Group>;
	def fnext_runtime : Flag<["-"], "fnext-runtime">,
	  Group<f_Group>;

	// Start of the explicitly declared -fno-* options.
	def fno_apple_pragma_pack : Flag<["-"], "fno-apple-pragma-pack">,
	  Group<f_Group>;
	def fno_asm : Flag<["-"], "fno-asm">,
	  Group<f_Group>;
	def fno_asynchronous_unwind_tables
	  : Flag<["-"], "fno-asynchronous-unwind-tables">,
	    Group<f_Group>;
	def fno_assume_sane_operator_new
	  : Flag<["-"], "fno-assume-sane-operator-new">,
	    Flags<[CC1Option]>, Group<f_Group>,
	    HelpText<"Don't assume that C++'s global operator new can't alias any pointer">;
	def fno_borland_extensions : Flag<["-"], "fno-borland-extensions">,
	  Group<f_Group>;
	// fno_builtin is a plain flag; fno_builtin_ is Joined and takes a suffix.
	def fno_builtin : Flag<["-"], "fno-builtin">,
	  Flags<[CC1Option, CoreOption]>, Group<f_Group>,
	  HelpText<"Disable implicit builtin knowledge of functions">;
	def fno_builtin_ : Joined<["-"], "fno-builtin-">,
	  Flags<[CC1Option, CoreOption]>, Group<f_Group>,
	  HelpText<"Disable implicit builtin knowledge of a specific function">;
	def fno_diagnostics_color : Flag<["-"], "fno-diagnostics-color">,
	  Flags<[CoreOption, DriverOption]>, Group<f_Group>;
	def fno_common : Flag<["-"], "fno-common">,
	  Flags<[CC1Option]>, Group<f_Group>,
	  HelpText<"Compile common globals like normal definitions">;
	// -fno-constant-cfstrings: f_Group flag, marked CC1Option. The help text
	// names the framework whose constant strings (CFString) are affected:
	// CoreFoundation.
	def fno_constant_cfstrings : Flag<["-"], "fno-constant-cfstrings">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Disable creation of CoreFoundation-type constant strings">;
	def fno_cxx_modules : Flag <["-"], "fno-cxx-modules">, Group<f_Group>,
	Flags<[DriverOption]>;
	def fno_diagnostics_fixit_info : Flag<["-"], "fno-diagnostics-fixit-info">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Do not include fixit information in diagnostics">;
	def fno_diagnostics_show_hotness : Flag<["-"], "fno-diagnostics-show-hotness">, Group<f_Group>;
	def fno_diagnostics_show_option : Flag<["-"], "fno-diagnostics-show-option">, Group<f_Group>, Flags<[CC1Option]>;
	def fno_diagnostics_show_note_include_stack : Flag<["-"], "fno-diagnostics-show-note-include-stack">,
	Flags<[CC1Option]>, Group<f_Group>;
	def fdigraphs : Flag<["-"], "fdigraphs">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable alternative token representations '<:', ':>', '<%', '%>', '%:', '%:%:' (default)">;
	def fno_digraphs : Flag<["-"], "fno-digraphs">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Disallow alternative token representations '<:', ':>', '<%', '%>', '%:', '%:%:'">;
	def fno_declspec : Flag<["-"], "fno-declspec">, Group<f_clang_Group>,
	HelpText<"Disallow __declspec as a keyword">, Flags<[CC1Option]>;
	def fno_dollars_in_identifiers : Flag<["-"], "fno-dollars-in-identifiers">, Group<f_Group>,
	HelpText<"Disallow '$' in identifiers">, Flags<[CC1Option]>;
	def fno_elide_constructors : Flag<["-"], "fno-elide-constructors">, Group<f_Group>,
	HelpText<"Disable C++ copy constructor elision">, Flags<[CC1Option]>;
	def fno_eliminate_unused_debug_symbols : Flag<["-"], "fno-eliminate-unused-debug-symbols">, Group<f_Group>;
	def fno_gnu_keywords : Flag<["-"], "fno-gnu-keywords">, Group<f_Group>, Flags<[CC1Option]>;
	def fno_inline_functions : Flag<["-"], "fno-inline-functions">, Group<f_clang_Group>, Flags<[CC1Option]>;
	def fno_inline : Flag<["-"], "fno-inline">, Group<f_clang_Group>, Flags<[CC1Option]>;
	def fno_global_isel : Flag<["-"], "fno-global-isel">, Group<f_clang_Group>,
	HelpText<"Disables the global instruction selector">;
	def fno_experimental_isel : Flag<["-"], "fno-experimental-isel">, Group<f_clang_Group>,
	Alias<fno_global_isel>;
	def fno_experimental_new_pass_manager : Flag<["-"], "fno-experimental-new-pass-manager">,
	Group<f_clang_Group>, Flags<[CC1Option]>,
	HelpText<"Disables an experimental new pass manager in LLVM.">;
	def fveclib : Joined<["-"], "fveclib=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Use the given vector functions library">, Values<"Accelerate,MASSV,SVML,none">;
	def fno_lax_vector_conversions : Flag<["-"], "fno-lax-vector-conversions">, Group<f_Group>,
	Alias<flax_vector_conversions_EQ>, AliasArgs<["none"]>;
	def fno_merge_all_constants : Flag<["-"], "fno-merge-all-constants">, Group<f_Group>,
	HelpText<"Disallow merging of constants">;
	def fno_modules : Flag <["-"], "fno-modules">, Group<f_Group>,
	Flags<[DriverOption]>;
	def fno_implicit_module_maps : Flag <["-"], "fno-implicit-module-maps">, Group<f_Group>,
	Flags<[DriverOption]>;
	def fno_module_maps : Flag <["-"], "fno-module-maps">, Alias<fno_implicit_module_maps>;
	def fno_modules_decluse : Flag <["-"], "fno-modules-decluse">, Group<f_Group>,
	Flags<[DriverOption]>;
	// Negative counterpart of fmodules_strict_decluse; carries only DriverOption.
	// NOTE(review): the spelling is "-fno-strict-modules-decluse", which does not
	// mirror the positive "-fmodules-strict-decluse" — confirm before assuming the
	// usual "-fno-<name>" pattern; renaming would change the accepted spelling.
	def fno_modules_strict_decluse : Flag <["-"], "fno-strict-modules-decluse">, Group<f_Group>,
	Flags<[DriverOption]>;
	def fimplicit_modules : Flag <["-"], "fimplicit-modules">, Group<f_Group>,
	Flags<[DriverOption]>;
	def fmodule_file_deps : Flag <["-"], "fmodule-file-deps">, Group<f_Group>,
	Flags<[DriverOption]>;
	def fno_module_file_deps : Flag <["-"], "fno-module-file-deps">, Group<f_Group>,
	Flags<[DriverOption]>;
	def fno_ms_extensions : Flag<["-"], "fno-ms-extensions">, Group<f_Group>,
	Flags<[CoreOption]>;
	def fno_ms_compatibility : Flag<["-"], "fno-ms-compatibility">, Group<f_Group>,
	Flags<[CoreOption]>;
	def fno_delayed_template_parsing : Flag<["-"], "fno-delayed-template-parsing">, Group<f_Group>,
	HelpText<"Disable delayed template parsing">,
	Flags<[DriverOption, CoreOption]>;
	def fno_objc_exceptions: Flag<["-"], "fno-objc-exceptions">, Group<f_Group>;
	def fno_objc_legacy_dispatch : Flag<["-"], "fno-objc-legacy-dispatch">, Group<f_Group>;
	def fno_objc_weak : Flag<["-"], "fno-objc-weak">, Group<f_Group>, Flags<[CC1Option]>;
	def fno_omit_frame_pointer : Flag<["-"], "fno-omit-frame-pointer">, Group<f_Group>;
	def fno_operator_names : Flag<["-"], "fno-operator-names">, Group<f_Group>,
	HelpText<"Do not treat C++ operator name keywords as synonyms for operators">,
	Flags<[CC1Option]>;
	def fno_pascal_strings : Flag<["-"], "fno-pascal-strings">, Group<f_Group>;
	def fno_short_enums : Flag<["-"], "fno-short-enums">, Group<f_Group>;
	def fno_show_source_location : Flag<["-"], "fno-show-source-location">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Do not include source location information with diagnostics">;
	def fdiagnostics_absolute_paths : Flag<["-"], "fdiagnostics-absolute-paths">, Group<f_Group>,
	Flags<[CC1Option, CoreOption]>, HelpText<"Print absolute paths in diagnostics">;
	def fno_spell_checking : Flag<["-"], "fno-spell-checking">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Disable spell-checking">;
	def fno_stack_protector : Flag<["-"], "fno-stack-protector">, Group<f_Group>,
	HelpText<"Disable the use of stack protectors">;
	def fno_strict_aliasing : Flag<["-"], "fno-strict-aliasing">, Group<f_Group>,
	Flags<[DriverOption, CoreOption]>;
	def fstruct_path_tbaa : Flag<["-"], "fstruct-path-tbaa">, Group<f_Group>;
	def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group<f_Group>;
	def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group<f_Group>;
	def fno_strict_vtable_pointers: Flag<["-"], "fno-strict-vtable-pointers">,
	Group<f_Group>;
	def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group<f_Group>;
	// -fno-temp-file: f_Group flag marked CC1Option and CoreOption.
	def fno_temp_file
	  : Flag<["-"], "fno-temp-file">,
	    Group<f_Group>,
	    Flags<[CC1Option, CoreOption]>,
	    HelpText<"Directly create compilation output files. This may lead to incorrect incremental builds if the compiler crashes">;
	def fno_threadsafe_statics : Flag<["-"], "fno-threadsafe-statics">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Do not emit code to make initialization of local statics thread safe">;
	def fno_use_cxa_atexit : Flag<["-"], "fno-use-cxa-atexit">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Don't use __cxa_atexit for calling destructors">;
	def fno_register_global_dtors_with_atexit : Flag<["-"], "fno-register-global-dtors-with-atexit">, Group<f_Group>,
	HelpText<"Don't use atexit or __cxa_atexit to register global destructors">;
	def fno_unit_at_a_time : Flag<["-"], "fno-unit-at-a-time">, Group<f_Group>;
	def fno_unwind_tables : Flag<["-"], "fno-unwind-tables">, Group<f_Group>;
	def fno_verbose_asm : Flag<["-"], "fno-verbose-asm">, Group<f_Group>, Flags<[CC1Option]>;
	def fno_working_directory : Flag<["-"], "fno-working-directory">, Group<f_Group>;
	def fno_wrapv : Flag<["-"], "fno-wrapv">, Group<f_Group>;
	def fobjc_arc : Flag<["-"], "fobjc-arc">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Synthesize retain and release calls for Objective-C pointers">;
	def fno_objc_arc : Flag<["-"], "fno-objc-arc">, Group<f_Group>;
	def fobjc_convert_messages_to_runtime_calls :
	Flag<["-"], "fobjc-convert-messages-to-runtime-calls">, Group<f_Group>;
	def fno_objc_convert_messages_to_runtime_calls :
	Flag<["-"], "fno-objc-convert-messages-to-runtime-calls">, Group<f_Group>, Flags<[CC1Option]>;
	def fobjc_arc_exceptions : Flag<["-"], "fobjc-arc-exceptions">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Use EH-safe code when synthesizing retains and releases in -fobjc-arc">;
	def fno_objc_arc_exceptions : Flag<["-"], "fno-objc-arc-exceptions">, Group<f_Group>;
	def fobjc_atdefs : Flag<["-"], "fobjc-atdefs">, Group<clang_ignored_f_Group>;
	def fobjc_call_cxx_cdtors : Flag<["-"], "fobjc-call-cxx-cdtors">, Group<clang_ignored_f_Group>;
	def fobjc_exceptions: Flag<["-"], "fobjc-exceptions">, Group<f_Group>,
	HelpText<"Enable Objective-C exceptions">, Flags<[CC1Option]>;
	def fapplication_extension : Flag<["-"], "fapplication-extension">,
	Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Restrict code to those available for App Extensions">;
	def fno_application_extension : Flag<["-"], "fno-application-extension">,
	Group<f_Group>;
	def frelaxed_template_template_args : Flag<["-"], "frelaxed-template-template-args">,
	Flags<[CC1Option]>, HelpText<"Enable C++17 relaxed template template argument matching">,
	Group<f_Group>;
	def fno_relaxed_template_template_args : Flag<["-"], "fno-relaxed-template-template-args">,
	Group<f_Group>;
	def fsized_deallocation : Flag<["-"], "fsized-deallocation">, Flags<[CC1Option]>,
	HelpText<"Enable C++14 sized global deallocation functions">, Group<f_Group>;
	def fno_sized_deallocation: Flag<["-"], "fno-sized-deallocation">, Group<f_Group>;
	def faligned_allocation : Flag<["-"], "faligned-allocation">, Flags<[CC1Option]>,
	HelpText<"Enable C++17 aligned allocation functions">, Group<f_Group>;
	def fno_aligned_allocation: Flag<["-"], "fno-aligned-allocation">,
	Group<f_Group>, Flags<[CC1Option]>;
	// -fnew-alignment=<align>: joined-value option in f_Group, marked CC1Option.
	def fnew_alignment_EQ
	  : Joined<["-"], "fnew-alignment=">,
	    Group<f_Group>, Flags<[CC1Option]>,
	    MetaVarName<"<align>">,
	    HelpText<"Specifies the largest alignment guaranteed by '::operator new(size_t)'">;
	def : Separate<["-"], "fnew-alignment">, Alias<fnew_alignment_EQ>;
	def : Flag<["-"], "faligned-new">, Alias<faligned_allocation>;
	def : Flag<["-"], "fno-aligned-new">, Alias<fno_aligned_allocation>;
	def faligned_new_EQ : Joined<["-"], "faligned-new=">;

	def fobjc_legacy_dispatch : Flag<["-"], "fobjc-legacy-dispatch">, Group<f_Group>;
	def fobjc_new_property : Flag<["-"], "fobjc-new-property">, Group<clang_ignored_f_Group>;
	def fobjc_infer_related_result_type : Flag<["-"], "fobjc-infer-related-result-type">,
	Group<f_Group>;
	// Negative counterpart of fobjc_infer_related_result_type; marked CC1Option.
	def fno_objc_infer_related_result_type
	  : Flag<["-"], "fno-objc-infer-related-result-type">,
	    Group<f_Group>, Flags<[CC1Option]>,
	    HelpText<"do not infer Objective-C related result type based on method family">;
	def fobjc_link_runtime: Flag<["-"], "fobjc-link-runtime">, Group<f_Group>;
	def fobjc_weak : Flag<["-"], "fobjc-weak">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable ARC-style weak references in Objective-C">;

	// Objective-C ABI options.
	def fobjc_runtime_EQ : Joined<["-"], "fobjc-runtime=">, Group<f_Group>, Flags<[CC1Option, CoreOption]>,
	HelpText<"Specify the target Objective-C runtime kind and version">;
	def fobjc_abi_version_EQ : Joined<["-"], "fobjc-abi-version=">, Group<f_Group>;
	def fobjc_nonfragile_abi_version_EQ : Joined<["-"], "fobjc-nonfragile-abi-version=">, Group<f_Group>;
	def fobjc_nonfragile_abi : Flag<["-"], "fobjc-nonfragile-abi">, Group<f_Group>;
	def fno_objc_nonfragile_abi : Flag<["-"], "fno-objc-nonfragile-abi">, Group<f_Group>;

	def fobjc_sender_dependent_dispatch : Flag<["-"], "fobjc-sender-dependent-dispatch">, Group<f_Group>;
	def fomit_frame_pointer : Flag<["-"], "fomit-frame-pointer">, Group<f_Group>;
	def fopenmp : Flag<["-"], "fopenmp">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>,
	HelpText<"Parse OpenMP pragmas and generate parallel code.">;
	def fno_openmp : Flag<["-"], "fno-openmp">, Group<f_Group>, Flags<[NoArgumentUnused]>;
	def fopenmp_version_EQ : Joined<["-"], "fopenmp-version=">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>;
	def fopenmp_EQ : Joined<["-"], "fopenmp=">, Group<f_Group>;
	def fopenmp_use_tls : Flag<["-"], "fopenmp-use-tls">, Group<f_Group>,
	Flags<[NoArgumentUnused, HelpHidden]>;
	// Negative form of -fopenmp-use-tls; hidden from help and marked CC1Option.
	// NOTE(review): spelled "-fnoopenmp-use-tls" with no hyphen after "fno",
	// unlike -fno-openmp and -fno-openmp-simd in this file — presumably kept for
	// compatibility; confirm before renaming.
	def fnoopenmp_use_tls : Flag<["-"], "fnoopenmp-use-tls">, Group<f_Group>,
	Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
	def fopenmp_targets_EQ : CommaJoined<["-"], "fopenmp-targets=">, Flags<[DriverOption, CC1Option]>,
	HelpText<"Specify comma-separated list of triples OpenMP offloading targets to be supported">;
	def fopenmp_relocatable_target : Flag<["-"], "fopenmp-relocatable-target">,
	Group<f_Group>, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
	// Negative form of -fopenmp-relocatable-target; hidden from help.
	// NOTE(review): spelled "-fnoopenmp-relocatable-target" with no hyphen after
	// "fno", unlike -fno-openmp-cuda-mode and similar flags in this file —
	// presumably kept for compatibility; confirm before renaming.
	def fnoopenmp_relocatable_target : Flag<["-"], "fnoopenmp-relocatable-target">,
	Group<f_Group>, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
	def fopenmp_simd : Flag<["-"], "fopenmp-simd">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>,
	HelpText<"Emit OpenMP code only for SIMD-based constructs.">;
	def fopenmp_enable_irbuilder : Flag<["-"], "fopenmp-enable-irbuilder">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
	HelpText<"Use the experimental OpenMP-IR-Builder codegen path.">;
	def fno_openmp_simd : Flag<["-"], "fno-openmp-simd">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>;
	def fopenmp_cuda_mode : Flag<["-"], "fopenmp-cuda-mode">, Group<f_Group>,
	Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
	def fno_openmp_cuda_mode : Flag<["-"], "fno-openmp-cuda-mode">, Group<f_Group>,
	Flags<[NoArgumentUnused, HelpHidden]>;
	def fopenmp_cuda_force_full_runtime : Flag<["-"], "fopenmp-cuda-force-full-runtime">, Group<f_Group>,
	Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
	def fno_openmp_cuda_force_full_runtime : Flag<["-"], "fno-openmp-cuda-force-full-runtime">, Group<f_Group>,
	Flags<[NoArgumentUnused, HelpHidden]>;
	def fopenmp_cuda_number_of_sm_EQ : Joined<["-"], "fopenmp-cuda-number-of-sm=">, Group<f_Group>,
	Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
	def fopenmp_cuda_blocks_per_sm_EQ : Joined<["-"], "fopenmp-cuda-blocks-per-sm=">, Group<f_Group>,
	Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
	def fopenmp_cuda_teams_reduction_recs_num_EQ : Joined<["-"], "fopenmp-cuda-teams-reduction-recs-num=">, Group<f_Group>,
	Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
	def fopenmp_optimistic_collapse : Flag<["-"], "fopenmp-optimistic-collapse">, Group<f_Group>,
	Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
	def fno_openmp_optimistic_collapse : Flag<["-"], "fno-openmp-optimistic-collapse">, Group<f_Group>,
	Flags<[NoArgumentUnused, HelpHidden]>;
	def fopenmp_cuda_parallel_target_regions : Flag<["-"], "fopenmp-cuda-parallel-target-regions">, Group<f_Group>,
	Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
	HelpText<"Support parallel execution of target regions on Cuda-based devices.">;
	def fno_openmp_cuda_parallel_target_regions : Flag<["-"], "fno-openmp-cuda-parallel-target-regions">, Group<f_Group>,
	Flags<[NoArgumentUnused, HelpHidden]>,
	HelpText<"Support only serial execution of target regions on Cuda-based devices.">;
	def static_openmp: Flag<["-"], "static-openmp">,
	HelpText<"Use the static host OpenMP runtime while linking.">;
	def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group<f_Group>;
	def foptimize_sibling_calls : Flag<["-"], "foptimize-sibling-calls">, Group<f_Group>;
	def fno_escaping_block_tail_calls : Flag<["-"], "fno-escaping-block-tail-calls">, Group<f_Group>, Flags<[CC1Option]>;
	def fescaping_block_tail_calls : Flag<["-"], "fescaping-block-tail-calls">, Group<f_Group>;
	def force__cpusubtype__ALL : Flag<["-"], "force_cpusubtype_ALL">;
	def force__flat__namespace : Flag<["-"], "force_flat_namespace">;
	def force__load : Separate<["-"], "force_load">;
	def force_addr : Joined<["-"], "fforce-addr">, Group<clang_ignored_f_Group>;
	def foutput_class_dir_EQ : Joined<["-"], "foutput-class-dir=">, Group<f_Group>;
	def fpack_struct : Flag<["-"], "fpack-struct">, Group<f_Group>;
	def fno_pack_struct : Flag<["-"], "fno-pack-struct">, Group<f_Group>;
	def fpack_struct_EQ : Joined<["-"], "fpack-struct=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Specify the default maximum struct packing alignment">;
	def fmax_type_align_EQ : Joined<["-"], "fmax-type-align=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Specify the maximum alignment to enforce on pointers lacking an explicit alignment">;
	def fno_max_type_align : Flag<["-"], "fno-max-type-align">, Group<f_Group>;
	def fpascal_strings : Flag<["-"], "fpascal-strings">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Recognize and construct Pascal-style string literals">;
	def fpatchable_function_entry_EQ : Joined<["-"], "fpatchable-function-entry=">, Group<f_Group>, Flags<[CC1Option]>,
	MetaVarName<"<N,M>">, HelpText<"Generate M NOPs before function entry and N-M NOPs after function entry">;
	def fpcc_struct_return : Flag<["-"], "fpcc-struct-return">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Override the default ABI to return all structs on the stack">;
	def fpch_preprocess : Flag<["-"], "fpch-preprocess">, Group<f_Group>;
	def fpic : Flag<["-"], "fpic">, Group<f_Group>;
	def fno_pic : Flag<["-"], "fno-pic">, Group<f_Group>;
	def fpie : Flag<["-"], "fpie">, Group<f_Group>;
	def fno_pie : Flag<["-"], "fno-pie">, Group<f_Group>;
	defm plt : OptOutFFlag<"plt", "",
	"Use GOT indirection instead of PLT to make external function calls (x86 only)">;
	defm ropi : OptInFFlag<"ropi", "Generate read-only position independent code (ARM only)">;
	defm rwpi : OptInFFlag<"rwpi", "Generate read-write position independent code (ARM only)">;
	def fplugin_EQ : Joined<["-"], "fplugin=">, Group<f_Group>, Flags<[DriverOption]>, MetaVarName<"<dsopath>">,
	HelpText<"Load the named plugin (dynamic shared object)">;
	def fpass_plugin_EQ : Joined<["-"], "fpass-plugin=">,
	Group<f_Group>, Flags<[CC1Option]>, MetaVarName<"<dsopath>">,
	HelpText<"Load pass plugin from a dynamic shared object file (only with new pass manager).">;
	defm preserve_as_comments : OptOutFFlag<"preserve-as-comments", "",
	"Do not preserve comments in inline assembly">;
	def fprofile_arcs : Flag<["-"], "fprofile-arcs">, Group<f_Group>;
	def fno_profile_arcs : Flag<["-"], "fno-profile-arcs">, Group<f_Group>;
	def framework : Separate<["-"], "framework">, Flags<[LinkerInput]>;
	def frandom_seed_EQ : Joined<["-"], "frandom-seed=">, Group<clang_ignored_f_Group>;
	def freg_struct_return : Flag<["-"], "freg-struct-return">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Override the default ABI to return small structs in registers">;
	defm rtti : OptOutFFlag<"rtti", "", "Disable generation of rtti information">;
	defm rtti_data : OptOutFFlag<"rtti-data", "", "Disable generation of RTTI data">;
	def : Flag<["-"], "fsched-interblock">, Group<clang_ignored_f_Group>;
	def fshort_enums : Flag<["-"], "fshort-enums">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Allocate to an enum type only as many bytes as it needs for the declared range of possible values">;
	def fchar8__t : Flag<["-"], "fchar8_t">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable C++ builtin type char8_t">;
	def fno_char8__t : Flag<["-"], "fno-char8_t">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Disable C++ builtin type char8_t">;
	def fshort_wchar : Flag<["-"], "fshort-wchar">, Group<f_Group>,
	HelpText<"Force wchar_t to be a short unsigned int">;
	def fno_short_wchar : Flag<["-"], "fno-short-wchar">, Group<f_Group>,
	HelpText<"Force wchar_t to be an unsigned int">;
	def fshow_overloads_EQ : Joined<["-"], "fshow-overloads=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Which overload candidates to show when overload resolution fails: "
	"best\|all; defaults to all">, Values<"best,all">;
	defm show_column : OptOutFFlag<"show-column", "", "Do not include column number on diagnostics">;
	def fshow_source_location : Flag<["-"], "fshow-source-location">, Group<f_Group>;
	def fspell_checking : Flag<["-"], "fspell-checking">, Group<f_Group>;
	def fspell_checking_limit_EQ : Joined<["-"], "fspell-checking-limit=">, Group<f_Group>;
	def fsigned_bitfields : Flag<["-"], "fsigned-bitfields">, Group<f_Group>;
	defm signed_char : OptOutFFlag<"signed-char", "char is signed", "char is unsigned">;
	def fsplit_stack : Flag<["-"], "fsplit-stack">, Group<f_Group>;
	def fstack_protector_all : Flag<["-"], "fstack-protector-all">, Group<f_Group>,
	HelpText<"Enable stack protectors for all functions">;
	def fstack_clash_protection : Flag<["-"], "fstack-clash-protection">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable stack clash protection">;
	-def fnostack_clash_protection : Flag<["-"], "fnostack-clash-protection">, Group<f_Group>,
	+def fno_stack_clash_protection : Flag<["-"], "fno-stack-clash-protection">, Group<f_Group>,
	HelpText<"Disable stack clash protection">;
	// -fstack-protector-strong: f_Group flag; the help text contrasts its
	// heuristic with plain -fstack-protector.
	def fstack_protector_strong
	  : Flag<["-"], "fstack-protector-strong">,
	    HelpText<"Enable stack protectors for some functions vulnerable to stack smashing. "
	             "Compared to -fstack-protector, this uses a stronger heuristic "
	             "that includes functions containing arrays of any size (and any type), "
	             "as well as any calls to alloca or the taking of an address from a local variable">,
	    Group<f_Group>;
	// -fstack-protector: f_Group flag; the help text describes the looser
	// heuristic keyed on ssp-buffer-size.
	def fstack_protector
	  : Flag<["-"], "fstack-protector">,
	    HelpText<"Enable stack protectors for some functions vulnerable to stack smashing. "
	             "This uses a loose heuristic which considers functions vulnerable "
	             "if they contain a char (or 8bit integer) array or constant sized calls to "
	             "alloca, which are of greater size than ssp-buffer-size (default: 8 bytes). "
	             "All variable sized calls to alloca are considered vulnerable">,
	    Group<f_Group>;
	// -ftrivial-auto-var-init=<value>: joined-value option in f_Group, marked
	// CC1Option and CoreOption; Values<> advertises the two listed choices.
	def ftrivial_auto_var_init
	  : Joined<["-"], "ftrivial-auto-var-init=">,
	    Group<f_Group>,
	    Flags<[CC1Option, CoreOption]>,
	    Values<"uninitialized,pattern">,
	    HelpText<"Initialize trivial automatic stack variables: uninitialized (default)"
	             " \| pattern">;
	// Flag with a deliberately long spelling; note that the def name drops the
	// "auto" that appears in the spelling, and that no Group<> is attached.
	// NOTE(review): presumably serves as an explicit acknowledgement required for
	// zero initialization of trivial automatic variables — the code consuming this
	// option is not visible here; confirm.
	def enable_trivial_var_init_zero : Flag<["-"], "enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang">,
	Flags<[CC1Option, CoreOption]>,
	HelpText<"Trivial automatic variable initialization to zero is only here for benchmarks, it'll eventually be removed, and I'm OK with that because I'm only using it to benchmark">;
	def ftrivial_auto_var_init_stop_after : Joined<["-"], "ftrivial-auto-var-init-stop-after=">, Group<f_Group>,
	Flags<[CC1Option, CoreOption]>, HelpText<"Stop initializing trivial automatic stack variables after the specified number of instances">;
	def fstandalone_debug : Flag<["-"], "fstandalone-debug">, Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Emit full debug info for all types used by the program">;
	def fno_standalone_debug : Flag<["-"], "fno-standalone-debug">, Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Limit debug information produced to reduce size of debug binary">;
	def flimit_debug_info : Flag<["-"], "flimit-debug-info">, Flags<[CoreOption]>, Alias<fno_standalone_debug>;
	def fno_limit_debug_info : Flag<["-"], "fno-limit-debug-info">, Flags<[CoreOption]>, Alias<fstandalone_debug>;
	def fdebug_macro : Flag<["-"], "fdebug-macro">, Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Emit macro debug information">;
	def fno_debug_macro : Flag<["-"], "fno-debug-macro">, Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Do not emit macro debug information">;
	def fstrict_aliasing : Flag<["-"], "fstrict-aliasing">, Group<f_Group>,
	Flags<[DriverOption, CoreOption]>;
	def fstrict_enums : Flag<["-"], "fstrict-enums">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable optimizations based on the strict definition of an enum's "
	"value range">;
	def fstrict_vtable_pointers: Flag<["-"], "fstrict-vtable-pointers">,
	Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable optimizations based on the strict rules for overwriting "
	"polymorphic C++ objects">;
	def fstrict_overflow : Flag<["-"], "fstrict-overflow">, Group<f_Group>;
	def fsyntax_only : Flag<["-"], "fsyntax-only">,
	Flags<[DriverOption,CoreOption,CC1Option]>, Group<Action_Group>;
	def ftabstop_EQ : Joined<["-"], "ftabstop=">, Group<f_Group>;
	def ftemplate_depth_EQ : Joined<["-"], "ftemplate-depth=">, Group<f_Group>;
	def ftemplate_depth_ : Joined<["-"], "ftemplate-depth-">, Group<f_Group>;
	def ftemplate_backtrace_limit_EQ : Joined<["-"], "ftemplate-backtrace-limit=">,
	Group<f_Group>;
	def foperator_arrow_depth_EQ : Joined<["-"], "foperator-arrow-depth=">,
	Group<f_Group>;

	def fsave_optimization_record : Flag<["-"], "fsave-optimization-record">,
	Group<f_Group>, HelpText<"Generate a YAML optimization record file">;
	def fsave_optimization_record_EQ : Joined<["-"], "fsave-optimization-record=">,
	Group<f_Group>, HelpText<"Generate an optimization record file in a specific format">,
	MetaVarName<"<format>">;
	def fno_save_optimization_record : Flag<["-"], "fno-save-optimization-record">,
	Group<f_Group>, Flags<[NoArgumentUnused]>;
	// -foptimization-record-file=<file>: joined-value option in f_Group naming
	// the optimization-remarks output file.
	def foptimization_record_file_EQ
	  : Joined<["-"], "foptimization-record-file=">,
	    Group<f_Group>,
	    MetaVarName<"<file>">,
	    HelpText<"Specify the output name of the file containing the optimization remarks. Implies -fsave-optimization-record. On Darwin platforms, this cannot be used with multiple -arch <arch> options.">;
	def foptimization_record_passes_EQ : Joined<["-"], "foptimization-record-passes=">,
	Group<f_Group>,
	HelpText<"Only include passes which match a specified regular expression in the generated optimization record (by default, include all passes)">,
	MetaVarName<"<regex>">;

	def ftest_coverage : Flag<["-"], "ftest-coverage">, Group<f_Group>;
	def fno_test_coverage : Flag<["-"], "fno-test-coverage">, Group<f_Group>;
	def fvectorize : Flag<["-"], "fvectorize">, Group<f_Group>,
	HelpText<"Enable the loop vectorization passes">;
	def fno_vectorize : Flag<["-"], "fno-vectorize">, Group<f_Group>;
	def : Flag<["-"], "ftree-vectorize">, Alias<fvectorize>;
	def : Flag<["-"], "fno-tree-vectorize">, Alias<fno_vectorize>;
	def fslp_vectorize : Flag<["-"], "fslp-vectorize">, Group<f_Group>,
	HelpText<"Enable the superword-level parallelism vectorization passes">;
	def fno_slp_vectorize : Flag<["-"], "fno-slp-vectorize">, Group<f_Group>;
	def : Flag<["-"], "ftree-slp-vectorize">, Alias<fslp_vectorize>;
	def : Flag<["-"], "fno-tree-slp-vectorize">, Alias<fno_slp_vectorize>;
	// Bare -Wlarge-by-value-copy (no "=<value>"); hidden from help output.
	def Wlarge_by_value_copy_def
	  : Flag<["-"], "Wlarge-by-value-copy">,
	    Flags<[HelpHidden]>,
	    HelpText<"Warn if a function definition returns or accepts an object larger "
	             "in bytes than a given value">;
	def Wlarge_by_value_copy_EQ : Joined<["-"], "Wlarge-by-value-copy=">, Flags<[CC1Option]>;

	// These "special" warning flags are effectively processed as f_Group flags by the driver:
	// Just silence warnings about -Wlarger-than for now.
	def Wlarger_than_EQ : Joined<["-"], "Wlarger-than=">, Group<clang_ignored_f_Group>;
	def Wlarger_than_ : Joined<["-"], "Wlarger-than-">, Alias<Wlarger_than_EQ>;
	def Wframe_larger_than_EQ : Joined<["-"], "Wframe-larger-than=">, Group<f_Group>, Flags<[DriverOption]>;

	def : Flag<["-"], "fterminated-vtables">, Alias<fapple_kext>;
	def fthreadsafe_statics : Flag<["-"], "fthreadsafe-statics">, Group<f_Group>;
	def ftime_report : Flag<["-"], "ftime-report">, Group<f_Group>, Flags<[CC1Option]>;
	// -ftime-trace: time-profiler switch in f_Group, marked CC1Option and
	// CoreOption. HelpText is the short form; DocBrief is the longer text.
	def ftime_trace
	  : Flag<["-"], "ftime-trace">,
	    Group<f_Group>,
	    Flags<[CC1Option, CoreOption]>,
	    HelpText<"Turn on time profiler. Generates JSON file based on output filename.">,
	    DocBrief<[{
	Turn on time profiler. Generates JSON file based on output filename. Results
	can be analyzed with chrome://tracing or `Speedscope App
	<https://www.speedscope.app>`_ for flamegraph visualization.}]>;
	def ftime_trace_granularity_EQ : Joined<["-"], "ftime-trace-granularity=">, Group<f_Group>,
	HelpText<"Minimum time granularity (in microseconds) traced by time profiler">,
	Flags<[CC1Option, CoreOption]>;
	def ftlsmodel_EQ : Joined<["-"], "ftls-model=">, Group<f_Group>, Flags<[CC1Option]>;
	def ftrapv : Flag<["-"], "ftrapv">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Trap on integer overflow">;
	def ftrapv_handler_EQ : Joined<["-"], "ftrapv-handler=">, Group<f_Group>,
	MetaVarName<"<function name>">,
	HelpText<"Specify the function to be called on overflow">;
	def ftrapv_handler : Separate<["-"], "ftrapv-handler">, Group<f_Group>, Flags<[CC1Option]>;
	def ftrap_function_EQ : Joined<["-"], "ftrap-function=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Issue call to specified function rather than a trap instruction">;
	def funit_at_a_time : Flag<["-"], "funit-at-a-time">, Group<f_Group>;
	def funroll_loops : Flag<["-"], "funroll-loops">, Group<f_Group>,
	HelpText<"Turn on loop unroller">, Flags<[CC1Option]>;
	def fno_unroll_loops : Flag<["-"], "fno-unroll-loops">, Group<f_Group>,
	HelpText<"Turn off loop unroller">, Flags<[CC1Option]>;
	defm reroll_loops : OptInFFlag<"reroll-loops", "Turn on loop reroller">;
	def ftrigraphs : Flag<["-"], "ftrigraphs">, Group<f_Group>,
	HelpText<"Process trigraph sequences">, Flags<[CC1Option]>;
	def fno_trigraphs : Flag<["-"], "fno-trigraphs">, Group<f_Group>,
	HelpText<"Do not process trigraph sequences">, Flags<[CC1Option]>;
	def funsigned_bitfields : Flag<["-"], "funsigned-bitfields">, Group<f_Group>;
	def funsigned_char : Flag<["-"], "funsigned-char">, Group<f_Group>;
	// Negation of -funsigned-char. Placed in f_Group like funsigned_char so that
	// code handling the whole group treats both spellings the same way.
	def fno_unsigned_char : Flag<["-"], "fno-unsigned-char">, Group<f_Group>;
	def funwind_tables : Flag<["-"], "funwind-tables">, Group<f_Group>;
	def fuse_cxa_atexit : Flag<["-"], "fuse-cxa-atexit">, Group<f_Group>;
	def fregister_global_dtors_with_atexit : Flag<["-"], "fregister-global-dtors-with-atexit">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Use atexit or __cxa_atexit to register global destructors">;
	defm use_init_array : OptOutFFlag<"use-init-array", "", "Use .ctors/.dtors instead of .init_array/.fini_array">;
	def fno_var_tracking : Flag<["-"], "fno-var-tracking">, Group<clang_ignored_f_Group>;
	def fverbose_asm : Flag<["-"], "fverbose-asm">, Group<f_Group>,
	HelpText<"Generate verbose assembly output">;
	def dA : Flag<["-"], "dA">, Alias<fverbose_asm>;
	// -fvisibility=<value>: default symbol visibility for global declarations.
	// Values<> lists all four visibility kinds (not just hidden/default) so that
	// value completion generated from this record covers every accepted value.
	def fvisibility_EQ : Joined<["-"], "fvisibility=">, Group<f_Group>,
	  HelpText<"Set the default symbol visibility for all global declarations">,
	  Values<"default,hidden,internal,protected">;
	def fvisibility_inlines_hidden : Flag<["-"], "fvisibility-inlines-hidden">, Group<f_Group>,
	HelpText<"Give inline C++ member functions hidden visibility by default">,
	Flags<[CC1Option]>;
	def fvisibility_ms_compat : Flag<["-"], "fvisibility-ms-compat">, Group<f_Group>,
	HelpText<"Give global types 'default' visibility and global functions and "
	"variables 'hidden' visibility by default">;
	def fvisibility_global_new_delete_hidden : Flag<["-"], "fvisibility-global-new-delete-hidden">, Group<f_Group>,
	HelpText<"Give global C++ operator new and delete declarations hidden visibility">, Flags<[CC1Option]>;
	defm whole_program_vtables : OptInFFlag<"whole-program-vtables",
	"Enables whole-program vtable optimization. Requires -flto", "", "", [CoreOption]>;
	defm split_lto_unit : OptInFFlag<"split-lto-unit",
	"Enables splitting of the LTO unit", "", "", [CoreOption]>;
	defm force_emit_vtables : OptInFFlag<"force-emit-vtables",
	"Emits more virtual tables to improve devirtualization", "", "", [CoreOption]>;
	defm virtual_function_elimination : OptInFFlag<"virtual-function-elimination",
	"Enables dead virtual function elimination optimization. Requires -flto=full", "", "", [CoreOption]>;

	def fwrapv : Flag<["-"], "fwrapv">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Treat signed integer overflow as two's complement">;
	def fwritable_strings : Flag<["-"], "fwritable-strings">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Store string literals as writable data">;
	defm zero_initialized_in_bss : OptOutFFlag<"zero-initialized-in-bss", "", "Don't place zero initialized data in BSS">;
	defm function_sections : OptInFFlag<"function-sections", "Place each function in its own section">;
	// -fbasic-block-sections=<value>: joined-value option in f_Group, marked
	// CC1Option and CC1AsOption; Values<> enumerates the advertised choices.
	def fbasic_block_sections_EQ
	  : Joined<["-"], "fbasic-block-sections=">,
	    Group<f_Group>,
	    Flags<[CC1Option, CC1AsOption]>,
	    Values<"all,labels,none,list=">,
	    HelpText<"Place each function's basic blocks in unique sections (ELF Only) : all \| labels \| none \| list=<file>">,
	    DocBrief<[{Generate labels for each basic block or place each basic block or a subset of basic blocks in its own section.}]>;
	defm data_sections : OptInFFlag<"data-sections", "Place each data in its own section">;
	defm stack_size_section : OptInFFlag<"stack-size-section", "Emit section containing metadata on function stack sizes">;

	// Options controlling uniqueness of section and symbol names. The first two use
	// OptInFFlag with a single help string; unique_section_names uses OptOutFFlag with
	// an empty first help string and the text in the second.
	defm unique_basic_block_section_names : OptInFFlag<"unique-basic-block-section-names",
	"Use unique names for basic block sections (ELF Only)">;
	defm unique_internal_linkage_names : OptInFFlag<"unique-internal-linkage-names",
	"Uniqueify Internal Linkage Symbol Names by appending the MD5 hash of the module path">;
	defm unique_section_names : OptOutFFlag<"unique-section-names",
	"", "Don't use unique names for text and data sections">;

	// strict-return option, instantiated through OptOutFFlag. The first help-string
	// argument is left empty; the second carries the text.
	// NOTE(review): presumably the second string documents the -fno- form - confirm
	// against the OptOutFFlag definition.
	defm strict_return
	  : OptOutFFlag<"strict-return",
	                "",
	                "Don't treat control flow paths that fall off the end of a non-void function as unreachable">;

	// -fenable-matrix: an f_Group flag that is also tagged CC1Option.
	def fenable_matrix
	  : Flag<["-"], "fenable-matrix">,
	    HelpText<"Enable matrix data type and related builtin functions">,
	    Flags<[CC1Option]>,
	    Group<f_Group>;


	// Debug-info related -f options. Positive/negative pairs are declared as separate
	// Flag records here, except where OptInFFlag is used.
	def fdebug_types_section: Flag <["-"], "fdebug-types-section">, Group<f_Group>,
	HelpText<"Place debug types in their own section (ELF Only)">;
	def fno_debug_types_section: Flag<["-"], "fno-debug-types-section">, Group<f_Group>;
	defm debug_ranges_base_address : OptInFFlag<"debug-ranges-base-address",
	"Use DWARF base address selection entries in .debug_ranges">;
	// Only the negative form of the split-dwarf-inlining pair is tagged CC1Option.
	def fsplit_dwarf_inlining: Flag <["-"], "fsplit-dwarf-inlining">, Group<f_Group>,
	HelpText<"Provide minimal debug info in the object/executable to facilitate online symbolication/stack traces in the absence of .dwo/.dwp files when using Split DWARF">;
	def fno_split_dwarf_inlining: Flag<["-"], "fno-split-dwarf-inlining">, Group<f_Group>,
	Flags<[CC1Option]>;
	def fdebug_default_version: Joined<["-"], "fdebug-default-version=">, Group<f_Group>,
	HelpText<"Default DWARF version to use, if a -g option caused DWARF debug info to be produced">;
	// Path-remapping options. -fdebug-prefix-map= and -fmacro-prefix-map= carry CC1
	// flags; -ffile-prefix-map= has no Flags<> mixin. -fmacro-prefix-map= is in
	// Preprocessor_Group, the other two in f_Group.
	def fdebug_prefix_map_EQ
	: Joined<["-"], "fdebug-prefix-map=">, Group<f_Group>,
	Flags<[CC1Option,CC1AsOption]>,
	HelpText<"remap file source paths in debug info">;
	def ffile_prefix_map_EQ
	: Joined<["-"], "ffile-prefix-map=">, Group<f_Group>,
	HelpText<"remap file source paths in debug info and predefined preprocessor macros">;
	def fmacro_prefix_map_EQ
	: Joined<["-"], "fmacro-prefix-map=">, Group<Preprocessor_Group>, Flags<[CC1Option]>,
	HelpText<"remap file source paths in predefined preprocessor macros">;
	defm force_dwarf_frame : OptInFFlag<"force-dwarf-frame", "Always emit a debug frame section">;
	// Plain -g plus the debug-level options (gN_Group).
	def g_Flag : Flag<["-"], "g">, Group<g_Group>,
	HelpText<"Generate source-level debug information">;
	def gline_tables_only : Flag<["-"], "gline-tables-only">, Group<gN_Group>,
	Flags<[CoreOption]>, HelpText<"Emit debug line number tables only">;
	def gline_directives_only : Flag<["-"], "gline-directives-only">, Group<gN_Group>,
	Flags<[CoreOption]>, HelpText<"Emit debug line info directives only">;
	// -gmlt and -g1 are aliases of -gline-tables-only.
	def gmlt : Flag<["-"], "gmlt">, Alias<gline_tables_only>;
	def g0 : Flag<["-"], "g0">, Group<gN_Group>;
	def g1 : Flag<["-"], "g1">, Group<gN_Group>, Alias<gline_tables_only>;
	def g2 : Flag<["-"], "g2">, Group<gN_Group>;
	def g3 : Flag<["-"], "g3">, Group<gN_Group>;
	// -ggdb, -glldb and -gsce are in gTune_Group; -ggdb0..-ggdb3 are in ggdbN_Group.
	def ggdb : Flag<["-"], "ggdb">, Group<gTune_Group>;
	def ggdb0 : Flag<["-"], "ggdb0">, Group<ggdbN_Group>;
	def ggdb1 : Flag<["-"], "ggdb1">, Group<ggdbN_Group>;
	def ggdb2 : Flag<["-"], "ggdb2">, Group<ggdbN_Group>;
	def ggdb3 : Flag<["-"], "ggdb3">, Group<ggdbN_Group>;
	def glldb : Flag<["-"], "glldb">, Group<gTune_Group>;
	def gsce : Flag<["-"], "gsce">, Group<gTune_Group>;
	// Equivalent to our default dwarf version. Forces usual dwarf emission when
	// CodeView is enabled.
	def gdwarf : Flag<["-"], "gdwarf">, Group<g_Group>, Flags<[CoreOption]>,
	HelpText<"Generate source-level debug information with the default dwarf version">;
	// Explicit DWARF version selectors; unlike -gdwarf these carry no Flags<> mixin.
	def gdwarf_2 : Flag<["-"], "gdwarf-2">, Group<g_Group>,
	HelpText<"Generate source-level debug information with dwarf version 2">;
	def gdwarf_3 : Flag<["-"], "gdwarf-3">, Group<g_Group>,
	HelpText<"Generate source-level debug information with dwarf version 3">;
	def gdwarf_4 : Flag<["-"], "gdwarf-4">, Group<g_Group>,
	HelpText<"Generate source-level debug information with dwarf version 4">;
	def gdwarf_5 : Flag<["-"], "gdwarf-5">, Group<g_Group>,
	HelpText<"Generate source-level debug information with dwarf version 5">;

	// CodeView and inline-line-table options. None of these records has a Group<>
	// mixin; all are tagged CoreOption.
	def gcodeview : Flag<["-"], "gcodeview">,
	HelpText<"Generate CodeView debug information">,
	Flags<[CC1Option, CC1AsOption, CoreOption]>;
	// For the ghash pair the positive form is the CC1Option ...
	def gcodeview_ghash : Flag<["-"], "gcodeview-ghash">,
	HelpText<"Emit type record hashes in a .debug$H section">,
	Flags<[CC1Option, CoreOption]>;
	def gno_codeview_ghash : Flag<["-"], "gno-codeview-ghash">, Flags<[CoreOption]>;
	// ... whereas for inline line tables the negative form is the CC1Option.
	def ginline_line_tables : Flag<["-"], "ginline-line-tables">, Flags<[CoreOption]>;
	def gno_inline_line_tables : Flag<["-"], "gno-inline-line-tables">,
	Flags<[CC1Option, CoreOption]>, HelpText<"Don't emit inline line tables">;

	// Additional -g spellings without help text. -gstabs, -gcoff, -gxcoff and -gvms are
	// declared with the Joined kind and, like -gtoggle, carry the Unsupported flag.
	def gfull : Flag<["-"], "gfull">, Group<g_Group>;
	def gused : Flag<["-"], "gused">, Group<g_Group>;
	def gstabs : Joined<["-"], "gstabs">, Group<g_Group>, Flags<[Unsupported]>;
	def gcoff : Joined<["-"], "gcoff">, Group<g_Group>, Flags<[Unsupported]>;
	def gxcoff : Joined<["-"], "gxcoff">, Group<g_Group>, Flags<[Unsupported]>;
	def gvms : Joined<["-"], "gvms">, Group<g_Group>, Flags<[Unsupported]>;
	def gtoggle : Flag<["-"], "gtoggle">, Group<g_flags_Group>, Flags<[Unsupported]>;
	// Debug-info modifier flags (g_flags_Group), declared as explicit positive/negative pairs.
	def grecord_command_line : Flag<["-"], "grecord-command-line">,
	Group<g_flags_Group>;
	def gno_record_command_line : Flag<["-"], "gno-record-command-line">,
	Group<g_flags_Group>;
	// Anonymous records: the *-gcc-switches spellings are aliases of the pair above.
	def : Flag<["-"], "grecord-gcc-switches">, Alias<grecord_command_line>;
	def : Flag<["-"], "gno-record-gcc-switches">, Alias<gno_record_command_line>;
	def gstrict_dwarf : Flag<["-"], "gstrict-dwarf">, Group<g_flags_Group>;
	def gno_strict_dwarf : Flag<["-"], "gno-strict-dwarf">, Group<g_flags_Group>;
	// Both column-info forms are CoreOptions; only the negative form is also a CC1Option.
	def gcolumn_info : Flag<["-"], "gcolumn-info">, Group<g_flags_Group>, Flags<[CoreOption]>;
	def gno_column_info : Flag<["-"], "gno-column-info">, Group<g_flags_Group>, Flags<[CoreOption, CC1Option]>;
	// Split DWARF: a bare flag plus a Joined "=" form restricted to 'split' or 'single'.
	def gsplit_dwarf : Flag<["-"], "gsplit-dwarf">, Group<g_flags_Group>;
	def gsplit_dwarf_EQ : Joined<["-"], "gsplit-dwarf=">, Group<g_flags_Group>,
	HelpText<"Set DWARF fission mode to either 'split' or 'single'">,
	Values<"split,single">;
	// pubnames options: the positive forms are CC1Options, the negative forms are not.
	def ggnu_pubnames : Flag<["-"], "ggnu-pubnames">, Group<g_flags_Group>, Flags<[CC1Option]>;
	def gno_gnu_pubnames : Flag<["-"], "gno-gnu-pubnames">, Group<g_flags_Group>;
	def gpubnames : Flag<["-"], "gpubnames">, Group<g_flags_Group>, Flags<[CC1Option]>;
	def gno_pubnames : Flag<["-"], "gno-pubnames">, Group<g_flags_Group>;
	def gdwarf_aranges : Flag<["-"], "gdwarf-aranges">, Group<g_flags_Group>;
	// Note: -gmodules is placed in gN_Group, not g_flags_Group like its neighbours.
	def gmodules : Flag <["-"], "gmodules">, Group<gN_Group>,
	HelpText<"Generate debug info with external references to clang modules"
	" or precompiled headers">;
	// Debug-section compression: a bare -gz flag and a Joined -gz=<value> form that
	// share the same help text.
	def gz : Flag<["-"], "gz">, Group<g_flags_Group>,
	HelpText<"DWARF debug sections compression type">;
	def gz_EQ : Joined<["-"], "gz=">, Group<g_flags_Group>,
	HelpText<"DWARF debug sections compression type">;
	// Source embedding: the positive form is a CC1Option, the negative form a DriverOption.
	def gembed_source : Flag<["-"], "gembed-source">, Group<g_flags_Group>, Flags<[CC1Option]>,
	HelpText<"Embed source text in DWARF debug sections">;
	def gno_embed_source : Flag<["-"], "gno-embed-source">, Group<g_flags_Group>,
	Flags<[DriverOption]>,
	HelpText<"Restore the default behavior of not embedding source text in DWARF debug sections">;
	// The option text contains single underscores; the def name doubles each of them.
	def headerpad__max__install__names : Joined<["-"], "headerpad_max_install_names">;
	// Accepted with either a "-" or "--" prefix.
	def help : Flag<["-", "--"], "help">, Flags<[CC1Option,CC1AsOption]>,
	HelpText<"Display available options">;
	def ibuiltininc : Flag<["-"], "ibuiltininc">,
	HelpText<"Enable builtin #include directories even when -nostdinc is used "
	"before or after -ibuiltininc. "
	"Using -nobuiltininc after the option disables it">;
	// Header search path, implicit include and PCH options. Most are JoinedOrSeparate
	// records in clang_i_Group tagged CC1Option; exceptions are noted inline.
	def index_header_map : Flag<["-"], "index-header-map">, Flags<[CC1Option]>,
	HelpText<"Make the next included directory (-I or -F) an indexer header map">;
	def idirafter : JoinedOrSeparate<["-"], "idirafter">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Add directory to AFTER include search path">;
	def iframework : JoinedOrSeparate<["-"], "iframework">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Add directory to SYSTEM framework search path">;
	def iframeworkwithsysroot : JoinedOrSeparate<["-"], "iframeworkwithsysroot">,
	Group<clang_i_Group>,
	HelpText<"Add directory to SYSTEM framework search path, "
	"absolute paths are relative to -isysroot">,
	MetaVarName<"<directory>">, Flags<[CC1Option]>;
	def imacros : JoinedOrSeparate<["-", "--"], "imacros">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Include macros from file before parsing">, MetaVarName<"<file>">;
	def image__base : Separate<["-"], "image_base">;
	// `include` is a TableGen keyword, hence the trailing underscore in the def name;
	// EnumName<"include"> sets the enum name explicitly.
	def include_ : JoinedOrSeparate<["-", "--"], "include">, Group<clang_i_Group>, EnumName<"include">,
	MetaVarName<"<file>">, HelpText<"Include file before parsing">, Flags<[CC1Option]>;
	def include_pch : Separate<["-"], "include-pch">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Include precompiled header file">, MetaVarName<"<file>">;
	def relocatable_pch : Flag<["-", "--"], "relocatable-pch">, Flags<[CC1Option]>,
	HelpText<"Whether to build a relocatable precompiled header">;
	// -verify-pch is in Action_Group rather than clang_i_Group.
	def verify_pch : Flag<["-"], "verify-pch">, Group<Action_Group>, Flags<[CC1Option]>,
	HelpText<"Load and verify that a pre-compiled header file is not stale">;
	def init : Separate<["-"], "init">;
	def install__name : Separate<["-"], "install_name">;
	def iprefix : JoinedOrSeparate<["-"], "iprefix">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Set the -iwithprefix/-iwithprefixbefore prefix">, MetaVarName<"<dir>">;
	def iquote : JoinedOrSeparate<["-"], "iquote">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Add directory to QUOTE include search path">, MetaVarName<"<directory>">;
	def isysroot : JoinedOrSeparate<["-"], "isysroot">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Set the system root directory (usually /)">, MetaVarName<"<dir>">;
	def isystem : JoinedOrSeparate<["-"], "isystem">, Group<clang_i_Group>,
	Flags<[CC1Option]>,
	HelpText<"Add directory to SYSTEM include search path">, MetaVarName<"<directory>">;
	// -isystem-after is tagged DriverOption, not CC1Option.
	def isystem_after : JoinedOrSeparate<["-"], "isystem-after">,
	Group<clang_i_Group>, Flags<[DriverOption]>, MetaVarName<"<directory>">,
	HelpText<"Add directory to end of the SYSTEM include search path">;
	def iwithprefixbefore : JoinedOrSeparate<["-"], "iwithprefixbefore">, Group<clang_i_Group>,
	HelpText<"Set directory to include search path with prefix">, MetaVarName<"<dir>">,
	Flags<[CC1Option]>;
	def iwithprefix : JoinedOrSeparate<["-"], "iwithprefix">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Set directory to SYSTEM include search path with prefix">, MetaVarName<"<dir>">;
	def iwithsysroot : JoinedOrSeparate<["-"], "iwithsysroot">, Group<clang_i_Group>,
	HelpText<"Add directory to SYSTEM include search path, "
	"absolute paths are relative to -isysroot">, MetaVarName<"<directory>">,
	Flags<[CC1Option]>;
	def ivfsoverlay : JoinedOrSeparate<["-"], "ivfsoverlay">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Overlay the virtual filesystem described by file over the real file system">;
	// -imultilib is placed in gfortran_Group and has no help text.
	def imultilib : Separate<["-"], "imultilib">, Group<gfortran_Group>;
	// -keep_private_externs plus the options tagged LinkerInput. -l additionally carries
	// RenderJoined and is in Link_Group.
	def keep__private__externs : Flag<["-"], "keep_private_externs">;
	def l : JoinedOrSeparate<["-"], "l">, Flags<[LinkerInput, RenderJoined]>,
	Group<Link_Group>;
	def lazy__framework : Separate<["-"], "lazy_framework">, Flags<[LinkerInput]>;
	def lazy__library : Separate<["-"], "lazy_library">, Flags<[LinkerInput]>;
	// Endianness selection; -EL and -EB are aliases of the long spellings.
	def mlittle_endian : Flag<["-"], "mlittle-endian">, Flags<[DriverOption]>;
	def EL : Flag<["-"], "EL">, Alias<mlittle_endian>;
	def mbig_endian : Flag<["-"], "mbig-endian">, Flags<[DriverOption]>;
	def EB : Flag<["-"], "EB">, Alias<mbig_endian>;
	// -m16 / -m32 / -m64 / -mx32, all m_Group with DriverOption and CoreOption.
	// -mqdsp6-compat is declared in the middle of this run.
	def m16 : Flag<["-"], "m16">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	def m32 : Flag<["-"], "m32">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	def mqdsp6_compat : Flag<["-"], "mqdsp6-compat">, Group<m_Group>, Flags<[DriverOption,CC1Option]>,
	HelpText<"Enable hexagon-qdsp6 backward compatibility">;
	def m64 : Flag<["-"], "m64">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	def mx32 : Flag<["-"], "mx32">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	def mabi_EQ : Joined<["-"], "mabi=">, Group<m_Group>;
	def miamcu : Flag<["-"], "miamcu">, Group<m_Group>, Flags<[DriverOption, CoreOption]>,
	HelpText<"Use Intel MCU ABI">;
	def mno_iamcu : Flag<["-"], "mno-iamcu">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	// Alignment options placed in clang_ignored_m_Group.
	def malign_functions_EQ : Joined<["-"], "malign-functions=">, Group<clang_ignored_m_Group>;
	def malign_loops_EQ : Joined<["-"], "malign-loops=">, Group<clang_ignored_m_Group>;
	def malign_jumps_EQ : Joined<["-"], "malign-jumps=">, Group<clang_ignored_m_Group>;
	// Branch-alignment options (m_Group, DriverOption). -malign-branch= uses the
	// CommaJoined kind.
	def malign_branch_EQ : CommaJoined<["-"], "malign-branch=">, Group<m_Group>, Flags<[DriverOption]>,
	HelpText<"Specify types of branches to align">;
	def malign_branch_boundary_EQ : Joined<["-"], "malign-branch-boundary=">, Group<m_Group>, Flags<[DriverOption]>,
	HelpText<"Specify the boundary's size to align branches">;
	def mpad_max_prefix_size_EQ : Joined<["-"], "mpad-max-prefix-size=">, Group<m_Group>, Flags<[DriverOption]>,
	HelpText<"Specify maximum number of prefixes to use for padding">;
	def mbranches_within_32B_boundaries : Flag<["-"], "mbranches-within-32B-boundaries">, Flags<[DriverOption]>, Group<m_Group>,
	HelpText<"Align selected branches (fused, jcc, jmp) within 32-byte boundary">;
	def mfancy_math_387 : Flag<["-"], "mfancy-math-387">, Group<clang_ignored_m_Group>;
	// The matching -mno-long-calls record is declared further down, after the
	// long-double options.
	def mlong_calls : Flag<["-"], "mlong-calls">, Group<m_Group>,
	HelpText<"Generate branches with extended addressability, usually via indirect jumps.">;
	def mdouble_EQ : Joined<["-"], "mdouble=">, Group<m_Group>, Values<"32,64">, Flags<[CC1Option]>,
	HelpText<"Force double to be 32 bits or 64 bits">;
	// Option group nested inside m_Group that holds the three -mlong-double-N flags
	// below; each of them is a CC1Option.
	def LongDouble_Group : OptionGroup<"<LongDouble group>">, Group<m_Group>,
	DocName<"Long double flags">,
	DocBrief<[{Selects the long double implementation}]>;
	def mlong_double_64 : Flag<["-"], "mlong-double-64">, Group<LongDouble_Group>, Flags<[CC1Option]>,
	HelpText<"Force long double to be 64 bits">;
	def mlong_double_80 : Flag<["-"], "mlong-double-80">, Group<LongDouble_Group>, Flags<[CC1Option]>,
	HelpText<"Force long double to be 80 bits, padded to 128 bits for storage">;
	def mlong_double_128 : Flag<["-"], "mlong-double-128">, Group<LongDouble_Group>, Flags<[CC1Option]>,
	HelpText<"Force long double to be 128 bits">;
	// Negative counterpart of -mlong-calls, which is declared before the long-double options.
	def mno_long_calls : Flag<["-"], "mno-long-calls">, Group<m_Group>,
	HelpText<"Restore the default behaviour of not generating long calls">;
	// Execute-only code options (m_arm_Features_Group).
	def mexecute_only : Flag<["-"], "mexecute-only">, Group<m_arm_Features_Group>,
	HelpText<"Disallow generation of data access to code sections (ARM only)">;
	def mno_execute_only : Flag<["-"], "mno-execute-only">, Group<m_arm_Features_Group>,
	HelpText<"Allow generation of data access to code sections (ARM only)">;
	// The record is named mtp_mode_EQ but the option text is "mtp=".
	def mtp_mode_EQ : Joined<["-"], "mtp=">, Group<m_arm_Features_Group>, Values<"soft,cp15,el0,el1,el2,el3">,
	HelpText<"Thread pointer access method (AArch32/AArch64 only)">;
	def mpure_code : Flag<["-"], "mpure-code">, Alias<mexecute_only>; // Alias for GCC compatibility
	def mno_pure_code : Flag<["-"], "mno-pure-code">, Alias<mno_execute_only>;
	// tvOS / watchOS minimum-version options. The device forms are in m_Group; the
	// simulator forms have no Group<> mixin. The "appletv*" and "watchsimulator"
	// spellings are aliases.
	def mtvos_version_min_EQ : Joined<["-"], "mtvos-version-min=">, Group<m_Group>;
	def mappletvos_version_min_EQ : Joined<["-"], "mappletvos-version-min=">, Alias<mtvos_version_min_EQ>;
	def mtvos_simulator_version_min_EQ : Joined<["-"], "mtvos-simulator-version-min=">;
	def mappletvsimulator_version_min_EQ : Joined<["-"], "mappletvsimulator-version-min=">, Alias<mtvos_simulator_version_min_EQ>;
	def mwatchos_version_min_EQ : Joined<["-"], "mwatchos-version-min=">, Group<m_Group>;
	def mwatchos_simulator_version_min_EQ : Joined<["-"], "mwatchos-simulator-version-min=">;
	def mwatchsimulator_version_min_EQ : Joined<["-"], "mwatchsimulator-version-min=">, Alias<mwatchos_simulator_version_min_EQ>;
	// Architecture, assembler-dialect, code-model and TLS-size selection (m_Group).
	// They differ in flags: -march= is a CoreOption, -masm= a DriverOption, -mcmodel= a
	// CC1Option, and -mtls-size= both DriverOption and CC1Option.
	def march_EQ : Joined<["-"], "march=">, Group<m_Group>, Flags<[CoreOption]>;
	def masm_EQ : Joined<["-"], "masm=">, Group<m_Group>, Flags<[DriverOption]>;
	def mcmodel_EQ : Joined<["-"], "mcmodel=">, Group<m_Group>, Flags<[CC1Option]>;
	def mtls_size_EQ : Joined<["-"], "mtls-size=">, Group<m_Group>, Flags<[DriverOption, CC1Option]>,
	HelpText<"Specify bit size of immediate TLS offsets (AArch64 ELF only): "
	"12 (for 4KB) \| 24 (for 16MB, default) \| 32 (for 4GB) \| 48 (for 256TB, needs -mcmodel=large)">;
	def mimplicit_it_EQ : Joined<["-"], "mimplicit-it=">, Group<m_Group>;
	// NOTE(review): these two use the Joined kind although the option text has no
	// trailing '=' - confirm this is intentional.
	def mdefault_build_attributes : Joined<["-"], "mdefault-build-attributes">, Group<m_Group>;
	def mno_default_build_attributes : Joined<["-"], "mno-default-build-attributes">, Group<m_Group>;
	// Miscellaneous -m options. Those in clang_ignored_m_Group have no help text.
	def mconstant_cfstrings : Flag<["-"], "mconstant-cfstrings">, Group<clang_ignored_m_Group>;
	// Five DriverOptions in m_Group, all declared with the Joined kind.
	def mconsole : Joined<["-"], "mconsole">, Group<m_Group>, Flags<[DriverOption]>;
	def mwindows : Joined<["-"], "mwindows">, Group<m_Group>, Flags<[DriverOption]>;
	def mdll : Joined<["-"], "mdll">, Group<m_Group>, Flags<[DriverOption]>;
	def municode : Joined<["-"], "municode">, Group<m_Group>, Flags<[DriverOption]>;
	def mthreads : Joined<["-"], "mthreads">, Group<m_Group>, Flags<[DriverOption]>;
	def mcpu_EQ : Joined<["-"], "mcpu=">, Group<m_Group>;
	def mmcu_EQ : Joined<["-"], "mmcu=">, Group<m_Group>;
	def mdynamic_no_pic : Joined<["-"], "mdynamic-no-pic">, Group<m_Group>;
	def mfix_and_continue : Flag<["-"], "mfix-and-continue">, Group<clang_ignored_m_Group>;
	def mieee_fp : Flag<["-"], "mieee-fp">, Group<clang_ignored_m_Group>;
	def minline_all_stringops : Flag<["-"], "minline-all-stringops">, Group<clang_ignored_m_Group>;
	def mno_inline_all_stringops : Flag<["-"], "mno-inline-all-stringops">, Group<clang_ignored_m_Group>;
	def malign_double : Flag<["-"], "malign-double">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Align doubles to two words in structs (x86 only)">;
	// Floating-point and hardware divide/multiply selection; only -mfloat-abi= lists
	// its accepted values.
	def mfloat_abi_EQ : Joined<["-"], "mfloat-abi=">, Group<m_Group>, Values<"soft,softfp,hard">;
	def mfpmath_EQ : Joined<["-"], "mfpmath=">, Group<m_Group>;
	def mfpu_EQ : Joined<["-"], "mfpu=">, Group<m_Group>;
	def mhwdiv_EQ : Joined<["-"], "mhwdiv=">, Group<m_Group>;
	def mhwmult_EQ : Joined<["-"], "mhwmult=">, Group<m_Group>;
	// The matching -mno-global-merge record is declared further down with the other
	// -mno-* flags.
	def mglobal_merge : Flag<["-"], "mglobal-merge">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Enable merging of globals">;
	def mhard_float : Flag<["-"], "mhard-float">, Group<m_Group>;
	// iOS deployment-target options: -mios-version-min= is an alias of the "iphoneos"
	// spelling and carries the help text; the simulator forms have no Group<> mixin.
	def miphoneos_version_min_EQ : Joined<["-"], "miphoneos-version-min=">, Group<m_Group>;
	def mios_version_min_EQ : Joined<["-"], "mios-version-min=">,
	Alias<miphoneos_version_min_EQ>, HelpText<"Set iOS deployment target">;
	def mios_simulator_version_min_EQ : Joined<["-"], "mios-simulator-version-min=">;
	def miphonesimulator_version_min_EQ : Joined<["-"], "miphonesimulator-version-min=">, Alias<mios_simulator_version_min_EQ>;
	def mkernel : Flag<["-"], "mkernel">, Group<m_Group>;
	def mlinker_version_EQ : Joined<["-"], "mlinker-version=">,
	Flags<[DriverOption]>;
	// Separate kind: the argument is the following command-line token.
	def mllvm : Separate<["-"], "mllvm">, Flags<[CC1Option,CC1AsOption,CoreOption]>,
	HelpText<"Additional arguments to forward to LLVM's option processing">;
	// macOS deployment target: the "macosx" spelling carries the help text and the
	// "macos" spelling is an alias of it.
	def mmacosx_version_min_EQ : Joined<["-"], "mmacosx-version-min=">,
	Group<m_Group>, HelpText<"Set Mac OS X deployment target">;
	def mmacos_version_min_EQ : Joined<["-"], "mmacos-version-min=">,
	Group<m_Group>, Alias<mmacosx_version_min_EQ>;
	// The matching -mno-ms-bitfields record follows the outline pair below.
	def mms_bitfields : Flag<["-"], "mms-bitfields">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Set the default structure layout to be compatible with the Microsoft compiler standard">;
	// Note: the outline pair is placed in f_clang_Group even though the options are
	// spelled -m...
	def moutline : Flag<["-"], "moutline">, Group<f_clang_Group>, Flags<[CC1Option]>,
	HelpText<"Enable function outlining (AArch64 only)">;
	def mno_outline : Flag<["-"], "mno-outline">, Group<f_clang_Group>, Flags<[CC1Option]>,
	HelpText<"Disable function outlining (AArch64 only)">;
	def mno_ms_bitfields : Flag<["-"], "mno-ms-bitfields">, Group<m_Group>,
	HelpText<"Do not set the default structure layout to be compatible with the Microsoft compiler standard">;
	// Stack realignment, alignment and probing.
	def mstackrealign : Flag<["-"], "mstackrealign">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Force realign the stack at entry to every function">;
	def mstack_alignment : Joined<["-"], "mstack-alignment=">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Set the stack alignment">;
	def mstack_probe_size : Joined<["-"], "mstack-probe-size=">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Set the stack probe size">;
	// Only the negative stack-arg-probe form is a CC1Option.
	def mstack_arg_probe : Flag<["-"], "mstack-arg-probe">, Group<m_Group>,
	HelpText<"Enable stack probes">;
	def mno_stack_arg_probe : Flag<["-"], "mno-stack-arg-probe">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Disable stack probes which are enabled by default">;
	// Separate kind: the value is the following command-line token.
	def mthread_model : Separate<["-"], "mthread-model">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"The thread model to use, e.g. posix, single (posix by default)">, Values<"posix,single">;
	def meabi : Separate<["-"], "meabi">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Set EABI type, e.g. 4, 5 or gnu (default depends on triple)">, Values<"default,4,5,gnu">;

	// Negative -mno-* flags, all in m_Group except the -mno-pascal-strings alias.
	def mno_constant_cfstrings : Flag<["-"], "mno-constant-cfstrings">, Group<m_Group>;
	def mno_global_merge : Flag<["-"], "mno-global-merge">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Disable merging of globals">;
	// Alias of fno_pascal_strings, which is defined outside this chunk.
	def mno_pascal_strings : Flag<["-"], "mno-pascal-strings">,
	Alias<fno_pascal_strings>;
	def mno_red_zone : Flag<["-"], "mno-red-zone">, Group<m_Group>;
	def mno_tls_direct_seg_refs : Flag<["-"], "mno-tls-direct-seg-refs">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Disable direct TLS access through segment registers">;
	def mno_relax_all : Flag<["-"], "mno-relax-all">, Group<m_Group>;
	def mno_rtd: Flag<["-"], "mno-rtd">, Group<m_Group>;
	def mno_soft_float : Flag<["-"], "mno-soft-float">, Group<m_Group>;
	def mno_stackrealign : Flag<["-"], "mno-stackrealign">, Group<m_Group>;

	// Speculative-execution hardening options (m_Group, CoreOption). All are
	// DriverOptions except the speculative-load-hardening pair: its positive form is a
	// CC1Option and its negative form has only CoreOption.
	def mretpoline : Flag<["-"], "mretpoline">, Group<m_Group>, Flags<[CoreOption,DriverOption]>;
	def mno_retpoline : Flag<["-"], "mno-retpoline">, Group<m_Group>, Flags<[CoreOption,DriverOption]>;
	def mspeculative_load_hardening : Flag<["-"], "mspeculative-load-hardening">,
	Group<m_Group>, Flags<[CoreOption,CC1Option]>;
	def mno_speculative_load_hardening : Flag<["-"], "mno-speculative-load-hardening">,
	Group<m_Group>, Flags<[CoreOption]>;
	def mlvi_hardening : Flag<["-"], "mlvi-hardening">, Group<m_Group>, Flags<[CoreOption,DriverOption]>,
	HelpText<"Enable all mitigations for Load Value Injection (LVI)">;
	def mno_lvi_hardening : Flag<["-"], "mno-lvi-hardening">, Group<m_Group>, Flags<[CoreOption,DriverOption]>,
	HelpText<"Disable mitigations for Load Value Injection (LVI)">;
	def mlvi_cfi : Flag<["-"], "mlvi-cfi">, Group<m_Group>, Flags<[CoreOption,DriverOption]>,
	HelpText<"Enable only control-flow mitigations for Load Value Injection (LVI)">;
	def mno_lvi_cfi : Flag<["-"], "mno-lvi-cfi">, Group<m_Group>, Flags<[CoreOption,DriverOption]>,
	HelpText<"Disable control-flow mitigations for Load Value Injection (LVI)">;
	// The positive record is named m_seses (with an underscore) while the negative one
	// is mno_seses.
	def m_seses : Flag<["-"], "mseses">, Group<m_Group>, Flags<[CoreOption, DriverOption]>,
	HelpText<"Enable speculative execution side effect suppression (SESES). "
	"Includes LVI control flow integrity mitigations">;
	def mno_seses : Flag<["-"], "mno-seses">, Group<m_Group>, Flags<[CoreOption, DriverOption]>,
	HelpText<"Disable speculative execution side effect suppression (SESES)">;

	// RISC-V options (mostly m_riscv_Features_Group).
	def mrelax : Flag<["-"], "mrelax">, Group<m_riscv_Features_Group>,
	HelpText<"Enable linker relaxation">;
	def mno_relax : Flag<["-"], "mno-relax">, Group<m_riscv_Features_Group>,
	HelpText<"Disable linker relaxation">;
	// Alias of the G option, which is defined outside this chunk.
	def msmall_data_limit_EQ : Joined<["-"], "msmall-data-limit=">, Group<m_Group>,
	Alias<G>,
	HelpText<"Put global and static data smaller than the limit into a special section">;
	def msave_restore : Flag<["-"], "msave-restore">, Group<m_riscv_Features_Group>,
	HelpText<"Enable using library calls for save and restore">;
	def mno_save_restore : Flag<["-"], "mno-save-restore">, Group<m_riscv_Features_Group>,
	HelpText<"Disable using library calls for save and restore">;
	// Fixed spellings declared as Flag records that alias -mcmodel= with a preset
	// argument: medlow supplies "small" and medany supplies "medium".
	def mcmodel_EQ_medlow : Flag<["-"], "mcmodel=medlow">, Group<m_riscv_Features_Group>,
	Flags<[CC1Option]>, Alias<mcmodel_EQ>, AliasArgs<["small"]>,
	HelpText<"Equivalent to -mcmodel=small, compatible with RISC-V gcc.">;
	def mcmodel_EQ_medany : Flag<["-"], "mcmodel=medany">, Group<m_riscv_Features_Group>,
	Flags<[CC1Option]>, Alias<mcmodel_EQ>, AliasArgs<["medium"]>,
	HelpText<"Equivalent to -mcmodel=medium, compatible with RISC-V gcc.">;
	def menable_experimental_extensions : Flag<["-"], "menable-experimental-extensions">, Group<m_Group>,
	HelpText<"Enable use of experimental RISC-V extensions.">;

	// ARM options (mostly m_arm_Features_Group).
	def munaligned_access : Flag<["-"], "munaligned-access">, Group<m_arm_Features_Group>,
	HelpText<"Allow memory accesses to be unaligned (AArch32/AArch64 only)">;
	def mno_unaligned_access : Flag<["-"], "mno-unaligned-access">, Group<m_arm_Features_Group>,
	HelpText<"Force all memory accesses to be aligned (AArch32/AArch64 only)">;
	// Alias of -mno-unaligned-access; tagged CC1Option and HelpHidden.
	def mstrict_align : Flag<["-"], "mstrict-align">, Alias<mno_unaligned_access>, Flags<[CC1Option,HelpHidden]>,
	HelpText<"Force all memory accesses to be aligned (same as mno-unaligned-access)">;
	def mno_thumb : Flag<["-"], "mno-thumb">, Group<m_arm_Features_Group>;
	def mrestrict_it: Flag<["-"], "mrestrict-it">, Group<m_arm_Features_Group>,
	HelpText<"Disallow generation of deprecated IT blocks for ARMv8. It is on by default for ARMv8 Thumb mode.">;
	def mno_restrict_it: Flag<["-"], "mno-restrict-it">, Group<m_arm_Features_Group>,
	HelpText<"Allow generation of deprecated IT blocks for ARMv8. It is off by default for ARMv8 Thumb mode">;
	// -marm is an alias of -mno-thumb.
	def marm : Flag<["-"], "marm">, Alias<mno_thumb>;
	def ffixed_r9 : Flag<["-"], "ffixed-r9">, Group<m_arm_Features_Group>,
	HelpText<"Reserve the r9 register (ARM only)">;
	def mno_movt : Flag<["-"], "mno-movt">, Group<m_arm_Features_Group>,
	HelpText<"Disallow use of movt/movw pairs (ARM only)">;
	// Note: -mcrc is in m_Group (its help text mentions ARM and Mips), while -mnocrc is
	// in m_arm_Features_Group.
	def mcrc : Flag<["-"], "mcrc">, Group<m_Group>,
	HelpText<"Allow use of CRC instructions (ARM/Mips only)">;
	def mnocrc : Flag<["-"], "mnocrc">, Group<m_arm_Features_Group>,
	HelpText<"Disallow use of CRC instructions (ARM only)">;
	def mno_neg_immediates: Flag<["-"], "mno-neg-immediates">, Group<m_arm_Features_Group>,
	HelpText<"Disallow converting instructions with negative immediates to their negation or inversion.">;
	def mcmse : Flag<["-"], "mcmse">, Group<m_arm_Features_Group>,
	Flags<[DriverOption,CC1Option]>,
	HelpText<"Allow use of CMSE (Armv8-M Security Extensions)">;
	// The record name (ForceAAPCSBitfieldLoad) differs from the option text
	// (-fAAPCSBitfieldLoad).
	def ForceAAPCSBitfieldLoad : Flag<["-"], "fAAPCSBitfieldLoad">, Group<m_arm_Features_Group>,
	Flags<[DriverOption,CC1Option]>,
	HelpText<"Follows the AAPCS standard that all volatile bit-field write generates at least one load. (ARM only).">;

	// AArch64 options (m_aarch64_Features_Group).
	def mgeneral_regs_only : Flag<["-"], "mgeneral-regs-only">, Group<m_aarch64_Features_Group>,
	HelpText<"Generate code which only uses the general purpose registers (AArch64 only)">;
	// Enable / disable pair for the Cortex-A53 erratum 835769 workaround.
	def mfix_cortex_a53_835769 : Flag<["-"], "mfix-cortex-a53-835769">,
	Group<m_aarch64_Features_Group>,
	HelpText<"Workaround Cortex-A53 erratum 835769 (AArch64 only)">;
	def mno_fix_cortex_a53_835769 : Flag<["-"], "mno-fix-cortex-a53-835769">,
	Group<m_aarch64_Features_Group>,
	HelpText<"Don't workaround Cortex-A53 erratum 835769 (AArch64 only)">;
	// Generates one record per i in 1..31: ffixed_x1 .. ffixed_x31, i.e. the flags
	// -ffixed-x1 .. -ffixed-x31. They are in m_Group because the help text names both
	// AArch64 and RISC-V.
	foreach i = {1-31} in
	def ffixed_x#i : Flag<["-"], "ffixed-x"#i>, Group<m_Group>,
	HelpText<"Reserve the x"#i#" register (AArch64/RISC-V only)">;

	// Generates fcall_saved_x8 .. fcall_saved_x15 and fcall_saved_x18, i.e. the flags
	// -fcall-saved-x<i> for i in 8..15 and 18.
	foreach i = {8-15,18} in
	def fcall_saved_x#i : Flag<["-"], "fcall-saved-x"#i>, Group<m_aarch64_Features_Group>,
	HelpText<"Make the x"#i#" register call-saved (AArch64 only)">;

	// Return-address signing scope: a CC1Option in m_Group restricted to none, all or
	// non-leaf.
	def msign_return_address_EQ : Joined<["-"], "msign-return-address=">,
	Flags<[CC1Option]>, Group<m_Group>, Values<"none,all,non-leaf">,
	HelpText<"Select return address signing scope">;
	// Declared without Group<> or Flags<> mixins.
	def mbranch_protection_EQ : Joined<["-"], "mbranch-protection=">,
	HelpText<"Enforce targets of indirect branches and function returns">;

	// Straight-line speculation hardening scope; declared without Group<>, Flags<> or
	// Values<> mixins.
	def mharden_sls_EQ : Joined<["-"], "mharden-sls=">,
	HelpText<"Select straight-line speculation hardening scope">;

	// WebAssembly feature toggles (m_wasm_Features_Group), declared as explicit
	// -m<feature> / -mno-<feature> pairs without help text.
	def msimd128 : Flag<["-"], "msimd128">, Group<m_wasm_Features_Group>;
	def munimplemented_simd128 : Flag<["-"], "munimplemented-simd128">, Group<m_wasm_Features_Group>;
	def mno_unimplemented_simd128 : Flag<["-"], "mno-unimplemented-simd128">, Group<m_wasm_Features_Group>;
	def mno_simd128 : Flag<["-"], "mno-simd128">, Group<m_wasm_Features_Group>;
	def mnontrapping_fptoint : Flag<["-"], "mnontrapping-fptoint">, Group<m_wasm_Features_Group>;
	def mno_nontrapping_fptoint : Flag<["-"], "mno-nontrapping-fptoint">, Group<m_wasm_Features_Group>;
	def msign_ext : Flag<["-"], "msign-ext">, Group<m_wasm_Features_Group>;
	def mno_sign_ext : Flag<["-"], "mno-sign-ext">, Group<m_wasm_Features_Group>;
	// NOTE(review): the record names below are spelled "handing" while the option text
	// is "handling". Renaming would affect every reference to these records, so the
	// spelling is left as is.
	def mexception_handing : Flag<["-"], "mexception-handling">, Group<m_wasm_Features_Group>;
	def mno_exception_handing : Flag<["-"], "mno-exception-handling">, Group<m_wasm_Features_Group>;
	def matomics : Flag<["-"], "matomics">, Group<m_wasm_Features_Group>;
	def mno_atomics : Flag<["-"], "mno-atomics">, Group<m_wasm_Features_Group>;
	def mbulk_memory : Flag<["-"], "mbulk-memory">, Group<m_wasm_Features_Group>;
	def mno_bulk_memory : Flag<["-"], "mno-bulk-memory">, Group<m_wasm_Features_Group>;
	def mmutable_globals : Flag<["-"], "mmutable-globals">, Group<m_wasm_Features_Group>;
	def mno_mutable_globals : Flag<["-"], "mno-mutable-globals">, Group<m_wasm_Features_Group>;
	def mmultivalue : Flag<["-"], "mmultivalue">, Group<m_wasm_Features_Group>;
	def mno_multivalue : Flag<["-"], "mno-multivalue">, Group<m_wasm_Features_Group>;
	def mtail_call : Flag<["-"], "mtail-call">, Group<m_wasm_Features_Group>;
	def mno_tail_call : Flag<["-"], "mno-tail-call">, Group<m_wasm_Features_Group>;
	def mreference_types : Flag<["-"], "mreference-types">, Group<m_wasm_Features_Group>;
	def mno_reference_types : Flag<["-"], "mno-reference-types">, Group<m_wasm_Features_Group>;
	// Placed in the separate m_wasm_Features_Driver_Group; accepts "command" or "reactor".
	def mexec_model_EQ : Joined<["-"], "mexec-model=">, Group<m_wasm_Features_Driver_Group>,
	Values<"command,reactor">,
	HelpText<"Execution model (WebAssembly only)">;

	// -mamdgpu-debugger-abi=<version>: an m_Group option tagged HelpHidden. The
	// meta-variable name matches the <version> placeholder used in the help text.
	def mamdgpu_debugger_abi
	  : Joined<["-"], "mamdgpu-debugger-abi=">,
	    Group<m_Group>,
	    MetaVarName<"<version>">,
	    Flags<[HelpHidden]>,
	    HelpText<"Generate additional code for specified <version> of debugger ABI (AMDGPU only)">;

	// AMDGPU feature toggles (m_amdgpu_Features_Group), declared as enable/disable pairs.
	def mcode_object_v3 : Flag<["-"], "mcode-object-v3">, Group<m_amdgpu_Features_Group>,
	HelpText<"Enable code object v3 (AMDGPU only)">;
	def mno_code_object_v3 : Flag<["-"], "mno-code-object-v3">, Group<m_amdgpu_Features_Group>,
	HelpText<"Disable code object v3 (AMDGPU only)">;
	def mxnack : Flag<["-"], "mxnack">, Group<m_amdgpu_Features_Group>,
	HelpText<"Enable XNACK (AMDGPU only)">;
	def mno_xnack : Flag<["-"], "mno-xnack">, Group<m_amdgpu_Features_Group>,
	HelpText<"Disable XNACK (AMDGPU only)">;
	def msram_ecc : Flag<["-"], "msram-ecc">, Group<m_amdgpu_Features_Group>,
	HelpText<"Enable SRAM ECC (AMDGPU only)">;
	def mno_sram_ecc : Flag<["-"], "mno-sram-ecc">, Group<m_amdgpu_Features_Group>,
	HelpText<"Disable SRAM ECC (AMDGPU only)">;

	// Wavefront execution mode pair (m_amdgpu_Features_Group): per the help text the
	// positive form selects CU mode and the negative form selects WGP mode.
	def mcumode
	  : Flag<["-"], "mcumode">,
	    Group<m_amdgpu_Features_Group>,
	    HelpText<"CU wavefront execution mode is used (AMDGPU only)">;
	def mno_cumode
	  : Flag<["-"], "mno-cumode">,
	    Group<m_amdgpu_Features_Group>,
	    HelpText<"WGP wavefront execution mode is used (AMDGPU only)">;

	// Wavefront size pair: per the help text the positive form selects size 64 and the
	// negative form size 32. Both are in m_Group, not m_amdgpu_Features_Group.
	def mwavefrontsize64
	  : Flag<["-"], "mwavefrontsize64">,
	    Group<m_Group>,
	    HelpText<"Wavefront size 64 is used">;
	def mno_wavefrontsize64
	  : Flag<["-"], "mno-wavefrontsize64">,
	    Group<m_Group>,
	    HelpText<"Wavefront size 32 is used">;

	// -faltivec / -fno-altivec: driver-only (DriverOption) members of f_Group.
	def faltivec : Flag<["-"], "faltivec">,
	  Group<f_Group>, Flags<[DriverOption]>;
	def fno_altivec : Flag<["-"], "fno-altivec">,
	  Group<f_Group>, Flags<[DriverOption]>;

	// Flags in m_ppc_Features_Group.
	def maltivec : Flag<["-"], "maltivec">, Group<m_ppc_Features_Group>;
	def mno_altivec : Flag<["-"], "mno-altivec">, Group<m_ppc_Features_Group>;
	def mpcrel : Flag<["-"], "mpcrel">, Group<m_ppc_Features_Group>;
	def mno_pcrel : Flag<["-"], "mno-pcrel">, Group<m_ppc_Features_Group>;
	def mspe : Flag<["-"], "mspe">, Group<m_ppc_Features_Group>;
	def mno_spe : Flag<["-"], "mno-spe">, Group<m_ppc_Features_Group>;
	def mvsx : Flag<["-"], "mvsx">, Group<m_ppc_Features_Group>;
	def mno_vsx : Flag<["-"], "mno-vsx">, Group<m_ppc_Features_Group>;
	def msecure_plt : Flag<["-"], "msecure-plt">, Group<m_ppc_Features_Group>;
	def mpower8_vector : Flag<["-"], "mpower8-vector">,
	  Group<m_ppc_Features_Group>;
	def mno_power8_vector : Flag<["-"], "mno-power8-vector">,
	  Group<m_ppc_Features_Group>;
	def mpower9_vector : Flag<["-"], "mpower9-vector">,
	  Group<m_ppc_Features_Group>;
	def mno_power9_vector : Flag<["-"], "mno-power9-vector">,
	  Group<m_ppc_Features_Group>;
	def mpower10_vector : Flag<["-"], "mpower10-vector">,
	  Group<m_ppc_Features_Group>;
	def mno_power10_vector : Flag<["-"], "mno-power10-vector">,
	  Group<m_ppc_Features_Group>;
	// The record names differ from the spellings here: the command-line forms
	// are -mcrypto / -mno-crypto.
	def mpower8_crypto : Flag<["-"], "mcrypto">,
	  Group<m_ppc_Features_Group>;
	def mnopower8_crypto : Flag<["-"], "mno-crypto">,
	  Group<m_ppc_Features_Group>;
	def mdirect_move : Flag<["-"], "mdirect-move">,
	  Group<m_ppc_Features_Group>;
	def mnodirect_move : Flag<["-"], "mno-direct-move">,
	  Group<m_ppc_Features_Group>;
	def mhtm : Flag<["-"], "mhtm">, Group<m_ppc_Features_Group>;
	def mno_htm : Flag<["-"], "mno-htm">, Group<m_ppc_Features_Group>;
	def mfprnd : Flag<["-"], "mfprnd">, Group<m_ppc_Features_Group>;
	def mno_fprnd : Flag<["-"], "mno-fprnd">, Group<m_ppc_Features_Group>;
	def mcmpb : Flag<["-"], "mcmpb">, Group<m_ppc_Features_Group>;
	def mno_cmpb : Flag<["-"], "mno-cmpb">, Group<m_ppc_Features_Group>;
	def misel : Flag<["-"], "misel">, Group<m_ppc_Features_Group>;
	def mno_isel : Flag<["-"], "mno-isel">, Group<m_ppc_Features_Group>;
	// -mmfcrf / -mno-mfcrf are aliases of the -mmfocrf / -mno-mfocrf records.
	def mmfocrf : Flag<["-"], "mmfocrf">, Group<m_ppc_Features_Group>;
	def mmfcrf : Flag<["-"], "mmfcrf">, Alias<mmfocrf>;
	def mno_mfocrf : Flag<["-"], "mno-mfocrf">, Group<m_ppc_Features_Group>;
	def mno_mfcrf : Flag<["-"], "mno-mfcrf">, Alias<mno_mfocrf>;
	def mpopcntd : Flag<["-"], "mpopcntd">, Group<m_ppc_Features_Group>;
	def mno_popcntd : Flag<["-"], "mno-popcntd">, Group<m_ppc_Features_Group>;
	def mqpx : Flag<["-"], "mqpx">, Group<m_ppc_Features_Group>;
	def mno_qpx : Flag<["-"], "mno-qpx">, Group<m_ppc_Features_Group>;
	def mcrbits : Flag<["-"], "mcrbits">, Group<m_ppc_Features_Group>;
	def mno_crbits : Flag<["-"], "mno-crbits">, Group<m_ppc_Features_Group>;
	def minvariant_function_descriptors :
	  Flag<["-"], "minvariant-function-descriptors">,
	  Group<m_ppc_Features_Group>;
	def mno_invariant_function_descriptors :
	  Flag<["-"], "mno-invariant-function-descriptors">,
	  Group<m_ppc_Features_Group>;
	def mfloat128 : Flag<["-"], "mfloat128">,
	  Group<m_ppc_Features_Group>;
	def mno_float128 : Flag<["-"], "mno-float128">,
	  Group<m_ppc_Features_Group>;
	def mlongcall : Flag<["-"], "mlongcall">,
	  Group<m_ppc_Features_Group>;
	def mno_longcall : Flag<["-"], "mno-longcall">,
	  Group<m_ppc_Features_Group>;

	// Struct-return convention selectors; both are in m_Group and are also
	// cc1 options.
	def maix_struct_return : Flag<["-"], "maix-struct-return">,
	  Flags<[CC1Option]>, Group<m_Group>,
	  HelpText<"Return all structs in memory (PPC32 only)">;
	def msvr4_struct_return : Flag<["-"], "msvr4-struct-return">,
	  Flags<[CC1Option]>, Group<m_Group>,
	  HelpText<"Return small structs in registers (PPC32 only)">;

	def mvx : Flag<["-"], "mvx">,
	  Group<m_Group>;
	def mno_vx : Flag<["-"], "mno-vx">,
	  Group<m_Group>;

	// NOTE(review): the OptInFFlag multiclass is defined outside this excerpt;
	// it presumably produces the fzvector / fno_zvector records that the two
	// -m aliases below point at - confirm against its definition.
	defm zvector : OptInFFlag<"zvector",
	  "Enable System z vector language extension">;
	def mzvector : Flag<["-"], "mzvector">,
	  Alias<fzvector>;
	def mno_zvector : Flag<["-"], "mno-zvector">,
	  Alias<fno_zvector>;

	// Backchain on/off pair; both are driver and cc1 options.
	def mbackchain : Flag<["-"], "mbackchain">,
	  Flags<[DriverOption,CC1Option]>, Group<m_Group>,
	  HelpText<"Link stack frames through backchain on System Z">;
	def mno_backchain : Flag<["-"], "mno-backchain">,
	  Flags<[DriverOption,CC1Option]>, Group<m_Group>;

	// Miscellaneous -m options; unless stated otherwise they belong to m_Group.
	def mno_warn_nonportable_cfstrings :
	  Flag<["-"], "mno-warn-nonportable-cfstrings">, Group<m_Group>;
	def mno_omit_leaf_frame_pointer :
	  Flag<["-"], "mno-omit-leaf-frame-pointer">, Group<m_Group>;
	def momit_leaf_frame_pointer : Flag<["-"], "momit-leaf-frame-pointer">,
	  Group<m_Group>,
	  HelpText<"Omit frame pointer setup for leaf functions">;
	def moslib_EQ : Joined<["-"], "moslib=">, Group<m_Group>;
	// Alias only; forwards to fpascal_strings.
	def mpascal_strings : Flag<["-"], "mpascal-strings">,
	  Alias<fpascal_strings>;
	def mred_zone : Flag<["-"], "mred-zone">, Group<m_Group>;
	def mtls_direct_seg_refs : Flag<["-"], "mtls-direct-seg-refs">,
	  Group<m_Group>,
	  HelpText<"Enable direct TLS access through segment registers (default)">;
	def mregparm_EQ : Joined<["-"], "mregparm=">, Group<m_Group>;

	// Integrated-assembler related options.
	def mrelax_all : Flag<["-"], "mrelax-all">,
	  Group<m_Group>, Flags<[CC1Option,CC1AsOption]>,
	  HelpText<"(integrated-as) Relax all machine instructions">;
	def mincremental_linker_compatible :
	  Flag<["-"], "mincremental-linker-compatible">,
	  Group<m_Group>, Flags<[CC1Option,CC1AsOption]>,
	  HelpText<"(integrated-as) Emit an object file which can be used with an incremental linker">;
	def mno_incremental_linker_compatible :
	  Flag<["-"], "mno-incremental-linker-compatible">,
	  Group<m_Group>,
	  HelpText<"(integrated-as) Emit an object file which cannot be used with an incremental linker">;

	def mrtd : Flag<["-"], "mrtd">,
	  Group<m_Group>, Flags<[CC1Option]>,
	  HelpText<"Make StdCall calling convention the default">;
	// Alias of the G option record.
	def msmall_data_threshold_EQ : Joined<["-"], "msmall-data-threshold=">,
	  Group<m_Group>, Alias<G>;

	// Floating-point code generation controls.
	def msoft_float : Flag<["-"], "msoft-float">,
	  Group<m_Group>, Flags<[CC1Option]>,
	  HelpText<"Use software floating point">;
	def mno_implicit_float : Flag<["-"], "mno-implicit-float">,
	  Group<m_Group>,
	  HelpText<"Don't generate implicit floating point instructions">;
	def mimplicit_float : Flag<["-"], "mimplicit-float">, Group<m_Group>;
	def mrecip : Flag<["-"], "mrecip">, Group<m_Group>;
	def mrecip_EQ : CommaJoined<["-"], "mrecip=">,
	  Group<m_Group>, Flags<[CC1Option]>;
	def mprefer_vector_width_EQ : Joined<["-"], "mprefer-vector-width=">,
	  Group<m_Group>, Flags<[CC1Option]>,
	  HelpText<"Specifies preferred vector width for auto-vectorization. Defaults to 'none' which allows target specific decisions.">;

	def mpie_copy_relocations : Flag<["-"], "mpie-copy-relocations">,
	  Group<m_Group>, Flags<[CC1Option]>,
	  HelpText<"Use copy relocations support for PIE builds">;
	def mno_pie_copy_relocations : Flag<["-"], "mno-pie-copy-relocations">,
	  Group<m_Group>;

	// Function-entry instrumentation and stack layout; all are cc1 options.
	def mfentry : Flag<["-"], "mfentry">,
	  Group<m_Group>, Flags<[CC1Option]>,
	  HelpText<"Insert calls to fentry at function entry (x86/SystemZ only)">;
	def mnop_mcount : Flag<["-"], "mnop-mcount">,
	  Group<m_Group>, Flags<[CC1Option]>,
	  HelpText<"Generate mcount/__fentry__ calls as nops. To activate they need to be patched in.">;
	def mrecord_mcount : Flag<["-"], "mrecord-mcount">,
	  Group<m_Group>, Flags<[CC1Option]>,
	  HelpText<"Generate a __mcount_loc section entry for each __fentry__ call.">;
	def mpacked_stack : Flag<["-"], "mpacked-stack">,
	  Group<m_Group>, Flags<[CC1Option]>,
	  HelpText<"Use packed stack layout (SystemZ only).">;
	def mno_packed_stack : Flag<["-"], "mno-packed-stack">,
	  Group<m_Group>, Flags<[CC1Option]>;
	// Flags in m_mips_Features_Group (exceptions are called out below).
	def mips16 : Flag<["-"], "mips16">, Group<m_mips_Features_Group>;
	def mno_mips16 : Flag<["-"], "mno-mips16">, Group<m_mips_Features_Group>;
	def mmicromips : Flag<["-"], "mmicromips">, Group<m_mips_Features_Group>;
	def mno_micromips : Flag<["-"], "mno-micromips">,
	  Group<m_mips_Features_Group>;
	def mxgot : Flag<["-"], "mxgot">, Group<m_mips_Features_Group>;
	def mno_xgot : Flag<["-"], "mno-xgot">, Group<m_mips_Features_Group>;
	def mldc1_sdc1 : Flag<["-"], "mldc1-sdc1">, Group<m_mips_Features_Group>;
	def mno_ldc1_sdc1 : Flag<["-"], "mno-ldc1-sdc1">,
	  Group<m_mips_Features_Group>;
	def mcheck_zero_division : Flag<["-"], "mcheck-zero-division">,
	  Group<m_mips_Features_Group>;
	def mno_check_zero_division : Flag<["-"], "mno-check-zero-division">,
	  Group<m_mips_Features_Group>;
	def mcompact_branches_EQ : Joined<["-"], "mcompact-branches=">,
	  Group<m_mips_Features_Group>;
	// The branch-likely pair is in m_Group and carries IgnoredGCCCompat.
	def mbranch_likely : Flag<["-"], "mbranch-likely">,
	  Group<m_Group>, IgnoredGCCCompat;
	def mno_branch_likely : Flag<["-"], "mno-branch-likely">,
	  Group<m_Group>, IgnoredGCCCompat;
	def mindirect_jump_EQ : Joined<["-"], "mindirect-jump=">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Change indirect jump instructions to inhibit speculation">;
	def mdsp : Flag<["-"], "mdsp">, Group<m_mips_Features_Group>;
	def mno_dsp : Flag<["-"], "mno-dsp">, Group<m_mips_Features_Group>;
	def mdspr2 : Flag<["-"], "mdspr2">, Group<m_mips_Features_Group>;
	def mno_dspr2 : Flag<["-"], "mno-dspr2">, Group<m_mips_Features_Group>;
	def msingle_float : Flag<["-"], "msingle-float">,
	  Group<m_mips_Features_Group>;
	def mdouble_float : Flag<["-"], "mdouble-float">,
	  Group<m_mips_Features_Group>;
	def mmadd4 : Flag<["-"], "mmadd4">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Enable the generation of 4-operand madd.s, madd.d and related instructions.">;
	def mno_madd4 : Flag<["-"], "mno-madd4">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Disable the generation of 4-operand madd.s, madd.d and related instructions.">;
	def mmsa : Flag<["-"], "mmsa">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Enable MSA ASE (MIPS only)">;
	def mno_msa : Flag<["-"], "mno-msa">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Disable MSA ASE (MIPS only)">;
	def mmt : Flag<["-"], "mmt">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Enable MT ASE (MIPS only)">;
	def mno_mt : Flag<["-"], "mno-mt">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Disable MT ASE (MIPS only)">;
	def mfp64 : Flag<["-"], "mfp64">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Use 64-bit floating point registers (MIPS only)">;
	def mfp32 : Flag<["-"], "mfp32">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Use 32-bit floating point registers (MIPS only)">;

	// Small-data (-G) related options. The help strings below are built from
	// adjacent string literals.
	def mgpopt : Flag<["-"], "mgpopt">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Use GP relative accesses for symbols known to be in a small"
	           " data section (MIPS)">;
	def mno_gpopt : Flag<["-"], "mno-gpopt">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Do not use GP relative accesses for symbols known to be in a small"
	           " data section (MIPS)">;
	def mlocal_sdata : Flag<["-"], "mlocal-sdata">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Extend the -G behaviour to object local data (MIPS)">;
	def mno_local_sdata : Flag<["-"], "mno-local-sdata">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Do not extend the -G behaviour to object local data (MIPS)">;
	def mextern_sdata : Flag<["-"], "mextern-sdata">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Assume that externally defined data is in the small data if it"
	           " meets the -G <size> threshold (MIPS)">;
	def mno_extern_sdata : Flag<["-"], "mno-extern-sdata">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Do not assume that externally defined data is in the small data if"
	           " it meets the -G <size> threshold (MIPS)">;
	def membedded_data : Flag<["-"], "membedded-data">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Place constants in the .rodata section instead of the .sdata "
	           "section even if they meet the -G <size> threshold (MIPS)">;
	def mno_embedded_data : Flag<["-"], "mno-embedded-data">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Do not place constants in the .rodata section instead of the "
	           ".sdata if they meet the -G <size> threshold (MIPS)">;

	def mnan_EQ : Joined<["-"], "mnan=">, Group<m_mips_Features_Group>;
	def mabs_EQ : Joined<["-"], "mabs=">, Group<m_mips_Features_Group>;
	def mabicalls : Flag<["-"], "mabicalls">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Enable SVR4-style position-independent code (Mips only)">;
	def mno_abicalls : Flag<["-"], "mno-abicalls">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Disable SVR4-style position-independent code (Mips only)">;
	def mno_crc : Flag<["-"], "mno-crc">,
	  Group<m_mips_Features_Group>,
	  HelpText<"Disallow use of CRC instructions (Mips only)">;
	def mvirt : Flag<["-"], "mvirt">, Group<m_mips_Features_Group>;
	def mno_virt : Flag<["-"], "mno-virt">, Group<m_mips_Features_Group>;
	def mginv : Flag<["-"], "mginv">, Group<m_mips_Features_Group>;
	def mno_ginv : Flag<["-"], "mno-ginv">, Group<m_mips_Features_Group>;
	// -mips<rev> shorthands. Each is a hidden alias of march_EQ with the
	// revision name passed as the alias argument.
	def mips1 : Flag<["-"], "mips1">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips1"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips1">;
	def mips2 : Flag<["-"], "mips2">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips2"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips2">;
	def mips3 : Flag<["-"], "mips3">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips3"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips3">;
	def mips4 : Flag<["-"], "mips4">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips4"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips4">;
	def mips5 : Flag<["-"], "mips5">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips5"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips5">;
	def mips32 : Flag<["-"], "mips32">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips32"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips32">;
	def mips32r2 : Flag<["-"], "mips32r2">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips32r2"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips32r2">;
	def mips32r3 : Flag<["-"], "mips32r3">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips32r3"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips32r3">;
	def mips32r5 : Flag<["-"], "mips32r5">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips32r5"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips32r5">;
	def mips32r6 : Flag<["-"], "mips32r6">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips32r6"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips32r6">;
	def mips64 : Flag<["-"], "mips64">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips64"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips64">;
	def mips64r2 : Flag<["-"], "mips64r2">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips64r2"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips64r2">;
	def mips64r3 : Flag<["-"], "mips64r3">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips64r3"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips64r3">;
	def mips64r5 : Flag<["-"], "mips64r5">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips64r5"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips64r5">;
	def mips64r6 : Flag<["-"], "mips64r6">, Group<m_mips_Features_Group>,
	  Alias<march_EQ>, AliasArgs<["mips64r6"]>, Flags<[HelpHidden]>,
	  HelpText<"Equivalent to -march=mips64r6">;
	// Hidden (HelpHidden) flags in m_mips_Features_Group.
	def mfpxx : Flag<["-"], "mfpxx">,
	  Group<m_mips_Features_Group>, Flags<[HelpHidden]>,
	  HelpText<"Avoid FPU mode dependent operations when used with the O32 ABI">;
	def modd_spreg : Flag<["-"], "modd-spreg">,
	  Group<m_mips_Features_Group>, Flags<[HelpHidden]>,
	  HelpText<"Enable odd single-precision floating point registers">;
	def mno_odd_spreg : Flag<["-"], "mno-odd-spreg">,
	  Group<m_mips_Features_Group>, Flags<[HelpHidden]>,
	  HelpText<"Disable odd single-precision floating point registers">;
	def mrelax_pic_calls : Flag<["-"], "mrelax-pic-calls">,
	  Group<m_mips_Features_Group>, Flags<[HelpHidden]>,
	  HelpText<"Produce relaxation hints for linkers to try optimizing PIC "
	           "call sequences into direct calls (MIPS only)">;
	def mno_relax_pic_calls : Flag<["-"], "mno-relax-pic-calls">,
	  Group<m_mips_Features_Group>, Flags<[HelpHidden]>,
	  HelpText<"Do not produce relaxation hints for linkers to try optimizing PIC "
	           "call sequences into direct calls (MIPS only)">;

	// Hidden flags in m_libc_Group.
	def mglibc : Flag<["-"], "mglibc">,
	  Group<m_libc_Group>, Flags<[HelpHidden]>;
	def muclibc : Flag<["-"], "muclibc">,
	  Group<m_libc_Group>, Flags<[HelpHidden]>;
	// Options from -module-file-info through -preload, in their original order.
	def module_file_info : Flag<["-"], "module-file-info">,
	  Group<Action_Group>, Flags<[DriverOption,CC1Option]>,
	  HelpText<"Provide information about a particular module file">;
	def mthumb : Flag<["-"], "mthumb">, Group<m_Group>;
	def mtune_EQ : Joined<["-"], "mtune=">,
	  Group<m_Group>,
	  HelpText<"Accepted for compatibility with GCC. Currently has no effect.">;
	def multi__module : Flag<["-"], "multi_module">;
	def multiply__defined__unused : Separate<["-"], "multiply_defined_unused">;
	def multiply__defined : Separate<["-"], "multiply_defined">;
	def mwarn_nonportable_cfstrings :
	  Flag<["-"], "mwarn-nonportable-cfstrings">, Group<m_Group>;
	def no_canonical_prefixes : Flag<["-"], "no-canonical-prefixes">,
	  Flags<[HelpHidden, CoreOption]>,
	  HelpText<"Use relative instead of canonical paths">;
	def no_cpp_precomp : Flag<["-"], "no-cpp-precomp">,
	  Group<clang_ignored_f_Group>;
	def no_integrated_cpp : Flag<["-", "--"], "no-integrated-cpp">,
	  Flags<[DriverOption]>;
	def no_pedantic : Flag<["-", "--"], "no-pedantic">,
	  Group<pedantic_Group>;
	def no__dead__strip__inits__and__terms :
	  Flag<["-"], "no_dead_strip_inits_and_terms">;
	def nobuiltininc : Flag<["-"], "nobuiltininc">,
	  Flags<[CC1Option, CoreOption]>,
	  HelpText<"Disable builtin #include directories">;
	// -nocudainc and -nocudalib are anonymous aliases of the -nogpu* records.
	def nogpuinc : Flag<["-"], "nogpuinc">;
	def : Flag<["-"], "nocudainc">,
	  Alias<nogpuinc>;
	def nogpulib : Flag<["-"], "nogpulib">,
	  HelpText<"Do not link device library for CUDA/HIP device compilation">;
	def : Flag<["-"], "nocudalib">,
	  Alias<nogpulib>;
	def nodefaultlibs : Flag<["-"], "nodefaultlibs">;
	def nofixprebinding : Flag<["-"], "nofixprebinding">;
	def nolibc : Flag<["-"], "nolibc">;
	def nomultidefs : Flag<["-"], "nomultidefs">;
	// -no-pie is an alias of -nopie.
	def nopie : Flag<["-"], "nopie">;
	def no_pie : Flag<["-"], "no-pie">,
	  Alias<nopie>;
	def noprebind : Flag<["-"], "noprebind">;
	def noprofilelib : Flag<["-"], "noprofilelib">;
	def noseglinkedit : Flag<["-"], "noseglinkedit">;
	def nostartfiles : Flag<["-"], "nostartfiles">;
	def nostdinc : Flag<["-"], "nostdinc">,
	  Flags<[CoreOption]>;
	def nostdlibinc : Flag<["-"], "nostdlibinc">;
	def nostdincxx : Flag<["-"], "nostdinc++">,
	  Flags<[CC1Option]>,
	  HelpText<"Disable standard #include directories for the C++ standard library">;
	def nostdlib : Flag<["-"], "nostdlib">;
	def nostdlibxx : Flag<["-"], "nostdlib++">;
	def object : Flag<["-"], "object">;
	// -o accepts its <file> either joined or as a separate argument.
	def o : JoinedOrSeparate<["-"], "o">,
	  Flags<[DriverOption, RenderAsInput, CC1Option, CC1AsOption]>,
	  MetaVarName<"<file>">,
	  HelpText<"Write output to <file>">;
	def pagezero__size : JoinedOrSeparate<["-"], "pagezero_size">;
	def pass_exit_codes : Flag<["-", "--"], "pass-exit-codes">,
	  Flags<[Unsupported]>;
	def pedantic_errors : Flag<["-", "--"], "pedantic-errors">,
	  Group<pedantic_Group>, Flags<[CC1Option]>;
	def pedantic : Flag<["-", "--"], "pedantic">,
	  Group<pedantic_Group>, Flags<[CC1Option]>;
	def pg : Flag<["-"], "pg">,
	  Flags<[CC1Option]>,
	  HelpText<"Enable mcount instrumentation">;
	def pipe : Flag<["-", "--"], "pipe">,
	  HelpText<"Use pipes between commands, when possible">;
	def prebind__all__twolevel__modules :
	  Flag<["-"], "prebind_all_twolevel_modules">;
	def prebind : Flag<["-"], "prebind">;
	def preload : Flag<["-"], "preload">;
	// Informational -print-* options. All except -print-ivar-layout accept
	// both the "-" and "--" prefixes.
	def print_file_name_EQ : Joined<["-", "--"], "print-file-name=">,
	  MetaVarName<"<file>">,
	  HelpText<"Print the full library path of <file>">;
	def print_ivar_layout : Flag<["-"], "print-ivar-layout">,
	  Flags<[CC1Option]>,
	  HelpText<"Enable Objective-C Ivar layout bitmap print trace">;
	def print_libgcc_file_name : Flag<["-", "--"], "print-libgcc-file-name">,
	  HelpText<"Print the library path for the currently used compiler runtime "
	           "library (\"libgcc.a\" or \"libclang_rt.builtins.*.a\")">;
	def print_multi_directory : Flag<["-", "--"], "print-multi-directory">;
	def print_multi_lib : Flag<["-", "--"], "print-multi-lib">;
	// Declared but flagged Unsupported.
	def print_multi_os_directory :
	  Flag<["-", "--"], "print-multi-os-directory">,
	  Flags<[Unsupported]>;
	def print_target_triple : Flag<["-", "--"], "print-target-triple">,
	  HelpText<"Print the normalized target triple">;
	def print_effective_triple : Flag<["-", "--"], "print-effective-triple">,
	  HelpText<"Print the effective target triple">;
	def print_prog_name_EQ : Joined<["-", "--"], "print-prog-name=">,
	  MetaVarName<"<name>">,
	  HelpText<"Print the full program path of <name>">;
	def print_resource_dir : Flag<["-", "--"], "print-resource-dir">,
	  HelpText<"Print the resource directory pathname">;
	def print_search_dirs : Flag<["-", "--"], "print-search-dirs">,
	  HelpText<"Print the paths used for finding libraries and programs">;
	def print_targets : Flag<["-", "--"], "print-targets">,
	  HelpText<"Print the registered targets">;
	// Options from -private_bundle through -static, in their original order.
	def private__bundle : Flag<["-"], "private_bundle">;
	def pthreads : Flag<["-"], "pthreads">;
	def pthread : Flag<["-"], "pthread">,
	  Flags<[CC1Option]>,
	  HelpText<"Support POSIX threads in generated code">;
	def no_pthread : Flag<["-"], "no-pthread">,
	  Flags<[CC1Option]>;
	def p : Flag<["-"], "p">;
	def pie : Flag<["-"], "pie">;
	def static_pie : Flag<["-"], "static-pie">;
	def read__only__relocs : Separate<["-"], "read_only_relocs">;
	def remap : Flag<["-"], "remap">;
	def rewrite_objc : Flag<["-"], "rewrite-objc">,
	  Group<Action_Group>, Flags<[DriverOption,CC1Option]>,
	  HelpText<"Rewrite Objective-C source to C++">;
	def rewrite_legacy_objc : Flag<["-"], "rewrite-legacy-objc">,
	  Flags<[DriverOption]>,
	  HelpText<"Rewrite Legacy Objective-C source to C++">;
	def rdynamic : Flag<["-"], "rdynamic">;
	// -resource-dir <dir>; the joined -resource-dir=<dir> form is an alias.
	def resource_dir : Separate<["-"], "resource-dir">,
	  Flags<[DriverOption, CC1Option, CoreOption, HelpHidden]>,
	  HelpText<"The directory which holds the compiler resource files">;
	def resource_dir_EQ : Joined<["-"], "resource-dir=">,
	  Flags<[DriverOption, CoreOption]>,
	  Alias<resource_dir>;
	def rpath : Separate<["-"], "rpath">,
	  Group<Link_Group>, Flags<[LinkerInput]>;
	def rtlib_EQ : Joined<["-", "--"], "rtlib=">,
	  HelpText<"Compiler runtime library to use">;
	def frtlib_add_rpath : Flag<["-"], "frtlib-add-rpath">,
	  Flags<[NoArgumentUnused]>,
	  HelpText<"Add -rpath with architecture-specific resource directory to the linker flags">;
	def fno_rtlib_add_rpath : Flag<["-"], "fno-rtlib-add-rpath">,
	  Flags<[NoArgumentUnused]>,
	  HelpText<"Do not add -rpath with architecture-specific resource directory to the linker flags">;
	def r : Flag<["-"], "r">,
	  Group<Link_Group>, Flags<[LinkerInput,NoArgumentUnused]>;

	// The bare -save-temps / -save-stats spellings alias their "=" forms with
	// the argument "cwd".
	def save_temps_EQ : Joined<["-", "--"], "save-temps=">,
	  Flags<[CC1Option, DriverOption]>,
	  HelpText<"Save intermediate compilation results.">;
	def save_temps : Flag<["-", "--"], "save-temps">,
	  Flags<[DriverOption]>,
	  Alias<save_temps_EQ>, AliasArgs<["cwd"]>,
	  HelpText<"Save intermediate compilation results">;
	def save_stats_EQ : Joined<["-", "--"], "save-stats=">,
	  Flags<[DriverOption]>,
	  HelpText<"Save llvm statistics.">;
	def save_stats : Flag<["-", "--"], "save-stats">,
	  Flags<[DriverOption]>,
	  Alias<save_stats_EQ>, AliasArgs<["cwd"]>,
	  HelpText<"Save llvm statistics.">;
	def via_file_asm : Flag<["-", "--"], "via-file-asm">,
	  InternalDebugOpt,
	  HelpText<"Write assembly to file for input to assemble jobs">;

	// Section/segment options; the MultiArg ones take the fixed number of
	// arguments given as the last template parameter.
	def sectalign : MultiArg<["-"], "sectalign", 3>;
	def sectcreate : MultiArg<["-"], "sectcreate", 3>;
	def sectobjectsymbols : MultiArg<["-"], "sectobjectsymbols", 2>;
	def sectorder : MultiArg<["-"], "sectorder", 3>;
	def seg1addr : JoinedOrSeparate<["-"], "seg1addr">;
	def seg__addr__table__filename :
	  Separate<["-"], "seg_addr_table_filename">;
	def seg__addr__table : Separate<["-"], "seg_addr_table">;
	def segaddr : MultiArg<["-"], "segaddr", 2>;
	def segcreate : MultiArg<["-"], "segcreate", 3>;
	def seglinkedit : Flag<["-"], "seglinkedit">;
	def segprot : MultiArg<["-"], "segprot", 3>;
	def segs__read__only__addr : Separate<["-"], "segs_read_only_addr">;
	def segs__read__write__addr : Separate<["-"], "segs_read_write_addr">;
	def segs__read__ : Joined<["-"], "segs_read_">;

	def shared_libgcc : Flag<["-"], "shared-libgcc">;
	def shared : Flag<["-", "--"], "shared">;
	def single__module : Flag<["-"], "single_module">;
	def specs_EQ : Joined<["-", "--"], "specs=">;
	def specs : Separate<["-", "--"], "specs">,
	  Flags<[Unsupported]>;
	def static_libgcc : Flag<["-"], "static-libgcc">;
	def static_libstdcxx : Flag<["-"], "static-libstdc++">;
	def static : Flag<["-", "--"], "static">,
	  Flags<[NoArgumentUnused]>;
	// -std-default=<value>: joined option with no group, flags or help text.
	def std_default_EQ : Joined<["-"], "std-default=">;
	// -std=<value> / --std=<value>: language standard selection, also a cc1
	// option. The value list is not spelled out here: ValuesCode carries a code
	// fragment that builds a comma-separated string by expanding the
	// LANGSTANDARD and LANGSTANDARD_ALIAS macros over
	// clang/Basic/LangStandards.def. Do not add comments or reflow text inside
	// the [{ ... }] literal; its contents are taken verbatim.
	def std_EQ : Joined<["-", "--"], "std=">, Flags<[CC1Option]>,
	Group<CompileOnly_Group>, HelpText<"Language standard to compile for">,
	ValuesCode<[{
	const char *Values =
	#define LANGSTANDARD(id, name, lang, desc, features) name ","
	#define LANGSTANDARD_ALIAS(id, alias) alias ","
	#include "clang/Basic/LangStandards.def"
	;
	}]>;
	// -stdlib=<value>: advertised values are libc++, libstdc++ and platform.
	def stdlib_EQ : Joined<["-", "--"], "stdlib=">,
	  Flags<[CC1Option]>,
	  Values<"libc++,libstdc++,platform">,
	  HelpText<"C++ standard library to use">;
	// -stdlib++-isystem <directory>: driver-only include-path option in
	// clang_i_Group.
	def stdlibxx_isystem : JoinedOrSeparate<["-"], "stdlib++-isystem">,
	  Group<clang_i_Group>,
	  Flags<[DriverOption]>,
	  MetaVarName<"<directory>">,
	  HelpText<"Use directory as the C++ standard library include path">;
	// -unwindlib=<value> / --unwindlib=<value>: selects the unwind library.
	// Values<> feeds documentation and shell completion, so it must list the
	// spellings that are actually accepted. Per the Clang toolchain
	// documentation the LLVM unwinder is selected with "libunwind"; the
	// previous entry "unwindlib" was not an accepted value.
	def unwindlib_EQ : Joined<["-", "--"], "unwindlib=">,
	  Flags<[CC1Option]>,
	  HelpText<"Unwind library to use">,
	  Values<"libgcc,libunwind,platform">;
	// Options from -sub_library through -y, in their original order.
	def sub__library : JoinedOrSeparate<["-"], "sub_library">;
	def sub__umbrella : JoinedOrSeparate<["-"], "sub_umbrella">;

	// --system-header-prefix=<prefix> and its negation. Each also has an
	// anonymous Separate-form alias.
	def system_header_prefix : Joined<["--"], "system-header-prefix=">,
	  Group<clang_i_Group>, Flags<[CC1Option]>, MetaVarName<"<prefix>">,
	  HelpText<"Treat all #include paths starting with <prefix> as including a "
	           "system header.">;
	def : Separate<["--"], "system-header-prefix">,
	  Alias<system_header_prefix>;
	def no_system_header_prefix : Joined<["--"], "no-system-header-prefix=">,
	  Group<clang_i_Group>, Flags<[CC1Option]>, MetaVarName<"<prefix>">,
	  HelpText<"Treat all #include paths starting with <prefix> as not including a "
	           "system header.">;
	def : Separate<["--"], "no-system-header-prefix">,
	  Alias<no_system_header_prefix>;

	def s : Flag<["-"], "s">, Group<Link_Group>;
	def target : Joined<["--"], "target=">,
	  Flags<[DriverOption, CoreOption]>,
	  HelpText<"Generate code for the given target">;
	// -mcpu=? and -mtune=? are aliases of -print-supported-cpus.
	def print_supported_cpus : Flag<["-", "--"], "print-supported-cpus">,
	  Group<CompileOnly_Group>, Flags<[CC1Option, CoreOption]>,
	  HelpText<"Print supported cpu models for the given target (if target is not specified,"
	           " it will print the supported cpus for the default target)">;
	def mcpu_EQ_QUESTION : Flag<["-"], "mcpu=?">,
	  Alias<print_supported_cpus>;
	def mtune_EQ_QUESTION : Flag<["-"], "mtune=?">,
	  Alias<print_supported_cpus>;
	def gcc_toolchain : Joined<["--"], "gcc-toolchain=">,
	  Flags<[DriverOption]>,
	  HelpText<"Use the gcc toolchain at the given directory">;
	def time : Flag<["-"], "time">,
	  HelpText<"Time individual commands">;
	def traditional_cpp : Flag<["-", "--"], "traditional-cpp">,
	  Flags<[CC1Option]>,
	  HelpText<"Enable some traditional CPP emulation">;
	def traditional : Flag<["-", "--"], "traditional">;
	// Alias of ftrigraphs.
	def trigraphs : Flag<["-", "--"], "trigraphs">,
	  Alias<ftrigraphs>,
	  HelpText<"Process trigraph sequences">;
	def twolevel__namespace__hints : Flag<["-"], "twolevel_namespace_hints">;
	def twolevel__namespace : Flag<["-"], "twolevel_namespace">;
	def t : Flag<["-"], "t">, Group<Link_Group>;
	def umbrella : Separate<["-"], "umbrella">;
	def undefined : JoinedOrSeparate<["-"], "undefined">, Group<u_Group>;
	def undef : Flag<["-"], "undef">,
	  Group<u_Group>, Flags<[CC1Option]>,
	  HelpText<"undef all system defines">;
	def unexported__symbols__list : Separate<["-"], "unexported_symbols_list">;
	def u : JoinedOrSeparate<["-"], "u">, Group<u_Group>;
	def v : Flag<["-"], "v">,
	  Flags<[CC1Option, CoreOption]>,
	  HelpText<"Show commands to run and use verbose output">;
	def verify_debug_info : Flag<["--"], "verify-debug-info">,
	  Flags<[DriverOption]>,
	  HelpText<"Verify the binary representation of debug output">;
	// The weak-l / weak_framework / weak_library options are LinkerInput.
	def weak_l : Joined<["-"], "weak-l">,
	  Flags<[LinkerInput]>;
	def weak__framework : Separate<["-"], "weak_framework">,
	  Flags<[LinkerInput]>;
	def weak__library : Separate<["-"], "weak_library">,
	  Flags<[LinkerInput]>;
	def weak__reference__mismatches :
	  Separate<["-"], "weak_reference_mismatches">;
	def whatsloaded : Flag<["-"], "whatsloaded">;
	def whyload : Flag<["-"], "whyload">;
	def w : Flag<["-"], "w">,
	  Flags<[CC1Option]>,
	  HelpText<"Suppress all warnings">;
	def x : JoinedOrSeparate<["-"], "x">,
	  Flags<[DriverOption,CC1Option]>,
	  MetaVarName<"<language>">,
	  HelpText<"Treat subsequent input files as having type <language>">;
	def y : Joined<["-"], "y">;

	// NOTE(review): OptOutFFlag is defined outside this excerpt; it presumably
	// yields the fintegrated_as / fno_integrated_as records that the anonymous
	// aliases further down refer to - confirm against its definition.
	defm integrated_as : OptOutFFlag<"integrated-as",
	  "Enable the integrated assembler",
	  "Disable the integrated assembler">;

	// In-process versus out-of-process cc1 invocation.
	def fintegrated_cc1 : Flag<["-"], "fintegrated-cc1">,
	  Group<f_Group>, Flags<[CoreOption, DriverOption]>,
	  HelpText<"Run cc1 in-process">;
	def fno_integrated_cc1 : Flag<["-"], "fno-integrated-cc1">,
	  Group<f_Group>, Flags<[CoreOption, DriverOption]>,
	  HelpText<"Spawn a separate process for each cc1">;

	// Spellings without the leading "f"; both are aliases.
	def : Flag<["-"], "integrated-as">,
	  Flags<[DriverOption]>,
	  Alias<fintegrated_as>;
	def : Flag<["-"], "no-integrated-as">,
	  Flags<[CC1Option, DriverOption]>,
	  Alias<fno_integrated_as>;

	// -working-directory <dir>; the "=" form is an alias of it.
	def working_directory : JoinedOrSeparate<["-"], "working-directory">,
	  Flags<[CC1Option]>,
	  HelpText<"Resolve file paths relative to the specified directory">;
	def working_directory_EQ : Joined<["-"], "working-directory=">,
	  Flags<[CC1Option]>,
	  Alias<working_directory>;

	// Double dash options, which are usually an alias for one of the previous
	// options.

	// Each alias comes as a Joined ("--name=value") and a Separate
	// ("--name value") form resolving to the same target option.
	def _mhwdiv_EQ : Joined<["--"], "mhwdiv=">,
	  Alias<mhwdiv_EQ>;
	def _mhwdiv : Separate<["--"], "mhwdiv">,
	  Alias<mhwdiv_EQ>;
	def _CLASSPATH_EQ : Joined<["--"], "CLASSPATH=">,
	  Alias<fclasspath_EQ>;
	def _CLASSPATH : Separate<["--"], "CLASSPATH">,
	  Alias<fclasspath_EQ>;
	def _all_warnings : Flag<["--"], "all-warnings">,
	  Alias<Wall>;
	// Not an alias: a driver-only flag of its own.
	def _analyzer_no_default_checks :
	  Flag<["--"], "analyzer-no-default-checks">,
	  Flags<[DriverOption]>;
	// --analyzer-output <format>: driver-only option naming the static analyzer
	// report format. The alternatives in the help text are separated by a plain
	// '|'. TableGen string literals only accept the escapes \\ \' \" \t \n, so
	// the previous "\|" sequences were invalid escapes.
	def _analyzer_output : JoinedOrSeparate<["--"], "analyzer-output">,
	  Flags<[DriverOption]>,
	  HelpText<"Static analyzer report output format (html|plist|plist-multi-file|plist-html|sarif|text).">;
	// Long-form `--` spellings, listed alphabetically. Records carrying Alias<>
	// forward to the named option; records without it (--analyze,
	// --constant-cfstrings, --dyld-prefix=) are options in their own right.
	def _analyze : Flag<["--"], "analyze">, Flags<[DriverOption, CoreOption]>,
	HelpText<"Run the static analyzer">;
	def _assemble : Flag<["--"], "assemble">, Alias<S>;
	// Options that take a value usually come in two records: a Joined `name=`
	// form (suffix _EQ) and a Separate `name <value>` form.
	def _assert_EQ : Joined<["--"], "assert=">, Alias<A>;
	def _assert : Separate<["--"], "assert">, Alias<A>;
	def _bootclasspath_EQ : Joined<["--"], "bootclasspath=">, Alias<fbootclasspath_EQ>;
	def _bootclasspath : Separate<["--"], "bootclasspath">, Alias<fbootclasspath_EQ>;
	def _classpath_EQ : Joined<["--"], "classpath=">, Alias<fclasspath_EQ>;
	def _classpath : Separate<["--"], "classpath">, Alias<fclasspath_EQ>;
	def _comments_in_macros : Flag<["--"], "comments-in-macros">, Alias<CC>;
	def _comments : Flag<["--"], "comments">, Alias<C>;
	def _compile : Flag<["--"], "compile">, Alias<c>;
	def _constant_cfstrings : Flag<["--"], "constant-cfstrings">;
	def _debug_EQ : Joined<["--"], "debug=">, Alias<g_Flag>;
	def _debug : Flag<["--"], "debug">, Alias<g_Flag>;
	def _define_macro_EQ : Joined<["--"], "define-macro=">, Alias<D>;
	def _define_macro : Separate<["--"], "define-macro">, Alias<D>;
	def _dependencies : Flag<["--"], "dependencies">, Alias<M>;
	def _dyld_prefix_EQ : Joined<["--"], "dyld-prefix=">;
	// The separate-argument form forwards to the joined `=` form defined above.
	def _dyld_prefix : Separate<["--"], "dyld-prefix">, Alias<_dyld_prefix_EQ>;
	def _encoding_EQ : Joined<["--"], "encoding=">, Alias<fencoding_EQ>;
	def _encoding : Separate<["--"], "encoding">, Alias<fencoding_EQ>;
	def _entry : Flag<["--"], "entry">, Alias<e>;
	def _extdirs_EQ : Joined<["--"], "extdirs=">, Alias<fextdirs_EQ>;
	def _extdirs : Separate<["--"], "extdirs">, Alias<fextdirs_EQ>;
	def _extra_warnings : Flag<["--"], "extra-warnings">, Alias<W_Joined>;
	def _for_linker_EQ : Joined<["--"], "for-linker=">, Alias<Xlinker>;
	def _for_linker : Separate<["--"], "for-linker">, Alias<Xlinker>;
	def _force_link_EQ : Joined<["--"], "force-link=">, Alias<u>;
	def _force_link : Separate<["--"], "force-link">, Alias<u>;
	// --help-hidden is a standalone flag with its own help text; it is not an alias.
	def _help_hidden : Flag<["--"], "help-hidden">,
	HelpText<"Display help for hidden options">;
	// Long-form spellings of the include-path options. Each forwards to the
	// short option named in Alias<>.
	def _imacros_EQ : Joined<["--"], "imacros=">, Alias<imacros>;
	def _include_barrier : Flag<["--"], "include-barrier">, Alias<I_>;
	def _include_directory_after_EQ : Joined<["--"], "include-directory-after=">, Alias<idirafter>;
	def _include_directory_after : Separate<["--"], "include-directory-after">, Alias<idirafter>;
	def _include_directory_EQ : Joined<["--"], "include-directory=">, Alias<I>;
	def _include_directory : Separate<["--"], "include-directory">, Alias<I>;
	def _include_prefix_EQ : Joined<["--"], "include-prefix=">, Alias<iprefix>;
	def _include_prefix : Separate<["--"], "include-prefix">, Alias<iprefix>;
	// Note: both --include-with-prefix-after and plain --include-with-prefix
	// forward to iwithprefix.
	def _include_with_prefix_after_EQ : Joined<["--"], "include-with-prefix-after=">, Alias<iwithprefix>;
	def _include_with_prefix_after : Separate<["--"], "include-with-prefix-after">, Alias<iwithprefix>;
	def _include_with_prefix_before_EQ : Joined<["--"], "include-with-prefix-before=">, Alias<iwithprefixbefore>;
	def _include_with_prefix_before : Separate<["--"], "include-with-prefix-before">, Alias<iwithprefixbefore>;
	def _include_with_prefix_EQ : Joined<["--"], "include-with-prefix=">, Alias<iwithprefix>;
	def _include_with_prefix : Separate<["--"], "include-with-prefix">, Alias<iwithprefix>;
	def _include_EQ : Joined<["--"], "include=">, Alias<include_>;
	def _language_EQ : Joined<["--"], "language=">, Alias<x>;
	def _language : Separate<["--"], "language">, Alias<x>;
	def _library_directory_EQ : Joined<["--"], "library-directory=">, Alias<L>;
	def _library_directory : Separate<["--"], "library-directory">, Alias<L>;
	def _no_line_commands : Flag<["--"], "no-line-commands">, Alias<P>;
	def _no_standard_includes : Flag<["--"], "no-standard-includes">, Alias<nostdinc>;
	def _no_standard_libraries : Flag<["--"], "no-standard-libraries">, Alias<nostdlib>;
	// Not an alias: --no-undefined is its own option, flagged LinkerInput.
	def _no_undefined : Flag<["--"], "no-undefined">, Flags<[LinkerInput]>;
	def _no_warnings : Flag<["--"], "no-warnings">, Alias<w>;
	def _optimize_EQ : Joined<["--"], "optimize=">, Alias<O>;
	def _optimize : Flag<["--"], "optimize">, Alias<O>;
	def _output_class_directory_EQ : Joined<["--"], "output-class-directory=">, Alias<foutput_class_dir_EQ>;
	def _output_class_directory : Separate<["--"], "output-class-directory">, Alias<foutput_class_dir_EQ>;
	def _output_EQ : Joined<["--"], "output=">, Alias<o>;
	def _output : Separate<["--"], "output">, Alias<o>;
	// For --param the separate-argument form is the real option (in
	// CompileOnly_Group) and the joined `--param=` form is the alias.
	def _param : Separate<["--"], "param">, Group<CompileOnly_Group>;
	def _param_EQ : Joined<["--"], "param=">, Alias<_param>;
	// --precompile is a standalone driver flag placed in Action_Group.
	def _precompile : Flag<["--"], "precompile">, Flags<[DriverOption]>,
	Group<Action_Group>, HelpText<"Only precompile the input">;
	def _prefix_EQ : Joined<["--"], "prefix=">, Alias<B>;
	def _prefix : Separate<["--"], "prefix">, Alias<B>;
	def _preprocess : Flag<["--"], "preprocess">, Alias<E>;
	def _print_diagnostic_categories : Flag<["--"], "print-diagnostic-categories">;
	def _print_file_name : Separate<["--"], "print-file-name">, Alias<print_file_name_EQ>;
	def _print_missing_file_dependencies : Flag<["--"], "print-missing-file-dependencies">, Alias<MG>;
	def _print_prog_name : Separate<["--"], "print-prog-name">, Alias<print_prog_name_EQ>;
	def _profile_blocks : Flag<["--"], "profile-blocks">, Alias<a>;
	def _profile : Flag<["--"], "profile">, Alias<p>;
	def _resource_EQ : Joined<["--"], "resource=">, Alias<fcompile_resource_EQ>;
	def _resource : Separate<["--"], "resource">, Alias<fcompile_resource_EQ>;
	def _rtlib : Separate<["--"], "rtlib">, Alias<rtlib_EQ>;
	// Accepted with either a single or a double dash; note the record is named
	// _serialize_diags while the spelling is "serialize-diagnostics".
	def _serialize_diags : Separate<["-", "--"], "serialize-diagnostics">, Flags<[DriverOption]>,
	HelpText<"Serialize compiler diagnostics to a file">;
	// We give --version different semantics from -version.
	def _version : Flag<["--"], "version">, Flags<[CoreOption, CC1Option]>,
	HelpText<"Print version information">;
	def _signed_char : Flag<["--"], "signed-char">, Alias<fsigned_char>;
	def _std : Separate<["--"], "std">, Alias<std_EQ>;
	def _stdlib : Separate<["--"], "stdlib">, Alias<stdlib_EQ>;
	// --sysroot= is the real option; the separate-argument form forwards to it.
	def _sysroot_EQ : Joined<["--"], "sysroot=">;
	def _sysroot : Separate<["--"], "sysroot">, Alias<_sysroot_EQ>;
	def _target_help : Flag<["--"], "target-help">;
	def _trace_includes : Flag<["--"], "trace-includes">, Alias<H>;
	def _undefine_macro_EQ : Joined<["--"], "undefine-macro=">, Alias<U>;
	def _undefine_macro : Separate<["--"], "undefine-macro">, Alias<U>;
	def _unsigned_char : Flag<["--"], "unsigned-char">, Alias<funsigned_char>;
	def _user_dependencies : Flag<["--"], "user-dependencies">, Alias<MM>;
	def _verbose : Flag<["--"], "verbose">, Alias<v>;
	// Both `--warn-=<x>` and `--warn-<x>` forward to W_Joined.
	def _warn__EQ : Joined<["--"], "warn-=">, Alias<W_Joined>;
	def _warn_ : Joined<["--"], "warn-">, Alias<W_Joined>;
	def _write_dependencies : Flag<["--"], "write-dependencies">, Alias<MD>;
	def _write_user_dependencies : Flag<["--"], "write-user-dependencies">, Alias<MMD>;
	// Catch-all: a Joined option with an empty name under the "--" prefix, flagged
	// Unsupported. Presumably this picks up any `--…` spelling not matched by a
	// more specific record — confirm against the option parser's matching rules.
	def _ : Joined<["--"], "">, Flags<[Unsupported]>;

	// Hexagon feature flags.
	//
	// -mieee-rnd-near is a plain feature flag. Each -mvNN flag below is an alias
	// that expands to mcpu_EQ with the matching "hexagonvNN" argument.
	def mieee_rnd_near
	  : Flag<["-"], "mieee-rnd-near">,
	    Group<m_hexagon_Features_Group>;
	def mv5
	  : Flag<["-"], "mv5">,
	    Group<m_hexagon_Features_Group>,
	    Alias<mcpu_EQ>, AliasArgs<["hexagonv5"]>;
	def mv55
	  : Flag<["-"], "mv55">,
	    Group<m_hexagon_Features_Group>,
	    Alias<mcpu_EQ>, AliasArgs<["hexagonv55"]>;
	def mv60
	  : Flag<["-"], "mv60">,
	    Group<m_hexagon_Features_Group>,
	    Alias<mcpu_EQ>, AliasArgs<["hexagonv60"]>;
	def mv62
	  : Flag<["-"], "mv62">,
	    Group<m_hexagon_Features_Group>,
	    Alias<mcpu_EQ>, AliasArgs<["hexagonv62"]>;
	def mv65
	  : Flag<["-"], "mv65">,
	    Group<m_hexagon_Features_Group>,
	    Alias<mcpu_EQ>, AliasArgs<["hexagonv65"]>;
	def mv66
	  : Flag<["-"], "mv66">,
	    Group<m_hexagon_Features_Group>,
	    Alias<mcpu_EQ>, AliasArgs<["hexagonv66"]>;
	def mv67
	  : Flag<["-"], "mv67">,
	    Group<m_hexagon_Features_Group>,
	    Alias<mcpu_EQ>, AliasArgs<["hexagonv67"]>;
	def mv67t
	  : Flag<["-"], "mv67t">,
	    Group<m_hexagon_Features_Group>,
	    Alias<mcpu_EQ>, AliasArgs<["hexagonv67t"]>;
	// Hexagon Vector eXtensions (HVX) controls; all four records live in
	// m_hexagon_Features_HVX_Group. -mhvx / -mhvx=<value> enable, -mno-hvx
	// disables, and -mhvx-length= selects the vector length (64B or 128B).
	def mhexagon_hvx
	  : Flag<["-"], "mhvx">,
	    Group<m_hexagon_Features_HVX_Group>,
	    HelpText<"Enable Hexagon Vector eXtensions">;
	def mhexagon_hvx_EQ
	  : Joined<["-"], "mhvx=">,
	    Group<m_hexagon_Features_HVX_Group>,
	    HelpText<"Enable Hexagon Vector eXtensions">;
	def mno_hexagon_hvx
	  : Flag<["-"], "mno-hvx">,
	    Group<m_hexagon_Features_HVX_Group>,
	    HelpText<"Disable Hexagon Vector eXtensions">;
	def mhexagon_hvx_length_EQ
	  : Joined<["-"], "mhvx-length=">,
	    Group<m_hexagon_Features_HVX_Group>,
	    HelpText<"Set Hexagon Vector Length">,
	    Values<"64B,128B">;
	// -ffixed-r19 has no Group<>; its help text marks it as Hexagon-only.
	def ffixed_r19
	  : Flag<["-"], "ffixed-r19">,
	    HelpText<"Reserve register r19 (Hexagon only)">;

	// Code-generation toggles. Each feature has an enabling and a disabling
	// record, both in m_hexagon_Features_Group and both carrying CC1Option.
	def mmemops
	  : Flag<["-"], "mmemops">,
	    Group<m_hexagon_Features_Group>, Flags<[CC1Option]>,
	    HelpText<"Enable generation of memop instructions">;
	def mno_memops
	  : Flag<["-"], "mno-memops">,
	    Group<m_hexagon_Features_Group>, Flags<[CC1Option]>,
	    HelpText<"Disable generation of memop instructions">;
	def mpackets
	  : Flag<["-"], "mpackets">,
	    Group<m_hexagon_Features_Group>, Flags<[CC1Option]>,
	    HelpText<"Enable generation of instruction packets">;
	def mno_packets
	  : Flag<["-"], "mno-packets">,
	    Group<m_hexagon_Features_Group>, Flags<[CC1Option]>,
	    HelpText<"Disable generation of instruction packets">;
	def mnvj
	  : Flag<["-"], "mnvj">,
	    Group<m_hexagon_Features_Group>, Flags<[CC1Option]>,
	    HelpText<"Enable generation of new-value jumps">;
	def mno_nvj
	  : Flag<["-"], "mno-nvj">,
	    Group<m_hexagon_Features_Group>, Flags<[CC1Option]>,
	    HelpText<"Disable generation of new-value jumps">;
	def mnvs
	  : Flag<["-"], "mnvs">,
	    Group<m_hexagon_Features_Group>, Flags<[CC1Option]>,
	    HelpText<"Enable generation of new-value stores">;
	def mno_nvs
	  : Flag<["-"], "mno-nvs">,
	    Group<m_hexagon_Features_Group>, Flags<[CC1Option]>,
	    HelpText<"Disable generation of new-value stores">;


	// X86 feature flags
	// Each feature is declared as an -m<feature> / -mno-<feature> pair of plain
	// flags in m_x86_Features_Group.
	def mx87 : Flag<["-"], "mx87">, Group<m_x86_Features_Group>;
	def mno_x87 : Flag<["-"], "mno-x87">, Group<m_x86_Features_Group>;
	// -m80387 / -mno-80387 are alternate spellings that alias the x87 pair above.
	def m80387 : Flag<["-"], "m80387">, Alias<mx87>;
	def mno_80387 : Flag<["-"], "mno-80387">, Alias<mno_x87>;
	def mmmx : Flag<["-"], "mmmx">, Group<m_x86_Features_Group>;
	def mno_mmx : Flag<["-"], "mno-mmx">, Group<m_x86_Features_Group>;
	def m3dnow : Flag<["-"], "m3dnow">, Group<m_x86_Features_Group>;
	def mno_3dnow : Flag<["-"], "mno-3dnow">, Group<m_x86_Features_Group>;
	def m3dnowa : Flag<["-"], "m3dnowa">, Group<m_x86_Features_Group>;
	def mno_3dnowa : Flag<["-"], "mno-3dnowa">, Group<m_x86_Features_Group>;
	// AMX toggles: -mamx-bf16, -mamx-int8, -mamx-tile and their -mno- forms.
	// NOTE(review): the record for -mamx-int8 is named `mtamx_int8` (stray 't'),
	// unlike its siblings `mamx_bf16` / `mamx_tile` and its own negation
	// `mno_amx_int8`. The command-line spelling is correct; only the record
	// identifier is affected. Renaming it would change that identifier, so
	// confirm nothing refers to it before fixing.
	def mamx_bf16 : Flag<["-"], "mamx-bf16">, Group<m_x86_Features_Group>;
	def mno_amx_bf16 : Flag<["-"], "mno-amx-bf16">, Group<m_x86_Features_Group>;
	def mtamx_int8 : Flag<["-"], "mamx-int8">, Group<m_x86_Features_Group>;
	def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group<m_x86_Features_Group>;
	def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
	def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
	// SSE family. Record names replace the '.' in the spelling with '_'
	// (e.g. msse4_1 for -msse4.1).
	def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
	def mno_sse : Flag<["-"], "mno-sse">, Group<m_x86_Features_Group>;
	def msse2 : Flag<["-"], "msse2">, Group<m_x86_Features_Group>;
	def mno_sse2 : Flag<["-"], "mno-sse2">, Group<m_x86_Features_Group>;
	def msse3 : Flag<["-"], "msse3">, Group<m_x86_Features_Group>;
	def mno_sse3 : Flag<["-"], "mno-sse3">, Group<m_x86_Features_Group>;
	def mssse3 : Flag<["-"], "mssse3">, Group<m_x86_Features_Group>;
	def mno_ssse3 : Flag<["-"], "mno-ssse3">, Group<m_x86_Features_Group>;
	def msse4_1 : Flag<["-"], "msse4.1">, Group<m_x86_Features_Group>;
	def mno_sse4_1 : Flag<["-"], "mno-sse4.1">, Group<m_x86_Features_Group>;
	def msse4_2 : Flag<["-"], "msse4.2">, Group<m_x86_Features_Group>;
	def mno_sse4_2 : Flag<["-"], "mno-sse4.2">, Group<m_x86_Features_Group>;
	// The unversioned -msse4 / -mno-sse4 spellings are deliberately asymmetric
	// aliases: the positive form maps to 4.2, the negative form to 4.1.
	def msse4 : Flag<["-"], "msse4">, Alias<msse4_2>;
	// -mno-sse4 turns off sse4.1 which has the effect of turning off everything
	// later than 4.1. -msse4 turns on 4.2 which has the effect of turning on
	// everything earlier than 4.2.
	def mno_sse4 : Flag<["-"], "mno-sse4">, Alias<mno_sse4_1>;
	def msse4a : Flag<["-"], "msse4a">, Group<m_x86_Features_Group>;
	def mno_sse4a : Flag<["-"], "mno-sse4a">, Group<m_x86_Features_Group>;
	// AVX, AVX2 and the AVX-512 sub-features: one -m<feature> / -mno-<feature>
	// pair per feature, all in m_x86_Features_Group.
	def mavx : Flag<["-"], "mavx">, Group<m_x86_Features_Group>;
	def mno_avx : Flag<["-"], "mno-avx">, Group<m_x86_Features_Group>;
	def mavx2 : Flag<["-"], "mavx2">, Group<m_x86_Features_Group>;
	def mno_avx2 : Flag<["-"], "mno-avx2">, Group<m_x86_Features_Group>;
	def mavx512f : Flag<["-"], "mavx512f">, Group<m_x86_Features_Group>;
	def mno_avx512f : Flag<["-"], "mno-avx512f">, Group<m_x86_Features_Group>;
	def mavx512bf16 : Flag<["-"], "mavx512bf16">, Group<m_x86_Features_Group>;
	def mno_avx512bf16 : Flag<["-"], "mno-avx512bf16">, Group<m_x86_Features_Group>;
	def mavx512bitalg : Flag<["-"], "mavx512bitalg">, Group<m_x86_Features_Group>;
	def mno_avx512bitalg : Flag<["-"], "mno-avx512bitalg">, Group<m_x86_Features_Group>;
	def mavx512bw : Flag<["-"], "mavx512bw">, Group<m_x86_Features_Group>;
	def mno_avx512bw : Flag<["-"], "mno-avx512bw">, Group<m_x86_Features_Group>;
	def mavx512cd : Flag<["-"], "mavx512cd">, Group<m_x86_Features_Group>;
	def mno_avx512cd : Flag<["-"], "mno-avx512cd">, Group<m_x86_Features_Group>;
	def mavx512dq : Flag<["-"], "mavx512dq">, Group<m_x86_Features_Group>;
	def mno_avx512dq : Flag<["-"], "mno-avx512dq">, Group<m_x86_Features_Group>;
	def mavx512er : Flag<["-"], "mavx512er">, Group<m_x86_Features_Group>;
	def mno_avx512er : Flag<["-"], "mno-avx512er">, Group<m_x86_Features_Group>;
	def mavx512ifma : Flag<["-"], "mavx512ifma">, Group<m_x86_Features_Group>;
	def mno_avx512ifma : Flag<["-"], "mno-avx512ifma">, Group<m_x86_Features_Group>;
	def mavx512pf : Flag<["-"], "mavx512pf">, Group<m_x86_Features_Group>;
	def mno_avx512pf : Flag<["-"], "mno-avx512pf">, Group<m_x86_Features_Group>;
	def mavx512vbmi : Flag<["-"], "mavx512vbmi">, Group<m_x86_Features_Group>;
	def mno_avx512vbmi : Flag<["-"], "mno-avx512vbmi">, Group<m_x86_Features_Group>;
	def mavx512vbmi2 : Flag<["-"], "mavx512vbmi2">, Group<m_x86_Features_Group>;
	def mno_avx512vbmi2 : Flag<["-"], "mno-avx512vbmi2">, Group<m_x86_Features_Group>;
	def mavx512vl : Flag<["-"], "mavx512vl">, Group<m_x86_Features_Group>;
	def mno_avx512vl : Flag<["-"], "mno-avx512vl">, Group<m_x86_Features_Group>;
	def mavx512vnni : Flag<["-"], "mavx512vnni">, Group<m_x86_Features_Group>;
	def mno_avx512vnni : Flag<["-"], "mno-avx512vnni">, Group<m_x86_Features_Group>;
	def mavx512vpopcntdq : Flag<["-"], "mavx512vpopcntdq">, Group<m_x86_Features_Group>;
	def mno_avx512vpopcntdq : Flag<["-"], "mno-avx512vpopcntdq">, Group<m_x86_Features_Group>;
	def mavx512vp2intersect : Flag<["-"], "mavx512vp2intersect">, Group<m_x86_Features_Group>;
	def mno_avx512vp2intersect : Flag<["-"], "mno-avx512vp2intersect">, Group<m_x86_Features_Group>;
	// Further x86 feature toggles (adx … pku). Every feature is an
	// -m<feature> / -mno-<feature> pair of plain flags in m_x86_Features_Group;
	// none of them is an alias.
	def madx : Flag<["-"], "madx">, Group<m_x86_Features_Group>;
	def mno_adx : Flag<["-"], "mno-adx">, Group<m_x86_Features_Group>;
	def maes : Flag<["-"], "maes">, Group<m_x86_Features_Group>;
	def mno_aes : Flag<["-"], "mno-aes">, Group<m_x86_Features_Group>;
	def mbmi : Flag<["-"], "mbmi">, Group<m_x86_Features_Group>;
	def mno_bmi : Flag<["-"], "mno-bmi">, Group<m_x86_Features_Group>;
	def mbmi2 : Flag<["-"], "mbmi2">, Group<m_x86_Features_Group>;
	def mno_bmi2 : Flag<["-"], "mno-bmi2">, Group<m_x86_Features_Group>;
	def mcldemote : Flag<["-"], "mcldemote">, Group<m_x86_Features_Group>;
	def mno_cldemote : Flag<["-"], "mno-cldemote">, Group<m_x86_Features_Group>;
	def mclflushopt : Flag<["-"], "mclflushopt">, Group<m_x86_Features_Group>;
	def mno_clflushopt : Flag<["-"], "mno-clflushopt">, Group<m_x86_Features_Group>;
	def mclwb : Flag<["-"], "mclwb">, Group<m_x86_Features_Group>;
	def mno_clwb : Flag<["-"], "mno-clwb">, Group<m_x86_Features_Group>;
	def mwbnoinvd : Flag<["-"], "mwbnoinvd">, Group<m_x86_Features_Group>;
	def mno_wbnoinvd : Flag<["-"], "mno-wbnoinvd">, Group<m_x86_Features_Group>;
	def mclzero : Flag<["-"], "mclzero">, Group<m_x86_Features_Group>;
	def mno_clzero : Flag<["-"], "mno-clzero">, Group<m_x86_Features_Group>;
	def mcx16 : Flag<["-"], "mcx16">, Group<m_x86_Features_Group>;
	def mno_cx16 : Flag<["-"], "mno-cx16">, Group<m_x86_Features_Group>;
	def menqcmd : Flag<["-"], "menqcmd">, Group<m_x86_Features_Group>;
	def mno_enqcmd : Flag<["-"], "mno-enqcmd">, Group<m_x86_Features_Group>;
	def mf16c : Flag<["-"], "mf16c">, Group<m_x86_Features_Group>;
	def mno_f16c : Flag<["-"], "mno-f16c">, Group<m_x86_Features_Group>;
	def mfma : Flag<["-"], "mfma">, Group<m_x86_Features_Group>;
	def mno_fma : Flag<["-"], "mno-fma">, Group<m_x86_Features_Group>;
	def mfma4 : Flag<["-"], "mfma4">, Group<m_x86_Features_Group>;
	def mno_fma4 : Flag<["-"], "mno-fma4">, Group<m_x86_Features_Group>;
	def mfsgsbase : Flag<["-"], "mfsgsbase">, Group<m_x86_Features_Group>;
	def mno_fsgsbase : Flag<["-"], "mno-fsgsbase">, Group<m_x86_Features_Group>;
	def mfxsr : Flag<["-"], "mfxsr">, Group<m_x86_Features_Group>;
	def mno_fxsr : Flag<["-"], "mno-fxsr">, Group<m_x86_Features_Group>;
	def minvpcid : Flag<["-"], "minvpcid">, Group<m_x86_Features_Group>;
	def mno_invpcid : Flag<["-"], "mno-invpcid">, Group<m_x86_Features_Group>;
	def mgfni : Flag<["-"], "mgfni">, Group<m_x86_Features_Group>;
	def mno_gfni : Flag<["-"], "mno-gfni">, Group<m_x86_Features_Group>;
	def mlwp : Flag<["-"], "mlwp">, Group<m_x86_Features_Group>;
	def mno_lwp : Flag<["-"], "mno-lwp">, Group<m_x86_Features_Group>;
	def mlzcnt : Flag<["-"], "mlzcnt">, Group<m_x86_Features_Group>;
	def mno_lzcnt : Flag<["-"], "mno-lzcnt">, Group<m_x86_Features_Group>;
	def mmovbe : Flag<["-"], "mmovbe">, Group<m_x86_Features_Group>;
	def mno_movbe : Flag<["-"], "mno-movbe">, Group<m_x86_Features_Group>;
	def mmovdiri : Flag<["-"], "mmovdiri">, Group<m_x86_Features_Group>;
	def mno_movdiri : Flag<["-"], "mno-movdiri">, Group<m_x86_Features_Group>;
	def mmovdir64b : Flag<["-"], "mmovdir64b">, Group<m_x86_Features_Group>;
	def mno_movdir64b : Flag<["-"], "mno-movdir64b">, Group<m_x86_Features_Group>;
	def mmwaitx : Flag<["-"], "mmwaitx">, Group<m_x86_Features_Group>;
	def mno_mwaitx : Flag<["-"], "mno-mwaitx">, Group<m_x86_Features_Group>;
	def mpku : Flag<["-"], "mpku">, Group<m_x86_Features_Group>;
	def mno_pku : Flag<["-"], "mno-pku">, Group<m_x86_Features_Group>;
	// Further x86 feature toggles (pclmul … vzeroupper). Every feature is an
	// -m<feature> / -mno-<feature> pair of plain flags in m_x86_Features_Group;
	// none of them is an alias.
	def mpclmul : Flag<["-"], "mpclmul">, Group<m_x86_Features_Group>;
	def mno_pclmul : Flag<["-"], "mno-pclmul">, Group<m_x86_Features_Group>;
	def mpconfig : Flag<["-"], "mpconfig">, Group<m_x86_Features_Group>;
	def mno_pconfig : Flag<["-"], "mno-pconfig">, Group<m_x86_Features_Group>;
	def mpopcnt : Flag<["-"], "mpopcnt">, Group<m_x86_Features_Group>;
	def mno_popcnt : Flag<["-"], "mno-popcnt">, Group<m_x86_Features_Group>;
	def mprefetchwt1 : Flag<["-"], "mprefetchwt1">, Group<m_x86_Features_Group>;
	def mno_prefetchwt1 : Flag<["-"], "mno-prefetchwt1">, Group<m_x86_Features_Group>;
	def mprfchw : Flag<["-"], "mprfchw">, Group<m_x86_Features_Group>;
	def mno_prfchw : Flag<["-"], "mno-prfchw">, Group<m_x86_Features_Group>;
	def mptwrite : Flag<["-"], "mptwrite">, Group<m_x86_Features_Group>;
	def mno_ptwrite : Flag<["-"], "mno-ptwrite">, Group<m_x86_Features_Group>;
	def mrdpid : Flag<["-"], "mrdpid">, Group<m_x86_Features_Group>;
	def mno_rdpid : Flag<["-"], "mno-rdpid">, Group<m_x86_Features_Group>;
	def mrdrnd : Flag<["-"], "mrdrnd">, Group<m_x86_Features_Group>;
	def mno_rdrnd : Flag<["-"], "mno-rdrnd">, Group<m_x86_Features_Group>;
	def mrtm : Flag<["-"], "mrtm">, Group<m_x86_Features_Group>;
	def mno_rtm : Flag<["-"], "mno-rtm">, Group<m_x86_Features_Group>;
	def mrdseed : Flag<["-"], "mrdseed">, Group<m_x86_Features_Group>;
	def mno_rdseed : Flag<["-"], "mno-rdseed">, Group<m_x86_Features_Group>;
	def msahf : Flag<["-"], "msahf">, Group<m_x86_Features_Group>;
	def mno_sahf : Flag<["-"], "mno-sahf">, Group<m_x86_Features_Group>;
	def mserialize : Flag<["-"], "mserialize">, Group<m_x86_Features_Group>;
	def mno_serialize : Flag<["-"], "mno-serialize">, Group<m_x86_Features_Group>;
	def msgx : Flag<["-"], "msgx">, Group<m_x86_Features_Group>;
	def mno_sgx : Flag<["-"], "mno-sgx">, Group<m_x86_Features_Group>;
	def msha : Flag<["-"], "msha">, Group<m_x86_Features_Group>;
	def mno_sha : Flag<["-"], "mno-sha">, Group<m_x86_Features_Group>;
	def mtbm : Flag<["-"], "mtbm">, Group<m_x86_Features_Group>;
	def mno_tbm : Flag<["-"], "mno-tbm">, Group<m_x86_Features_Group>;
	def mtsxldtrk : Flag<["-"], "mtsxldtrk">, Group<m_x86_Features_Group>;
	def mno_tsxldtrk : Flag<["-"], "mno-tsxldtrk">, Group<m_x86_Features_Group>;
	def mvaes : Flag<["-"], "mvaes">, Group<m_x86_Features_Group>;
	def mno_vaes : Flag<["-"], "mno-vaes">, Group<m_x86_Features_Group>;
	def mvpclmulqdq : Flag<["-"], "mvpclmulqdq">, Group<m_x86_Features_Group>;
	def mno_vpclmulqdq : Flag<["-"], "mno-vpclmulqdq">, Group<m_x86_Features_Group>;
	def mwaitpkg : Flag<["-"], "mwaitpkg">, Group<m_x86_Features_Group>;
	def mno_waitpkg : Flag<["-"], "mno-waitpkg">, Group<m_x86_Features_Group>;
	def mxop : Flag<["-"], "mxop">, Group<m_x86_Features_Group>;
	def mno_xop : Flag<["-"], "mno-xop">, Group<m_x86_Features_Group>;
	def mxsave : Flag<["-"], "mxsave">, Group<m_x86_Features_Group>;
	def mno_xsave : Flag<["-"], "mno-xsave">, Group<m_x86_Features_Group>;
	def mxsavec : Flag<["-"], "mxsavec">, Group<m_x86_Features_Group>;
	def mno_xsavec : Flag<["-"], "mno-xsavec">, Group<m_x86_Features_Group>;
	def mxsaveopt : Flag<["-"], "mxsaveopt">, Group<m_x86_Features_Group>;
	def mno_xsaveopt : Flag<["-"], "mno-xsaveopt">, Group<m_x86_Features_Group>;
	def mxsaves : Flag<["-"], "mxsaves">, Group<m_x86_Features_Group>;
	def mno_xsaves : Flag<["-"], "mno-xsaves">, Group<m_x86_Features_Group>;
	def mshstk : Flag<["-"], "mshstk">, Group<m_x86_Features_Group>;
	def mno_shstk : Flag<["-"], "mno-shstk">, Group<m_x86_Features_Group>;
	def mretpoline_external_thunk : Flag<["-"], "mretpoline-external-thunk">, Group<m_x86_Features_Group>;
	def mno_retpoline_external_thunk : Flag<["-"], "mno-retpoline-external-thunk">, Group<m_x86_Features_Group>;
	def mvzeroupper : Flag<["-"], "mvzeroupper">, Group<m_x86_Features_Group>;
	def mno_vzeroupper : Flag<["-"], "mno-vzeroupper">, Group<m_x86_Features_Group>;

	// Legacy user-facing driver spellings: single dash, value passed as a separate
	// argument. Each is purely an alias for the option that uses the more common
	// Unix / GNU style of double-dash and equals-joined flags.
	def gcc_toolchain_legacy_spelling
	  : Separate<["-"], "gcc-toolchain">,
	    Alias<gcc_toolchain>;
	def target_legacy_spelling
	  : Separate<["-"], "target">,
	    Alias<target>;

	// Internal-only `-Z…` options. All are flagged Unsupported and
	// NoArgumentUnused.

	// Special internal option to handle -Xlinker --no-demangle.
	def Z_Xlinker__no_demangle
	  : Flag<["-"], "Z-Xlinker-no-demangle">,
	    Flags<[Unsupported, NoArgumentUnused]>;

	// Special internal option to allow forwarding arbitrary arguments to linker.
	def Zlinker_input
	  : Separate<["-"], "Zlinker-input">,
	    Flags<[Unsupported, NoArgumentUnused]>;

	// Reserved library options; both are additionally flagged LinkerInput and
	// belong to reserved_lib_Group.
	def Z_reserved_lib_stdcxx
	  : Flag<["-"], "Z-reserved-lib-stdc++">,
	    Flags<[LinkerInput, NoArgumentUnused, Unsupported]>,
	    Group<reserved_lib_Group>;
	def Z_reserved_lib_cckext
	  : Flag<["-"], "Z-reserved-lib-cckext">,
	    Flags<[LinkerInput, NoArgumentUnused, Unsupported]>,
	    Group<reserved_lib_Group>;

	// Ignored options
	// BooleanFFlag<name>: declares the flag pair -f<name> / -fno-<name>.
	//
	// The two records are named f<NAME> and fno_<NAME>, where NAME is the
	// identifier written at the `defm` site. Neither record gets a group here;
	// users attach one at the `defm` site (e.g. `defm x : BooleanFFlag<"x">,
	// Group<...>`).
	multiclass BooleanFFlag<string name> {
	  // Positive spelling: -f<name>.
	  def f#NAME : Flag<["-"], !strconcat("f", name)>;
	  // Negative spelling: -fno-<name>.
	  def fno_#NAME : Flag<["-"], !strconcat("fno-", name)>;
	}

	// Anonymous defm: the -fkeep-inline-functions / -fno-keep-inline-functions
	// pair is created without a user-chosen record name.
	defm : BooleanFFlag<"keep-inline-functions">, Group<clang_ignored_gcc_optimization_f_Group>;

	// NOTE(review): fprofile_dir and fuse_ld_EQ sit under the "Ignored options"
	// heading but are placed in f_Group, not in a clang_ignored_* group.
	def fprofile_dir : Joined<["-"], "fprofile-dir=">, Group<f_Group>;

	def fuse_ld_EQ : Joined<["-"], "fuse-ld=">, Group<f_Group>, Flags<[CoreOption]>;

	// -falign-{labels,loops,jumps}: each has a boolean -f/-fno- pair plus a
	// joined `=<value>` form, all in clang_ignored_gcc_optimization_f_Group.
	defm align_labels : BooleanFFlag<"align-labels">, Group<clang_ignored_gcc_optimization_f_Group>;
	def falign_labels_EQ : Joined<["-"], "falign-labels=">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm align_loops : BooleanFFlag<"align-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
	def falign_loops_EQ : Joined<["-"], "falign-loops=">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm align_jumps : BooleanFFlag<"align-jumps">, Group<clang_ignored_gcc_optimization_f_Group>;
	def falign_jumps_EQ : Joined<["-"], "falign-jumps=">, Group<clang_ignored_gcc_optimization_f_Group>;

	// FIXME: This option should be supported and wired up to our diagnostics, but
	// ignore it for now to avoid breaking builds that use it.
	def fdiagnostics_show_location_EQ : Joined<["-"], "fdiagnostics-show-location=">, Group<clang_ignored_f_Group>;

	// -f<name> / -fno-<name> pairs generated through BooleanFFlag and placed in
	// either clang_ignored_f_Group or clang_ignored_gcc_optimization_f_Group.
	//
	// NOTE(review): the defm name `fcheck_new` already starts with 'f', and
	// BooleanFFlag prepends `f` / `fno_`, so the resulting records are
	// `ffcheck_new` and `fno_fcheck_new`. The spellings (-fcheck-new /
	// -fno-check-new) are unaffected. Confirm whether the doubled prefix is
	// intentional before renaming.
	defm fcheck_new : BooleanFFlag<"check-new">, Group<clang_ignored_f_Group>;
	defm caller_saves : BooleanFFlag<"caller-saves">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm reorder_blocks : BooleanFFlag<"reorder-blocks">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm eliminate_unused_debug_types : BooleanFFlag<"eliminate-unused-debug-types">, Group<clang_ignored_f_Group>;
	defm branch_count_reg : BooleanFFlag<"branch-count-reg">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm default_inline : BooleanFFlag<"default-inline">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm fat_lto_objects : BooleanFFlag<"fat-lto-objects">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm float_store : BooleanFFlag<"float-store">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm friend_injection : BooleanFFlag<"friend-injection">, Group<clang_ignored_f_Group>;
	defm function_attribute_list : BooleanFFlag<"function-attribute-list">, Group<clang_ignored_f_Group>;
	defm gcse : BooleanFFlag<"gcse">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm gcse_after_reload: BooleanFFlag<"gcse-after-reload">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm gcse_las: BooleanFFlag<"gcse-las">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm gcse_sm: BooleanFFlag<"gcse-sm">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm gnu : BooleanFFlag<"gnu">, Group<clang_ignored_f_Group>;
	defm implicit_templates : BooleanFFlag<"implicit-templates">, Group<clang_ignored_f_Group>;
	defm implement_inlines : BooleanFFlag<"implement-inlines">, Group<clang_ignored_f_Group>;
	defm merge_constants : BooleanFFlag<"merge-constants">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm modulo_sched : BooleanFFlag<"modulo-sched">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm modulo_sched_allow_regmoves : BooleanFFlag<"modulo-sched-allow-regmoves">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm inline_functions_called_once : BooleanFFlag<"inline-functions-called-once">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	// -finline-limit has a joined `=<value>` form and a boolean pair.
	// NOTE(review): as with fcheck_new above, the defm name carries its own 'f',
	// so the pair's records are `ffinline_limit` / `fno_finline_limit`.
	def finline_limit_EQ : Joined<["-"], "finline-limit=">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm finline_limit : BooleanFFlag<"inline-limit">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm inline_small_functions : BooleanFFlag<"inline-small-functions">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm ipa_cp : BooleanFFlag<"ipa-cp">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm ivopts : BooleanFFlag<"ivopts">, Group<clang_ignored_gcc_optimization_f_Group>;
	// NOTE(review): unlike the surrounding entries, this pair is written as
	// explicit `def`s in f_Group with CC1Option rather than in a clang_ignored_*
	// group, even though it sits inside the "Ignored options" list.
	def fsemantic_interposition : Flag<["-"], "fsemantic-interposition">, Group<f_Group>, Flags<[CC1Option]>;
	def fno_semantic_interposition: Flag<["-"], "fno-semantic-interposition">, Group<f_Group>, Flags<[CC1Option]>;
	// More -f<name> / -fno-<name> pairs generated through BooleanFFlag. Each is
	// placed in clang_ignored_f_Group or clang_ignored_gcc_optimization_f_Group;
	// when a defm is split over two lines, the Group<> is on the continuation
	// line.
	defm non_call_exceptions : BooleanFFlag<"non-call-exceptions">, Group<clang_ignored_f_Group>;
	defm peel_loops : BooleanFFlag<"peel-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm permissive : BooleanFFlag<"permissive">, Group<clang_ignored_f_Group>;
	defm prefetch_loop_arrays : BooleanFFlag<"prefetch-loop-arrays">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm printf : BooleanFFlag<"printf">, Group<clang_ignored_f_Group>;
	defm profile : BooleanFFlag<"profile">, Group<clang_ignored_f_Group>;
	defm profile_correction : BooleanFFlag<"profile-correction">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm profile_generate_sampling : BooleanFFlag<"profile-generate-sampling">, Group<clang_ignored_f_Group>;
	defm profile_reusedist : BooleanFFlag<"profile-reusedist">, Group<clang_ignored_f_Group>;
	defm profile_values : BooleanFFlag<"profile-values">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm regs_graph : BooleanFFlag<"regs-graph">, Group<clang_ignored_f_Group>;
	defm rename_registers : BooleanFFlag<"rename-registers">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm ripa : BooleanFFlag<"ripa">, Group<clang_ignored_f_Group>;
	defm schedule_insns : BooleanFFlag<"schedule-insns">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm schedule_insns2 : BooleanFFlag<"schedule-insns2">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm see : BooleanFFlag<"see">, Group<clang_ignored_f_Group>;
	defm signaling_nans : BooleanFFlag<"signaling-nans">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm single_precision_constant : BooleanFFlag<"single-precision-constant">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm spec_constr_count : BooleanFFlag<"spec-constr-count">, Group<clang_ignored_f_Group>;
	defm stack_check : BooleanFFlag<"stack-check">, Group<clang_ignored_f_Group>;
	defm strength_reduce :
	BooleanFFlag<"strength-reduce">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tls_model : BooleanFFlag<"tls-model">, Group<clang_ignored_f_Group>;
	defm tracer : BooleanFFlag<"tracer">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tree_dce : BooleanFFlag<"tree-dce">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tree_salias : BooleanFFlag<"tree-salias">, Group<clang_ignored_f_Group>;
	defm tree_ter : BooleanFFlag<"tree-ter">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tree_vectorizer_verbose : BooleanFFlag<"tree-vectorizer-verbose">, Group<clang_ignored_f_Group>;
	defm tree_vrp : BooleanFFlag<"tree-vrp">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm unroll_all_loops : BooleanFFlag<"unroll-all-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm unsafe_loop_optimizations : BooleanFFlag<"unsafe-loop-optimizations">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm unswitch_loops : BooleanFFlag<"unswitch-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm use_linker_plugin : BooleanFFlag<"use-linker-plugin">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm vect_cost_model : BooleanFFlag<"vect-cost-model">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm variable_expansion_in_unroller : BooleanFFlag<"variable-expansion-in-unroller">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm web : BooleanFFlag<"web">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm whole_program : BooleanFFlag<"whole-program">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm devirtualize : BooleanFFlag<"devirtualize">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm devirtualize_speculatively : BooleanFFlag<"devirtualize-speculatively">,
	Group<clang_ignored_gcc_optimization_f_Group>;

	// Generic gfortran options.
	// All records here belong to gfortran_Group.
	// A_DASH matches any argument beginning with "-A-" (Joined, so the rest of
	// the argument is its value).
	def A_DASH : Joined<["-"], "A-">, Group<gfortran_Group>;
	// -J takes its value either joined or as a separate argument and carries
	// the RenderJoined flag.
	def J : JoinedOrSeparate<["-"], "J">, Flags<[RenderJoined]>, Group<gfortran_Group>;
	def cpp : Flag<["-"], "cpp">, Group<gfortran_Group>;
	def nocpp : Flag<["-"], "nocpp">, Group<gfortran_Group>;
	def static_libgfortran : Flag<["-"], "static-libgfortran">, Group<gfortran_Group>;

	// "f" options with values for gfortran.
	// Joined options in gfortran_Group: the value follows the spelling directly.
	// Records ending in _EQ use a trailing '='; the two *_VALUE records
	// (-ffixed-line-length-<n>, -ffree-line-length-<n>) use a trailing '-'
	// instead.
	def fblas_matmul_limit_EQ : Joined<["-"], "fblas-matmul-limit=">, Group<gfortran_Group>;
	def fcheck_EQ : Joined<["-"], "fcheck=">, Group<gfortran_Group>;
	def fcoarray_EQ : Joined<["-"], "fcoarray=">, Group<gfortran_Group>;
	def fconvert_EQ : Joined<["-"], "fconvert=">, Group<gfortran_Group>;
	def ffixed_line_length_VALUE : Joined<["-"], "ffixed-line-length-">, Group<gfortran_Group>;
	def ffpe_trap_EQ : Joined<["-"], "ffpe-trap=">, Group<gfortran_Group>;
	def ffree_line_length_VALUE : Joined<["-"], "ffree-line-length-">, Group<gfortran_Group>;
	def finit_character_EQ : Joined<["-"], "finit-character=">, Group<gfortran_Group>;
	def finit_integer_EQ : Joined<["-"], "finit-integer=">, Group<gfortran_Group>;
	def finit_logical_EQ : Joined<["-"], "finit-logical=">, Group<gfortran_Group>;
	def finit_real_EQ : Joined<["-"], "finit-real=">, Group<gfortran_Group>;
	def fmax_array_constructor_EQ : Joined<["-"], "fmax-array-constructor=">, Group<gfortran_Group>;
	def fmax_errors_EQ : Joined<["-"], "fmax-errors=">, Group<gfortran_Group>;
	def fmax_stack_var_size_EQ : Joined<["-"], "fmax-stack-var-size=">, Group<gfortran_Group>;
	def fmax_subrecord_length_EQ : Joined<["-"], "fmax-subrecord-length=">, Group<gfortran_Group>;
	def frecord_marker_EQ : Joined<["-"], "frecord-marker=">, Group<gfortran_Group>;

	// "f" flags for gfortran.
	// Every entry instantiates the BooleanFFlag multiclass (defined elsewhere in
	// this file) and places the result in gfortran_Group. Presumably each defm
	// yields the -f<name> / -fno-<name> pair; confirm against the multiclass.
	// The list is kept in alphabetical order of the option spelling.
	defm aggressive_function_elimination : BooleanFFlag<"aggressive-function-elimination">, Group<gfortran_Group>;
	defm align_commons : BooleanFFlag<"align-commons">, Group<gfortran_Group>;
	defm all_intrinsics : BooleanFFlag<"all-intrinsics">, Group<gfortran_Group>;
	defm automatic : BooleanFFlag<"automatic">, Group<gfortran_Group>;
	defm backslash : BooleanFFlag<"backslash">, Group<gfortran_Group>;
	defm backtrace : BooleanFFlag<"backtrace">, Group<gfortran_Group>;
	defm bounds_check : BooleanFFlag<"bounds-check">, Group<gfortran_Group>;
	defm check_array_temporaries : BooleanFFlag<"check-array-temporaries">, Group<gfortran_Group>;
	defm cray_pointer : BooleanFFlag<"cray-pointer">, Group<gfortran_Group>;
	defm d_lines_as_code : BooleanFFlag<"d-lines-as-code">, Group<gfortran_Group>;
	defm d_lines_as_comments : BooleanFFlag<"d-lines-as-comments">, Group<gfortran_Group>;
	defm default_double_8 : BooleanFFlag<"default-double-8">, Group<gfortran_Group>;
	defm default_integer_8 : BooleanFFlag<"default-integer-8">, Group<gfortran_Group>;
	defm default_real_8 : BooleanFFlag<"default-real-8">, Group<gfortran_Group>;
	defm dollar_ok : BooleanFFlag<"dollar-ok">, Group<gfortran_Group>;
	defm dump_fortran_optimized : BooleanFFlag<"dump-fortran-optimized">, Group<gfortran_Group>;
	defm dump_fortran_original : BooleanFFlag<"dump-fortran-original">, Group<gfortran_Group>;
	defm dump_parse_tree : BooleanFFlag<"dump-parse-tree">, Group<gfortran_Group>;
	defm external_blas : BooleanFFlag<"external-blas">, Group<gfortran_Group>;
	defm f2c : BooleanFFlag<"f2c">, Group<gfortran_Group>;
	defm fixed_form : BooleanFFlag<"fixed-form">, Group<gfortran_Group>;
	defm free_form : BooleanFFlag<"free-form">, Group<gfortran_Group>;
	defm frontend_optimize : BooleanFFlag<"frontend-optimize">, Group<gfortran_Group>;
	defm implicit_none : BooleanFFlag<"implicit-none">, Group<gfortran_Group>;
	defm init_local_zero : BooleanFFlag<"init-local-zero">, Group<gfortran_Group>;
	defm integer_4_integer_8 : BooleanFFlag<"integer-4-integer-8">, Group<gfortran_Group>;
	// NOTE(review): gfortran documents -fintrinsic-modules-path and
	// -fmax-identifier-length as taking a value, yet both are declared as plain
	// boolean flags here. Confirm this is intentional.
	defm intrinsic_modules_path : BooleanFFlag<"intrinsic-modules-path">, Group<gfortran_Group>;
	defm max_identifier_length : BooleanFFlag<"max-identifier-length">, Group<gfortran_Group>;
	defm module_private : BooleanFFlag<"module-private">, Group<gfortran_Group>;
	defm pack_derived : BooleanFFlag<"pack-derived">, Group<gfortran_Group>;
	defm protect_parens : BooleanFFlag<"protect-parens">, Group<gfortran_Group>;
	defm range_check : BooleanFFlag<"range-check">, Group<gfortran_Group>;
	defm real_4_real_10 : BooleanFFlag<"real-4-real-10">, Group<gfortran_Group>;
	defm real_4_real_16 : BooleanFFlag<"real-4-real-16">, Group<gfortran_Group>;
	defm real_4_real_8 : BooleanFFlag<"real-4-real-8">, Group<gfortran_Group>;
	defm real_8_real_10 : BooleanFFlag<"real-8-real-10">, Group<gfortran_Group>;
	defm real_8_real_16 : BooleanFFlag<"real-8-real-16">, Group<gfortran_Group>;
	defm real_8_real_4 : BooleanFFlag<"real-8-real-4">, Group<gfortran_Group>;
	defm realloc_lhs : BooleanFFlag<"realloc-lhs">, Group<gfortran_Group>;
	defm recursive : BooleanFFlag<"recursive">, Group<gfortran_Group>;
	defm repack_arrays : BooleanFFlag<"repack-arrays">, Group<gfortran_Group>;
	defm second_underscore : BooleanFFlag<"second-underscore">, Group<gfortran_Group>;
	defm sign_zero : BooleanFFlag<"sign-zero">, Group<gfortran_Group>;
	defm stack_arrays : BooleanFFlag<"stack-arrays">, Group<gfortran_Group>;
	defm underscoring : BooleanFFlag<"underscoring">, Group<gfortran_Group>;
	defm whole_file : BooleanFFlag<"whole-file">, Group<gfortran_Group>;

	// C++ SYCL options
	// -fsycl / -fno-sycl: the enable/disable pair for SYCL device compilation.
	// Both are in sycl_Group and marked CoreOption; only the positive form is
	// additionally a CC1Option.
	def fsycl : Flag<["-"], "fsycl">, Group<sycl_Group>, Flags<[CC1Option, CoreOption]>,
	HelpText<"Enable SYCL kernels compilation for device">;
	def fno_sycl : Flag<["-"], "fno-sycl">, Group<sycl_Group>, Flags<[CoreOption]>,
	HelpText<"Disable SYCL kernels compilation for device">;
	// -sycl-std=<value>: selects the SYCL language standard to compile for.
	// Values<> is a comma-separated list; entries are written without
	// surrounding whitespace so that no listed value begins with a space.
	def sycl_std_EQ : Joined<["-"], "sycl-std=">, Group<sycl_Group>, Flags<[CC1Option, NoArgumentUnused, CoreOption]>,
	HelpText<"SYCL language standard to compile for.">, Values<"2017,121,1.2.1,sycl-1.2.1">;

	//===----------------------------------------------------------------------===//
	// CC1 Options
	//===----------------------------------------------------------------------===//

	let Flags = [CC1Option, NoDriverOption] in {

	//===----------------------------------------------------------------------===//
	// Target Options
	//===----------------------------------------------------------------------===//

	// Target options declared with Flags = [CC1Option, CC1AsOption,
	// NoDriverOption]; this inner `let` overrides the Flags value for the
	// definitions inside its braces.
	let Flags = [CC1Option, CC1AsOption, NoDriverOption] in {

	def target_cpu : Separate<["-"], "target-cpu">,
	HelpText<"Target a specific cpu type">;
	def target_feature : Separate<["-"], "target-feature">,
	HelpText<"Target specific attributes">;
	// -triple is the only option in this scope with marshalling info: it is
	// bound to TargetOpts->Triple (type std::string), its default expression is
	// llvm::sys::getDefaultTargetTriple() passed through llvm::Triple::normalize,
	// it is marked AlwaysEmit, and it uses the normalizeTriple normalizer.
	def triple : Separate<["-"], "triple">,
	HelpText<"Specify target triple (e.g. i686-apple-darwin9)">,
	MarshallingInfoString<"TargetOpts->Triple", "llvm::Triple::normalize(llvm::sys::getDefaultTargetTriple())", "std::string">,
	AlwaysEmit, Normalizer<"normalizeTriple">, DenormalizeString;
	def target_abi : Separate<["-"], "target-abi">,
	HelpText<"Target a particular ABI type">;
	def target_sdk_version_EQ : Joined<["-"], "target-sdk-version=">,
	HelpText<"The version of target SDK used for compilation">;

	}

	// Remaining target options, declared outside the CC1AsOption scope above.
	def target_linker_version : Separate<["-"], "target-linker-version">,
	HelpText<"Target linker version">;
	// "-triple=<value>" is the joined spelling of -triple; as an Alias it has no
	// help text of its own.
	def triple_EQ : Joined<["-"], "triple=">, Alias<triple>;
	def mfpmath : Separate<["-"], "mfpmath">,
	HelpText<"Which unit to use for fp math">;

	// Positive/negative pair; only the positive form carries help text.
	def fpadding_on_unsigned_fixed_point : Flag<["-"], "fpadding-on-unsigned-fixed-point">,
	HelpText<"Force each unsigned fixed point type to have an extra bit of padding to align their scales with those of signed fixed point types">;
	def fno_padding_on_unsigned_fixed_point : Flag<["-"], "fno-padding-on-unsigned-fixed-point">;

	//===----------------------------------------------------------------------===//
	// Analyzer Options
	//===----------------------------------------------------------------------===//

	// Argument-less flags that affect CFG construction for all analyses. Note the
	// record names use an "analysis_" prefix while the spelled options do not.
	def analysis_UnoptimizedCFG : Flag<["-"], "unoptimized-cfg">,
	HelpText<"Generate unoptimized CFGs for all analyses">;
	def analysis_CFGAddImplicitDtors : Flag<["-"], "cfg-add-implicit-dtors">,
	HelpText<"Add C++ implicit destructors to CFGs for all analyses">;

	// Each analyzer setting below is declared twice: a Separate form
	// ("-analyzer-store <value>") that carries the HelpText, and a Joined "="
	// form ("-analyzer-store=<value>") that is an Alias of the Separate record.
	def analyzer_store : Separate<["-"], "analyzer-store">,
	HelpText<"Source Code Analysis - Abstract Memory Store Models">;
	def analyzer_store_EQ : Joined<["-"], "analyzer-store=">, Alias<analyzer_store>;

	def analyzer_constraints : Separate<["-"], "analyzer-constraints">,
	HelpText<"Source Code Analysis - Symbolic Constraint Engines">;
	def analyzer_constraints_EQ : Joined<["-"], "analyzer-constraints=">,
	Alias<analyzer_constraints>;

	def analyzer_output : Separate<["-"], "analyzer-output">,
	HelpText<"Source Code Analysis - Output Options">;
	def analyzer_output_EQ : Joined<["-"], "analyzer-output=">,
	Alias<analyzer_output>;

	def analyzer_purge : Separate<["-"], "analyzer-purge">,
	HelpText<"Source Code Analysis - Dead Symbol Removal Frequency">;
	def analyzer_purge_EQ : Joined<["-"], "analyzer-purge=">, Alias<analyzer_purge>;

	// Flags that widen or narrow what the analyzer looks at, plus progress output.
	def analyzer_opt_analyze_headers : Flag<["-"], "analyzer-opt-analyze-headers">,
	HelpText<"Force the static analyzer to analyze functions defined in header files">;
	def analyzer_opt_analyze_nested_blocks : Flag<["-"], "analyzer-opt-analyze-nested-blocks">,
	HelpText<"Analyze the definitions of blocks in addition to functions">;
	def analyzer_display_progress : Flag<["-"], "analyzer-display-progress">,
	HelpText<"Emit verbose output about the analyzer's progress">;
	// Separate form plus a Joined "=" alias.
	def analyze_function : Separate<["-"], "analyze-function">,
	HelpText<"Run analysis on specific function (for C++ include parameters in name)">;
	def analyze_function_EQ : Joined<["-"], "analyze-function=">, Alias<analyze_function>;
	// Exploded-graph (egraph) trimming, display and dumping.
	def trim_egraph : Flag<["-"], "trim-egraph">,
	HelpText<"Only show error-related paths in the analysis graph">;
	def analyzer_viz_egraph_graphviz : Flag<["-"], "analyzer-viz-egraph-graphviz">,
	HelpText<"Display exploded graph using GraphViz">;
	def analyzer_dump_egraph : Separate<["-"], "analyzer-dump-egraph">,
	HelpText<"Dump exploded graph to the specified file">;
	def analyzer_dump_egraph_EQ : Joined<["-"], "analyzer-dump-egraph=">, Alias<analyzer_dump_egraph>;

	// Inlining controls; both have a Separate form and a Joined "=" alias.
	def analyzer_inline_max_stack_depth : Separate<["-"], "analyzer-inline-max-stack-depth">,
	HelpText<"Bound on stack depth while inlining (4 by default)">;
	def analyzer_inline_max_stack_depth_EQ : Joined<["-"], "analyzer-inline-max-stack-depth=">,
	Alias<analyzer_inline_max_stack_depth>;

	def analyzer_inlining_mode : Separate<["-"], "analyzer-inlining-mode">,
	HelpText<"Specify the function selection heuristic used during inlining">;
	def analyzer_inlining_mode_EQ : Joined<["-"], "analyzer-inlining-mode=">, Alias<analyzer_inlining_mode>;

	def analyzer_disable_retry_exhausted : Flag<["-"], "analyzer-disable-retry-exhausted">,
	HelpText<"Do not re-analyze paths leading to exhausted nodes with a different strategy (may decrease code coverage)">;

	// Unlike the Separate options above, -analyzer-max-loop has no "=" alias
	// next to it.
	def analyzer_max_loop : Separate<["-"], "analyzer-max-loop">,
	HelpText<"The maximum number of times the analyzer will go through a loop">;
	def analyzer_stats : Flag<["-"], "analyzer-stats">,
	HelpText<"Print internal analyzer statistics.">;

	// -analyzer-checker <name> (and its "=" alias below). Instead of a literal
	// Values<> list, ValuesCode supplies a C++ snippet: it defines
	// `const char *Values` by including Checkers.inc twice, once with CHECKER
	// and once with PACKAGE expanding to `FULLNAME ","`, so the result is one
	// string of every checker and package full name, each followed by a comma.
	// The snippet is emitted verbatim, so nothing may be added inside [{ ... }].
	def analyzer_checker : Separate<["-"], "analyzer-checker">,
	HelpText<"Choose analyzer checkers to enable">,
	ValuesCode<[{
	const char *Values =
	#define GET_CHECKERS
	#define CHECKER(FULLNAME, CLASS, HT, DOC_URI, IS_HIDDEN) FULLNAME ","
	#include "clang/StaticAnalyzer/Checkers/Checkers.inc"
	#undef GET_CHECKERS
	#define GET_PACKAGES
	#define PACKAGE(FULLNAME) FULLNAME ","
	#include "clang/StaticAnalyzer/Checkers/Checkers.inc"
	#undef GET_PACKAGES
	;
	}]>;
	def analyzer_checker_EQ : Joined<["-"], "analyzer-checker=">,
	Alias<analyzer_checker>;

	// Disabling checkers: Separate form plus a Joined "=" alias, and a flag that
	// disables all checks at once.
	def analyzer_disable_checker : Separate<["-"], "analyzer-disable-checker">,
	HelpText<"Choose analyzer checkers to disable">;
	def analyzer_disable_checker_EQ : Joined<["-"], "analyzer-disable-checker=">,
	Alias<analyzer_disable_checker>;

	def analyzer_disable_all_checks : Flag<["-"], "analyzer-disable-all-checks">,
	HelpText<"Disable all static analyzer checks">;

	// Help/listing flags. The checker and checker-option help each come in
	// three variants: regular, "-alpha" (in development) and "-developer".
	def analyzer_checker_help : Flag<["-"], "analyzer-checker-help">,
	HelpText<"Display the list of analyzer checkers that are available">;

	def analyzer_checker_help_alpha : Flag<["-"], "analyzer-checker-help-alpha">,
	HelpText<"Display the list of in development analyzer checkers. These "
	"are NOT considered safe, they are unstable and will emit incorrect "
	"reports. Enable ONLY FOR DEVELOPMENT purposes">;

	def analyzer_checker_help_developer : Flag<["-"], "analyzer-checker-help-developer">,
	HelpText<"Display the list of developer-only checkers such as modeling "
	"and debug checkers">;

	def analyzer_config_help : Flag<["-"], "analyzer-config-help">,
	HelpText<"Display the list of -analyzer-config options. These are meant for "
	"development purposes only!">;

	def analyzer_list_enabled_checkers : Flag<["-"], "analyzer-list-enabled-checkers">,
	HelpText<"Display the list of enabled analyzer checkers">;

	// -analyzer-config takes a separate value and has no "=" alias here.
	def analyzer_config : Separate<["-"], "analyzer-config">,
	HelpText<"Choose analyzer options to enable">;

	def analyzer_checker_option_help : Flag<["-"], "analyzer-checker-option-help">,
	HelpText<"Display the list of checker and package options">;

	def analyzer_checker_option_help_alpha : Flag<["-"], "analyzer-checker-option-help-alpha">,
	HelpText<"Display the list of in development checker and package options. "
	"These are NOT considered safe, they are unstable and will emit "
	"incorrect reports. Enable ONLY FOR DEVELOPMENT purposes">;

	def analyzer_checker_option_help_developer : Flag<["-"], "analyzer-checker-option-help-developer">,
	HelpText<"Display the list of checker and package options meant for "
	"development purposes only">;

	// Declared as a value-taking option (Separate plus "=" alias), not a Flag.
	def analyzer_config_compatibility_mode : Separate<["-"], "analyzer-config-compatibility-mode">,
	HelpText<"Don't emit errors on invalid analyzer-config inputs">;

	def analyzer_config_compatibility_mode_EQ : Joined<["-"], "analyzer-config-compatibility-mode=">,
	Alias<analyzer_config_compatibility_mode>;

	def analyzer_werror : Flag<["-"], "analyzer-werror">,
	HelpText<"Emit analyzer results as errors rather than warnings">;

	//===----------------------------------------------------------------------===//
	// Migrator Options
	//===----------------------------------------------------------------------===//
	// Two argument-less migrator flags. The record names carry a "migrator_"
	// prefix that is not part of the spelled option.
	def migrator_no_nsalloc_error : Flag<["-"], "no-ns-alloc-error">,
	HelpText<"Do not error on use of NSAllocateCollectable/NSReallocateCollectable">;

	def migrator_no_finalize_removal : Flag<["-"], "no-finalize-removal">,
	HelpText<"Do not remove finalize method in gc mode">;

	//===----------------------------------------------------------------------===//
	// CodeGen Options
	//===----------------------------------------------------------------------===//

	// CodeGen options declared with Flags = [CC1Option, CC1AsOption,
	// NoDriverOption]. Several records here (debug_info_kind_EQ,
	// dwarf_version_EQ, debugger_tuning_EQ) have no HelpText.
	let Flags = [CC1Option, CC1AsOption, NoDriverOption] in {
	def debug_info_kind_EQ : Joined<["-"], "debug-info-kind=">;
	def debug_info_macro : Flag<["-"], "debug-info-macro">,
	HelpText<"Emit macro debug information">;
	def default_function_attr : Separate<["-"], "default-function-attr">,
	HelpText<"Apply given attribute to all functions">;
	def dwarf_version_EQ : Joined<["-"], "dwarf-version=">;
	def debugger_tuning_EQ : Joined<["-"], "debugger-tuning=">;
	def dwarf_debug_flags : Separate<["-"], "dwarf-debug-flags">,
	HelpText<"The string to embed in the Dwarf debug flags record.">;
	def record_command_line : Separate<["-"], "record-command-line">,
	HelpText<"The string to embed in the .LLVM.command.line section.">;
	// Both compress-debug-sections spellings accept a "-" or "--" prefix.
	def compress_debug_sections : Flag<["-", "--"], "compress-debug-sections">,
	HelpText<"DWARF debug sections compression">;
	def compress_debug_sections_EQ : Joined<["-", "--"], "compress-debug-sections=">,
	HelpText<"DWARF debug sections compression type">;
	// Record name is mno_exec_stack but the spelling is "-mnoexecstack".
	def mno_exec_stack : Flag<["-"], "mnoexecstack">,
	HelpText<"Mark the file as not needing an executable stack">;
	def massembler_no_warn : Flag<["-"], "massembler-no-warn">,
	HelpText<"Make assembler not emit warnings">;
	def massembler_fatal_warnings : Flag<["-"], "massembler-fatal-warnings">,
	HelpText<"Make assembler warnings fatal">;
	// Only the double-dash prefix is listed for this flag.
	def mrelax_relocations : Flag<["--"], "mrelax-relocations">,
	HelpText<"Use relaxable elf relocations">;
	def msave_temp_labels : Flag<["-"], "msave-temp-labels">,
	HelpText<"Save temporary labels in the symbol table. "
	"Note this may change .s semantics and shouldn't generally be used "
	"on compiler-generated code.">;
	// -mrelocation-model is marshalled into CodeGenOpts.RelocationModel with
	// default "PIC_". Values<> and NormalizedValues<> are parallel lists (six
	// entries each, same order) whose enumerators live in the llvm::Reloc
	// scope; keep the two lists in step when editing either.
	def mrelocation_model : Separate<["-"], "mrelocation-model">,
	HelpText<"The relocation model to use">, Values<"static,pic,ropi,rwpi,ropi-rwpi,dynamic-no-pic">,
	NormalizedValuesScope<"llvm::Reloc">,
	NormalizedValues<["Static", "PIC_", "ROPI", "RWPI", "ROPI_RWPI", "DynamicNoPIC"]>,
	MarshallingInfoString<"CodeGenOpts.RelocationModel", "PIC_", "Model">,
	AutoNormalizeEnum;
	def fno_math_builtin : Flag<["-"], "fno-math-builtin">,
	HelpText<"Disable implicit builtin knowledge of math functions">;
	}

	// Argument-less flags that switch off parts of IR generation/processing.
	def disable_llvm_verifier : Flag<["-"], "disable-llvm-verifier">,
	HelpText<"Don't run the LLVM IR verifier pass">;
	def disable_llvm_passes : Flag<["-"], "disable-llvm-passes">,
	HelpText<"Use together with -emit-llvm to get pristine LLVM IR from the "
	"frontend by not running any LLVM passes at all">;
	// Alternate spelling: -disable-llvm-optzns is an Alias of
	// -disable-llvm-passes.
	def disable_llvm_optzns : Flag<["-"], "disable-llvm-optzns">,
	Alias<disable_llvm_passes>;
	def disable_lifetimemarkers : Flag<["-"], "disable-lifetime-markers">,
	HelpText<"Disable lifetime-markers emission even when optimizations are "
	"enabled">;
	def disable_O0_optnone : Flag<["-"], "disable-O0-optnone">,
	HelpText<"Disable adding the optnone attribute to functions at O0">;
	def disable_red_zone : Flag<["-"], "disable-red-zone">,
	HelpText<"Do not emit code that uses the red zone.">;
	// Debug-info shaping flags.
	def dwarf_ext_refs : Flag<["-"], "dwarf-ext-refs">,
	HelpText<"Generate debug info with external references to clang modules"
	" or precompiled headers">;
	def dwarf_explicit_import : Flag<["-"], "dwarf-explicit-import">,
	HelpText<"Generate explicit import from anonymous namespace to containing"
	" scope">;
	def debug_forward_template_params : Flag<["-"], "debug-forward-template-params">,
	HelpText<"Emit complete descriptions of template parameters in forward"
	" declarations">;
	// Miscellaneous codegen flags.
	def fforbid_guard_variables : Flag<["-"], "fforbid-guard-variables">,
	HelpText<"Emit an error if a C++ static local initializer would need a guard variable">;
	def no_implicit_float : Flag<["-"], "no-implicit-float">,
	HelpText<"Don't generate implicit floating point instructions">;
	def fdump_vtable_layouts : Flag<["-"], "fdump-vtable-layouts">,
	HelpText<"Dump the layouts of all vtables that will be emitted in a translation unit">;
	def fmerge_functions : Flag<["-"], "fmerge-functions">,
	HelpText<"Permit merging of identical functions when optimizing.">;
	// gcov-style coverage: notes/data emission flags, output file names
	// (Separate form plus "=" alias each), and the version string.
	def femit_coverage_notes : Flag<["-"], "femit-coverage-notes">,
	HelpText<"Emit a gcov coverage notes file when compiling.">;
	def femit_coverage_data: Flag<["-"], "femit-coverage-data">,
	HelpText<"Instrument the program to emit gcov coverage data when run.">;
	def coverage_data_file : Separate<["-"], "coverage-data-file">,
	HelpText<"Emit coverage data to this filename.">;
	def coverage_data_file_EQ : Joined<["-"], "coverage-data-file=">,
	Alias<coverage_data_file>;
	def coverage_notes_file : Separate<["-"], "coverage-notes-file">,
	HelpText<"Emit coverage notes to this filename.">;
	def coverage_notes_file_EQ : Joined<["-"], "coverage-notes-file=">,
	Alias<coverage_notes_file>;
	def coverage_version_EQ : Joined<["-"], "coverage-version=">,
	HelpText<"Four-byte version string for gcov files.">;
	def test_coverage : Flag<["-"], "test-coverage">,
	HelpText<"Do not generate coverage files or remove coverage changes from IR">;
	def dump_coverage_mapping : Flag<["-"], "dump-coverage-mapping">,
	HelpText<"Dump the coverage mapping records, for testing">;
	def fuse_register_sized_bitfield_access: Flag<["-"], "fuse-register-sized-bitfield-access">,
	HelpText<"Use register sized accesses to bit-fields, when possible.">;
	// Type Based Alias Analysis (TBAA) switches.
	def relaxed_aliasing : Flag<["-"], "relaxed-aliasing">,
	HelpText<"Turn off Type Based Alias Analysis">;
	def no_struct_path_tbaa : Flag<["-"], "no-struct-path-tbaa">,
	HelpText<"Turn off struct-path aware Type Based Alias Analysis">;
	def new_struct_path_tbaa : Flag<["-"], "new-struct-path-tbaa">,
	HelpText<"Enable enhanced struct-path aware Type Based Alias Analysis">;
	// "-m..." codegen options. Value-taking ones are mostly Separate;
	// -mframe-pointer= is Joined and lists its accepted values.
	def mdebug_pass : Separate<["-"], "mdebug-pass">,
	HelpText<"Enable additional debug output">;
	def mframe_pointer_EQ : Joined<["-"], "mframe-pointer=">,
	HelpText<"Specify which frame pointers to retain (all, non-leaf, none).">, Values<"all,non-leaf,none">;
	def mdisable_tail_calls : Flag<["-"], "mdisable-tail-calls">,
	HelpText<"Disable tail call optimization, keeping the call stack accurate">;
	// Floating-point relaxation flags. Note the record menable_no_infinities
	// is spelled "-menable-no-infs".
	def menable_no_infinities : Flag<["-"], "menable-no-infs">,
	HelpText<"Allow optimization to assume there are no infinities.">;
	def menable_no_nans : Flag<["-"], "menable-no-nans">,
	HelpText<"Allow optimization to assume there are no NaNs.">;
	def menable_unsafe_fp_math : Flag<["-"], "menable-unsafe-fp-math">,
	HelpText<"Allow unsafe floating-point math optimizations which may decrease "
	"precision">;
	def mreassociate : Flag<["-"], "mreassociate">,
	HelpText<"Allow reassociation transformations for floating-point instructions">;
	// A Flag whose full spelling includes "=ieeelongdouble"; it takes no value.
	def mabi_EQ_ieeelongdouble : Flag<["-"], "mabi=ieeelongdouble">,
	HelpText<"Use IEEE 754 quadruple-precision for long double">;
	def mfloat_abi : Separate<["-"], "mfloat-abi">,
	HelpText<"The float ABI to use">;
	def mtp : Separate<["-"], "mtp">,
	HelpText<"Mode for reading thread pointer">;
	def mlimit_float_precision : Separate<["-"], "mlimit-float-precision">,
	HelpText<"Limit float precision to the given value">;
	def split_stacks : Flag<["-"], "split-stacks">,
	HelpText<"Try to use a split stack if possible.">;
	def mregparm : Separate<["-"], "mregparm">,
	HelpText<"Limit the number of registers available for integer arguments">;
	def msmall_data_limit : Separate<["-"], "msmall-data-limit">,
	HelpText<"Put global and static data smaller than the limit into a special section">;
	def munwind_tables : Flag<["-"], "munwind-tables">,
	HelpText<"Generate unwinding tables for all functions">;
	def mconstructor_aliases : Flag<["-"], "mconstructor-aliases">,
	HelpText<"Emit complete constructors and destructors as aliases when possible">;
	// Bitcode linking. -mlink-cuda-bitcode is an Alias of
	// -mlink-builtin-bitcode.
	def mlink_bitcode_file : Separate<["-"], "mlink-bitcode-file">,
	HelpText<"Link the given bitcode file before performing optimizations.">;
	def mlink_builtin_bitcode : Separate<["-"], "mlink-builtin-bitcode">,
	HelpText<"Link and internalize needed symbols from the given bitcode file "
	"before performing optimizations.">;
	def mlink_cuda_bitcode : Separate<["-"], "mlink-cuda-bitcode">,
	Alias<mlink_builtin_bitcode>;
	// Vectorizer pass toggles.
	def vectorize_loops : Flag<["-"], "vectorize-loops">,
	HelpText<"Run the Loop vectorization passes">;
	def vectorize_slp : Flag<["-"], "vectorize-slp">,
	HelpText<"Run the SLP vectorization passes">;
	// These two use the double-dash prefix only and take a joined "=" value.
	def dependent_lib : Joined<["--"], "dependent-lib=">,
	HelpText<"Add dependent library">;
	def linker_option : Joined<["--"], "linker-option=">,
	HelpText<"Add linker option">;
	// Sanitizer-coverage configuration: one Joined option selecting the coverage
	// type, followed by argument-less flags, each enabling or disabling a single
	// instrumentation feature.
	def fsanitize_coverage_type : Joined<["-"], "fsanitize-coverage-type=">,
	HelpText<"Sanitizer coverage type">;
	def fsanitize_coverage_indirect_calls
	: Flag<["-"], "fsanitize-coverage-indirect-calls">,
	HelpText<"Enable sanitizer coverage for indirect calls">;
	def fsanitize_coverage_trace_bb
	: Flag<["-"], "fsanitize-coverage-trace-bb">,
	HelpText<"Enable basic block tracing in sanitizer coverage">;
	def fsanitize_coverage_trace_cmp
	: Flag<["-"], "fsanitize-coverage-trace-cmp">,
	HelpText<"Enable cmp instruction tracing in sanitizer coverage">;
	def fsanitize_coverage_trace_div
	: Flag<["-"], "fsanitize-coverage-trace-div">,
	HelpText<"Enable div instruction tracing in sanitizer coverage">;
	def fsanitize_coverage_trace_gep
	: Flag<["-"], "fsanitize-coverage-trace-gep">,
	HelpText<"Enable gep instruction tracing in sanitizer coverage">;
	def fsanitize_coverage_8bit_counters
	: Flag<["-"], "fsanitize-coverage-8bit-counters">,
	HelpText<"Enable frequency counters in sanitizer coverage">;
	def fsanitize_coverage_inline_8bit_counters
	: Flag<["-"], "fsanitize-coverage-inline-8bit-counters">,
	HelpText<"Enable inline 8-bit counters in sanitizer coverage">;
	def fsanitize_coverage_inline_bool_flag
	: Flag<["-"], "fsanitize-coverage-inline-bool-flag">,
	HelpText<"Enable inline bool flag in sanitizer coverage">;
	def fsanitize_coverage_pc_table
	: Flag<["-"], "fsanitize-coverage-pc-table">,
	HelpText<"Create a table of coverage-instrumented PCs">;
	def fsanitize_coverage_trace_pc
	: Flag<["-"], "fsanitize-coverage-trace-pc">,
	HelpText<"Enable PC tracing in sanitizer coverage">;
	def fsanitize_coverage_trace_pc_guard
	: Flag<["-"], "fsanitize-coverage-trace-pc-guard">,
	HelpText<"Enable PC tracing with guard in sanitizer coverage">;
	// The only negative-sense flag in this group.
	def fsanitize_coverage_no_prune
	: Flag<["-"], "fsanitize-coverage-no-prune">,
	HelpText<"Disable coverage pruning (i.e. instrument all blocks/edges)">;
	def fsanitize_coverage_stack_depth
	: Flag<["-"], "fsanitize-coverage-stack-depth">,
	HelpText<"Enable max stack depth tracing">;
	// MetaVarName <M> is the placeholder the help text refers to.
	def fpatchable_function_entry_offset_EQ
	: Joined<["-"], "fpatchable-function-entry-offset=">, MetaVarName<"<M>">,
	HelpText<"Generate M NOPs before function entry">;
	// PGO instrumentation: kind selection (with a Values<> list), output path,
	// and the profile path for a PGO-use compilation.
	def fprofile_instrument_EQ : Joined<["-"], "fprofile-instrument=">,
	HelpText<"Enable PGO instrumentation. The accepted value is clang, llvm, "
	"or none">, Values<"none,clang,llvm">;
	def fprofile_instrument_path_EQ : Joined<["-"], "fprofile-instrument-path=">,
	HelpText<"Generate instrumented code to collect execution counts into "
	"<file> (overridden by LLVM_PROFILE_FILE env var)">;
	def fprofile_instrument_use_path_EQ :
	Joined<["-"], "fprofile-instrument-use-path=">,
	HelpText<"Specify the profile path in PGO use compilation">;
	// LTO-related flags; -fno-lto-unit has no help text.
	def flto_visibility_public_std:
	Flag<["-"], "flto-visibility-public-std">,
	HelpText<"Use public LTO visibility for classes in std and stdext namespaces">;
	def flto_unit: Flag<["-"], "flto-unit">,
	HelpText<"Emit IR to support LTO unit features (CFI, whole program vtable opt)">;
	def fno_lto_unit: Flag<["-"], "fno-lto-unit">;
	// Positive/negative pair controlling new-pass-manager debug printing.
	def fdebug_pass_manager : Flag<["-"], "fdebug-pass-manager">,
	HelpText<"Prints debug information for the new pass manager">;
	def fno_debug_pass_manager : Flag<["-"], "fno-debug-pass-manager">,
	HelpText<"Disables debug printing for the new pass manager">;
	// The driver option takes the key as a parameter to the -msign-return-address=
	// and -mbranch-protection= options, but CC1 has a separate option so we
	// don't have to parse the parameter twice.
	// Accepts exactly the two key names listed in Values<>; no help text.
	def msign_return_address_key_EQ : Joined<["-"], "msign-return-address-key=">,
	Values<"a_key,b_key">;
	// The next two flags are declared without help text.
	def mbranch_target_enforce : Flag<["-"], "mbranch-target-enforce">;
	def fno_dllexport_inlines : Flag<["-"], "fno-dllexport-inlines">;
	// Windows Control Flow Guard: tables only, or tables plus checks.
	def cfguard_no_checks : Flag<["-"], "cfguard-no-checks">,
	HelpText<"Emit Windows Control Flow Guard tables only (no checks)">;
	def cfguard : Flag<["-"], "cfguard">,
	HelpText<"Emit Windows Control Flow Guard tables and checks">;

	// The only record in this stretch that sets an explicit Group (f_Group).
	def fdenormal_fp_math_f32_EQ : Joined<["-"], "fdenormal-fp-math-f32=">,
	Group<f_Group>;

	//===----------------------------------------------------------------------===//
	// Dependency Output Options
	//===----------------------------------------------------------------------===//

	// Flags selecting what is included in dependency output.
	def sys_header_deps : Flag<["-"], "sys-header-deps">,
	HelpText<"Include system headers in dependency output">;
	def module_file_deps : Flag<["-"], "module-file-deps">,
	HelpText<"Include module files in dependency output">;
	// Header-include reporting; per the help text "-" is accepted as a filename.
	def header_include_file : Separate<["-"], "header-include-file">,
	HelpText<"Filename (or -) to write header include output to">;
	// Uses the double-dash prefix only.
	def show_includes : Flag<["--"], "show-includes">,
	HelpText<"Print cl.exe style /showIncludes to stdout">;

	//===----------------------------------------------------------------------===//
	// Diagnostic Options
	//===----------------------------------------------------------------------===//

	// Diagnostic sinks. Note the record diagnostic_serialized_file is spelled
	// "-serialize-diagnostic-file".
	def diagnostic_log_file : Separate<["-"], "diagnostic-log-file">,
	HelpText<"Filename (or -) to log diagnostics to">;
	def diagnostic_serialized_file : Separate<["-"], "serialize-diagnostic-file">,
	MetaVarName<"<filename>">,
	HelpText<"File for serializing diagnostics in a binary format">;

	// Formatting controls; the two Separate options list their values in
	// Values<>.
	def fdiagnostics_format : Separate<["-"], "fdiagnostics-format">,
	HelpText<"Change diagnostic formatting to match IDE and command line tools">, Values<"clang,msvc,msvc-fallback,vi">;
	def fdiagnostics_show_category : Separate<["-"], "fdiagnostics-show-category">,
	HelpText<"Print diagnostic category">, Values<"none,id,name">;
	def fno_diagnostics_use_presumed_location : Flag<["-"], "fno-diagnostics-use-presumed-location">,
	HelpText<"Ignore #line directives when displaying diagnostic locations">;
	// Numeric limits: each is a Separate option with MetaVarName <N>. Where the
	// help text says so, 0 means "no limit".
	def ftabstop : Separate<["-"], "ftabstop">, MetaVarName<"<N>">,
	HelpText<"Set the tab stop distance.">;
	def ferror_limit : Separate<["-"], "ferror-limit">, MetaVarName<"<N>">,
	HelpText<"Set the maximum number of errors to emit before stopping (0 = no limit).">;
	def fmacro_backtrace_limit : Separate<["-"], "fmacro-backtrace-limit">, MetaVarName<"<N>">,
	HelpText<"Set the maximum number of entries to print in a macro expansion backtrace (0 = no limit).">;
	def ftemplate_backtrace_limit : Separate<["-"], "ftemplate-backtrace-limit">, MetaVarName<"<N>">,
	HelpText<"Set the maximum number of entries to print in a template instantiation backtrace (0 = no limit).">;
	def fconstexpr_backtrace_limit : Separate<["-"], "fconstexpr-backtrace-limit">, MetaVarName<"<N>">,
	HelpText<"Set the maximum number of entries to print in a constexpr evaluation backtrace (0 = no limit).">;
	def fspell_checking_limit : Separate<["-"], "fspell-checking-limit">, MetaVarName<"<N>">,
	HelpText<"Set the maximum number of times to perform spell checking on unrecognized identifiers (0 = no limit).">;
	def fcaret_diagnostics_max_lines :
	Separate<["-"], "fcaret-diagnostics-max-lines">, MetaVarName<"<N>">,
	HelpText<"Set the maximum number of source lines to show in a caret diagnostic">;
	// Diagnostic verification. The "=" forms are CommaJoined; the bare flags are
	// separate records, not aliases of the "=" forms.
	def verify_EQ : CommaJoined<["-"], "verify=">,
	MetaVarName<"<prefixes>">,
	HelpText<"Verify diagnostic output using comment directives that start with"
	" prefixes in the comma-separated sequence <prefixes>">;
	def verify : Flag<["-"], "verify">,
	HelpText<"Equivalent to -verify=expected">;
	def verify_ignore_unexpected : Flag<["-"], "verify-ignore-unexpected">,
	HelpText<"Ignore unexpected diagnostic messages">;
	def verify_ignore_unexpected_EQ : CommaJoined<["-"], "verify-ignore-unexpected=">,
	HelpText<"Ignore unexpected diagnostic messages">;
	def Wno_rewrite_macros : Flag<["-"], "Wno-rewrite-macros">,
	HelpText<"Silence ObjC rewriting warnings">;

	//===----------------------------------------------------------------------===//
	// Frontend Options
	//===----------------------------------------------------------------------===//

	// This isn't normally used, it is just here so we can parse a
	// CompilerInvocation out of a driver-derived argument vector.
	// Tool-selection flags; declared without help text.
	def cc1 : Flag<["-"], "cc1">;
	def cc1as : Flag<["-"], "cc1as">;

	def ast_merge : Separate<["-"], "ast-merge">,
	MetaVarName<"<ast file>">,
	HelpText<"Merge the given AST file into the translation unit being compiled.">;
	// Auxiliary-target counterparts of target-cpu / target-feature / triple.
	def aux_target_cpu : Separate<["-"], "aux-target-cpu">,
	HelpText<"Target a specific auxiliary cpu type">;
	def aux_target_feature : Separate<["-"], "aux-target-feature">,
	HelpText<"Target specific auxiliary attributes">;
	def aux_triple : Separate<["-"], "aux-triple">,
	HelpText<"Auxiliary target triple.">;
	// Code completion. The MetaVarName documents the expected argument shape.
	def code_completion_at : Separate<["-"], "code-completion-at">,
	MetaVarName<"<file>:<line>:<column>">,
	HelpText<"Dump code-completion information at a location">;
	// -remap-file sits between the two code-completion-at records; its argument
	// is a "<from>;<to>" pair per the MetaVarName.
	def remap_file : Separate<["-"], "remap-file">,
	MetaVarName<"<from>;<to>">,
	HelpText<"Replace the contents of the <from> file with the contents of the <to> file">;
	def code_completion_at_EQ : Joined<["-"], "code-completion-at=">,
	Alias<code_completion_at>;
	def code_completion_macros : Flag<["-"], "code-completion-macros">,
	HelpText<"Include macros in code-completion results">;
	def code_completion_patterns : Flag<["-"], "code-completion-patterns">,
	HelpText<"Include code patterns in code-completion results">;
	def no_code_completion_globals : Flag<["-"], "no-code-completion-globals">,
	HelpText<"Do not include global declarations in code-completion results.">;
	def no_code_completion_ns_level_decls : Flag<["-"], "no-code-completion-ns-level-decls">,
	HelpText<"Do not include declarations inside namespaces (incl. global namespace) in the code-completion results.">;
	def code_completion_brief_comments : Flag<["-"], "code-completion-brief-comments">,
	HelpText<"Include brief documentation comments in code-completion results.">;
	def code_completion_with_fixits : Flag<["-"], "code-completion-with-fixits">,
	HelpText<"Include code completion results which require small fix-its.">;
	def disable_free : Flag<["-"], "disable-free">,
	HelpText<"Disable freeing of memory on exit">;
	def discard_value_names : Flag<["-"], "discard-value-names">,
	HelpText<"Discard value names in LLVM IR">;
	// Plugin handling: load a shared object, choose or add a plugin action, and
	// pass arguments to a plugin.
	def load : Separate<["-"], "load">, MetaVarName<"<dsopath>">,
	HelpText<"Load the named plugin (dynamic shared object)">;
	def plugin : Separate<["-"], "plugin">, MetaVarName<"<name>">,
	HelpText<"Use the named plugin action instead of the default action (use \"help\" to list available options)">;
	// JoinedAndSeparate: per the MetaVarName, the plugin <name> is joined to
	// "-plugin-arg-" and <arg> follows as a separate argument.
	def plugin_arg : JoinedAndSeparate<["-"], "plugin-arg-">,
	MetaVarName<"<name> <arg>">,
	HelpText<"Pass <arg> to plugin <name>">;
	def add_plugin : Separate<["-"], "add-plugin">, MetaVarName<"<name>">,
	HelpText<"Use the named plugin action in addition to the default action">;
	def ast_dump_filter : Separate<["-"], "ast-dump-filter">,
	MetaVarName<"<dump_filter>">,
	HelpText<"Use with -ast-dump or -ast-print to dump/print only AST declaration"
	" nodes having a certain substring in a qualified name. Use"
	" -ast-list to list all filterable declaration node names.">;
	// Module build behaviour: global index maintenance, error-recovery imports,
	// module-map home directory, 'requires' features, and embedding a single
	// named file into the module file.
	def fno_modules_global_index : Flag<["-"], "fno-modules-global-index">,
	HelpText<"Do not automatically generate or update the global module index">;
	def fno_modules_error_recovery : Flag<["-"], "fno-modules-error-recovery">,
	HelpText<"Do not automatically import modules for error recovery">;
	def fmodule_map_file_home_is_cwd : Flag<["-"], "fmodule-map-file-home-is-cwd">,
	HelpText<"Use the current working directory as the home directory of "
	"module maps specified by -fmodule-map-file=<FILE>">;
	def fmodule_feature : Separate<["-"], "fmodule-feature">,
	MetaVarName<"<feature>">,
	HelpText<"Enable <feature> in module map requires declarations">;
	def fmodules_embed_file_EQ : Joined<["-"], "fmodules-embed-file=">,
	MetaVarName<"<file>">,
	HelpText<"Embed the contents of the specified file into the module file "
	"being compiled.">;
	// Embeds every file read by the compilation into the produced module file.
	// The spelling has no trailing '=' and the help text describes a plain
	// switch, so this is declared as a Flag. A Joined kind would also accept
	// arbitrary text glued onto the option name and silently treat it as a value.
	def fmodules_embed_all_files : Flag<["-"], "fmodules-embed-all-files">,
	HelpText<"Embed the contents of all files read by this compilation into "
	"the produced module file.">;
	// Submodule visibility enforcement.
	def fmodules_local_submodule_visibility :
	Flag<["-"], "fmodules-local-submodule-visibility">,
	HelpText<"Enforce name visibility rules across submodules of the same "
	"top-level module.">;
	// Code and debug-info generation for modules that get their own object file.
	def fmodules_codegen :
	Flag<["-"], "fmodules-codegen">,
	HelpText<"Generate code for uses of this module that assumes an explicit "
	"object file will be built for the module">;
	def fmodules_debuginfo :
	Flag<["-"], "fmodules-debuginfo">,
	HelpText<"Generate debug info for types in an object file built from this "
	"module and do not generate them elsewhere">;
	// Container format for module/PCH files; the help text names 'raw' and 'obj'.
	def fmodule_format_EQ : Joined<["-"], "fmodule-format=">,
	HelpText<"Select the container format for clang modules and PCH. "
	"Supported options are 'raw' and 'obj'.">;
	// Testing hook; value is "blockname:major:minor:hashed:user info".
	def ftest_module_file_extension_EQ :
	Joined<["-"], "ftest-module-file-extension=">,
	HelpText<"introduce a module file extension for testing purposes. "
	"The argument is parsed as blockname:major:minor:hashed:user info">;
	// C++ Concepts switches. -fconcepts-ts is documented as deprecated in favour
	// of -std=c++2a.
	def fconcepts_ts : Flag<["-"], "fconcepts-ts">,
	HelpText<"Enable C++ Extensions for Concepts. (deprecated - use -std=c++2a)">;
	def fno_concept_satisfaction_caching : Flag<["-"],
	"fno-concept-satisfaction-caching">,
	HelpText<"Disable satisfaction caching for C++2a Concepts.">;

	// Recovery-expression switches. The fno-* forms carry no HelpText; they are
	// presumably the negations of the matching positive flags (confirm against
	// the code that consumes them).
	def frecovery_ast : Flag<["-"], "frecovery-ast">,
	HelpText<"Preserve expressions in AST rather than dropping them when "
	"encountering semantic errors">;
	def fno_recovery_ast : Flag<["-"], "fno-recovery-ast">;
	def frecovery_ast_type : Flag<["-"], "frecovery-ast-type">,
	HelpText<"Preserve the type for recovery expressions when possible "
	"(experimental)">;
	def fno_recovery_ast_type : Flag<["-"], "fno-recovery-ast-type">;

	// Every option defined inside this scope is placed in Action_Group.
	let Group = Action_Group in {

	// Preprocessing / lexing, analysis, and initialization-only runs.
	def Eonly : Flag<["-"], "Eonly">,
	HelpText<"Just run preprocessor, no output (for timings)">;
	def dump_raw_tokens : Flag<["-"], "dump-raw-tokens">,
	HelpText<"Lex file in raw mode and dump raw tokens">;
	def analyze : Flag<["-"], "analyze">,
	HelpText<"Run static analysis engine">;
	def dump_tokens : Flag<["-"], "dump-tokens">,
	HelpText<"Run preprocessor, dump internal rep of tokens">;
	def init_only : Flag<["-"], "init-only">,
	HelpText<"Only execute frontend initialization">;
	// Fix-it application: in place (-fixit) or to a suffixed copy (-fixit=).
	def fixit : Flag<["-"], "fixit">,
	HelpText<"Apply fix-it advice to the input source">;
	def fixit_EQ : Joined<["-"], "fixit=">,
	HelpText<"Apply fix-it advice creating a file with the given suffix">;
	def print_preamble : Flag<["-"], "print-preamble">,
	HelpText<"Print the \"preamble\" of a file, which is a candidate for implicit"
	" precompiled headers.">;
	def emit_html : Flag<["-"], "emit-html">,
	HelpText<"Output input source as HTML">;
	// AST printing and dumping. The "=" variants take an output format; the
	// help text lists "default" and "json".
	def ast_print : Flag<["-"], "ast-print">,
	HelpText<"Build ASTs and then pretty-print them">;
	def ast_list : Flag<["-"], "ast-list">,
	HelpText<"Build ASTs and print the list of declaration node qualified names">;
	def ast_dump : Flag<["-"], "ast-dump">,
	HelpText<"Build ASTs and then debug dump them">;
	def ast_dump_EQ : Joined<["-"], "ast-dump=">,
	HelpText<"Build ASTs and then debug dump them in the specified format. "
	"Supported formats include: default, json">;
	def ast_dump_all : Flag<["-"], "ast-dump-all">,
	HelpText<"Build ASTs and then debug dump them, forcing deserialization">;
	def ast_dump_all_EQ : Joined<["-"], "ast-dump-all=">,
	HelpText<"Build ASTs and then debug dump them in the specified format, "
	"forcing deserialization. Supported formats include: default, json">;
	def ast_dump_decl_types : Flag<["-"], "ast-dump-decl-types">,
	HelpText<"Include declaration types in AST dumps">;
	def templight_dump : Flag<["-"], "templight-dump">,
	HelpText<"Dump templight information to stdout">;
	def ast_dump_lookups : Flag<["-"], "ast-dump-lookups">,
	HelpText<"Build ASTs and then debug dump their name lookup tables">;
	def ast_view : Flag<["-"], "ast-view">,
	HelpText<"Build ASTs and view them with GraphViz">;
	// Precompiled module / header generation.
	def emit_module : Flag<["-"], "emit-module">,
	HelpText<"Generate pre-compiled module file from a module map">;
	def emit_module_interface : Flag<["-"], "emit-module-interface">,
	HelpText<"Generate pre-compiled module file from a C++ module interface">;
	def emit_header_module : Flag<["-"], "emit-header-module">,
	HelpText<"Generate pre-compiled module file from a set of header files">;
	def emit_pch : Flag<["-"], "emit-pch">,
	HelpText<"Generate pre-compiled header file">;
	// LLVM IR / machine code / object emission.
	def emit_llvm_bc : Flag<["-"], "emit-llvm-bc">,
	HelpText<"Build ASTs then convert to LLVM, emit .bc file">;
	def emit_llvm_only : Flag<["-"], "emit-llvm-only">,
	HelpText<"Build ASTs and convert to LLVM, discarding output">;
	def emit_codegen_only : Flag<["-"], "emit-codegen-only">,
	HelpText<"Generate machine code, but discard output">;
	def emit_obj : Flag<["-"], "emit-obj">,
	HelpText<"Emit native object files">;
	// Source rewriting and migration.
	def rewrite_test : Flag<["-"], "rewrite-test">,
	HelpText<"Rewriter playground">;
	def rewrite_macros : Flag<["-"], "rewrite-macros">,
	HelpText<"Expand macros without full preprocessing">;
	def migrate : Flag<["-"], "migrate">,
	HelpText<"Migrate source code">;
	// Configuration and dependency-directive dumps.
	def compiler_options_dump : Flag<["-"], "compiler-options-dump">,
	HelpText<"Dump the compiler configuration options">;
	def print_dependency_directives_minimized_source : Flag<["-"],
	"print-dependency-directives-minimized-source">,
	HelpText<"Print the output of the dependency directives source minimizer">;
	} // let Group = Action_Group

	// Positive/negative pair controlling use-list order preservation when
	// LLVM output is serialized.
	def emit_llvm_uselists : Flag<["-"], "emit-llvm-uselists">,
	HelpText<"Preserve order of LLVM use-lists when serializing">;
	def no_emit_llvm_uselists : Flag<["-"], "no-emit-llvm-uselists">,
	HelpText<"Don't preserve order of LLVM use-lists when serializing">;

	// ARC / ObjC migration: scratch directory plus the three arcmt-* modes
	// (check only, modify in place, migrate via temporary files).
	def mt_migrate_directory : Separate<["-"], "mt-migrate-directory">,
	HelpText<"Directory for temporary files produced during ARC or ObjC migration">;
	def arcmt_check : Flag<["-"], "arcmt-check">,
	HelpText<"Check for ARC migration issues that need manual handling">;
	def arcmt_modify : Flag<["-"], "arcmt-modify">,
	HelpText<"Apply modifications to files to conform to ARC">;
	def arcmt_migrate : Flag<["-"], "arcmt-migrate">,
	HelpText<"Apply modifications and produces temporary files that conform to ARC">;

	// Optimization-record (remarks) output: destination file, pass-name regex
	// filter, and serialization format (help text gives YAML as the default).
	def opt_record_file : Separate<["-"], "opt-record-file">,
	HelpText<"File name to use for YAML optimization record output">;
	def opt_record_passes : Separate<["-"], "opt-record-passes">,
	HelpText<"Only record remark information for passes whose names match the given regular expression">;
	def opt_record_format : Separate<["-"], "opt-record-format">,
	HelpText<"The format used for serializing remarks (default: YAML)">;

	// Statistics reporting.
	def print_stats : Flag<["-"], "print-stats">,
	HelpText<"Print performance metrics and statistics">;
	def stats_file : Joined<["-"], "stats-file=">,
	HelpText<"Filename to write statistics to">;
	// Record layout dumps (full form and a simplified form for tests).
	def fdump_record_layouts : Flag<["-"], "fdump-record-layouts">,
	HelpText<"Dump record layout information">;
	def fdump_record_layouts_simple : Flag<["-"], "fdump-record-layouts-simple">,
	HelpText<"Dump record layout information in a simple form used for testing">;
	// Modifiers for how fix-it advice is applied.
	def fix_what_you_can : Flag<["-"], "fix-what-you-can">,
	HelpText<"Apply fix-it advice even in the presence of unfixable errors">;
	def fix_only_warnings : Flag<["-"], "fix-only-warnings">,
	HelpText<"Apply fix-it advice only for warnings, not errors">;
	def fixit_recompile : Flag<["-"], "fixit-recompile">,
	HelpText<"Apply fix-it changes and recompile">;
	// Note: the record is named fixit_to_temp but the spelling is
	// "fixit-to-temporary".
	def fixit_to_temp : Flag<["-"], "fixit-to-temporary">,
	HelpText<"Apply fix-it changes to temporary files">;

	// Record layouts read from an external file.
	def foverride_record_layout_EQ : Joined<["-"], "foverride-record-layout=">,
	HelpText<"Override record layouts with those in the given file">;
	// PCH "through header" / "#pragma hdrstop" cut points, for both creating
	// and consuming a PCH.
	def pch_through_header_EQ : Joined<["-"], "pch-through-header=">,
	HelpText<"Stop PCH generation after including this file. When using a PCH, "
	"skip tokens until after this file is included.">;
	def pch_through_hdrstop_create : Flag<["-"], "pch-through-hdrstop-create">,
	HelpText<"When creating a PCH, stop PCH generation after #pragma hdrstop.">;
	def pch_through_hdrstop_use : Flag<["-"], "pch-through-hdrstop-use">,
	HelpText<"When using a PCH, skip tokens until after a #pragma hdrstop.">;
	def fno_pch_timestamp : Flag<["-"], "fno-pch-timestamp">,
	HelpText<"Disable inclusion of timestamp in precompiled headers">;
	def building_pch_with_obj : Flag<["-"], "building-pch-with-obj">,
	HelpText<"This compilation is part of building a PCH with corresponding object file.">;

	// Note: the record name lacks the leading "f" that the spelling has.
	def aligned_alloc_unavailable : Flag<["-"], "faligned-alloc-unavailable">,
	HelpText<"Aligned allocation/deallocation functions are unavailable">;

	//===----------------------------------------------------------------------===//
	// Language Options
	//===----------------------------------------------------------------------===//

	// Options in this scope carry CC1Option, CC1AsOption and NoDriverOption.
	let Flags = [CC1Option, CC1AsOption, NoDriverOption] in {

	def version : Flag<["-"], "version">,
	HelpText<"Print the compiler version">;
	def main_file_name : Separate<["-"], "main-file-name">,
	HelpText<"Main file name to use for debug info and source if missing">;
	def split_dwarf_output : Separate<["-"], "split-dwarf-output">,
	HelpText<"File name to use for split dwarf debug info output">;

	} // let Flags = [CC1Option, CC1AsOption, NoDriverOption]

	// Blocks runtime linkage and unwinding assumption for C-linkage functions.
	def fblocks_runtime_optional : Flag<["-"], "fblocks-runtime-optional">,
	HelpText<"Weakly link in the blocks runtime">;
	def fexternc_nounwind : Flag<["-"], "fexternc-nounwind">,
	HelpText<"Assume all functions with C linkage do not unwind">;
	// Name recorded in the object file (distinct from -split-dwarf-output,
	// which names the output file itself).
	def split_dwarf_file : Separate<["-"], "split-dwarf-file">,
	HelpText<"Name of the split dwarf debug info file to encode in the object file">;
	def fno_wchar : Flag<["-"], "fno-wchar">,
	HelpText<"Disable C++ builtin type wchar_t">;
	// Objective-C configuration. Values<> lists the choices for the "=" forms.
	def fconstant_string_class : Separate<["-"], "fconstant-string-class">,
	MetaVarName<"<class name>">,
	HelpText<"Specify the class to use for constant Objective-C string objects.">;
	def fobjc_arc_cxxlib_EQ : Joined<["-"], "fobjc-arc-cxxlib=">,
	HelpText<"Objective-C++ Automatic Reference Counting standard library kind">, Values<"libc++,libstdc++,none">;
	def fobjc_runtime_has_weak : Flag<["-"], "fobjc-runtime-has-weak">,
	HelpText<"The target Objective-C runtime supports ARC weak operations">;
	def fobjc_dispatch_method_EQ : Joined<["-"], "fobjc-dispatch-method=">,
	HelpText<"Objective-C dispatch method to use">, Values<"legacy,non-legacy,mixed">;
	def disable_objc_default_synthesize_properties : Flag<["-"], "disable-objc-default-synthesize-properties">,
	HelpText<"disable the default synthesis of Objective-C properties">;
	def fencode_extended_block_signature : Flag<["-"], "fencode-extended-block-signature">,
	HelpText<"enable extended encoding of block type signature">;
	// Default function alignment; the unit is not stated here.
	def function_alignment : Separate<["-"], "function-alignment">,
	HelpText<"default alignment for functions">;
	// Position-independent code: value of __PIC__ and the PIE marker.
	def pic_level : Separate<["-"], "pic-level">,
	HelpText<"Value for __PIC__">;
	def pic_is_pie : Flag<["-"], "pic-is-pie">,
	HelpText<"File is for a position independent executable">;
	// PCH validation and PCH deserialization testing hooks.
	def fno_validate_pch : Flag<["-"], "fno-validate-pch">,
	HelpText<"Disable validation of precompiled headers">;
	def fallow_pch_with_errors : Flag<["-"], "fallow-pch-with-compiler-errors">,
	HelpText<"Accept a PCH file that was created with compiler errors">;
	def dump_deserialized_pch_decls : Flag<["-"], "dump-deserialized-decls">,
	HelpText<"Dump declarations that are deserialized from PCH, for testing">;
	def error_on_deserialized_pch_decl : Separate<["-"], "error-on-deserialized-decl">,
	HelpText<"Emit error if a specific declaration is deserialized from PCH, for testing">;
	// Joined "=" spelling that aliases the Separate form above.
	def error_on_deserialized_pch_decl_EQ : Joined<["-"], "error-on-deserialized-decl=">,
	Alias<error_on_deserialized_pch_decl>;
	def static_define : Flag<["-"], "static-define">,
	HelpText<"Should __STATIC__ be defined">;
	// Stack protector mode and the buffer-size threshold for protection.
	def stack_protector : Separate<["-"], "stack-protector">,
	HelpText<"Enable stack protectors">;
	def stack_protector_buffer_size : Separate<["-"], "stack-protector-buffer-size">,
	HelpText<"Lower bound for a buffer to be considered for stack protection">;
	// Default visibility: for types and symbols, for types only, and whether
	// the global setting also applies to external declarations.
	def fvisibility : Separate<["-"], "fvisibility">,
	HelpText<"Default type and symbol visibility">;
	def ftype_visibility : Separate<["-"], "ftype-visibility">,
	HelpText<"Default type visibility">;
	def fapply_global_visibility_to_externs : Flag<["-"], "fapply-global-visibility-to-externs">,
	HelpText<"Apply global symbol visibility to external declarations without an explicit visibility">;
	// Implementation limits (each takes its value as a separate argument).
	def ftemplate_depth : Separate<["-"], "ftemplate-depth">,
	HelpText<"Maximum depth of recursive template instantiation">;
	def foperator_arrow_depth : Separate<["-"], "foperator-arrow-depth">,
	HelpText<"Maximum number of 'operator->'s to call for a member access">;
	def fconstexpr_depth : Separate<["-"], "fconstexpr-depth">,
	HelpText<"Maximum depth of recursive constexpr function calls">;
	def fconstexpr_steps : Separate<["-"], "fconstexpr-steps">,
	HelpText<"Maximum number of steps in constexpr function evaluation">;
	def fbracket_depth : Separate<["-"], "fbracket-depth">,
	HelpText<"Maximum nesting level for parentheses, brackets, and braces">;
	// Const-qualification of string literals in C and ObjC (positive/negative
	// pair).
	def fconst_strings : Flag<["-"], "fconst-strings">,
	HelpText<"Use a const qualified type for string literals in C and ObjC">;
	def fno_const_strings : Flag<["-"], "fno-const-strings">,
	HelpText<"Don't use a const qualified type for string literals in C and ObjC">;
	def fno_bitfield_type_align : Flag<["-"], "fno-bitfield-type-align">,
	HelpText<"Ignore bit-field types when aligning structures">;
	// Documented as an OpenCL testing aid only.
	def ffake_address_space_map : Flag<["-"], "ffake-address-space-map">,
	HelpText<"Use a fake address space map; OpenCL testing purposes only">;
	// Mangling mode for the address space map (OpenCL testing aid). The
	// meta-variable lists the alternatives separated by a plain '|'; a
	// backslash before '|' is not a valid TableGen string escape.
	def faddress_space_map_mangling_EQ : Joined<["-"], "faddress-space-map-mangling=">, MetaVarName<"<yes|no|target>">,
	HelpText<"Set the mode for address space map based mangling; OpenCL testing purposes only">;
	// Documented as for testing purposes only.
	def funknown_anytype : Flag<["-"], "funknown-anytype">,
	HelpText<"Enable parser support for the __unknown_anytype type; for testing purposes only">;
	// Debugger-oriented behaviour switches.
	def fdebugger_support : Flag<["-"], "fdebugger-support">,
	HelpText<"Enable special debugger support behavior">;
	def fdebugger_cast_result_to_id : Flag<["-"], "fdebugger-cast-result-to-id">,
	HelpText<"Enable casting unknown expression results to id">;
	def fdebugger_objc_literal : Flag<["-"], "fdebugger-objc-literal">,
	HelpText<"Enable special debugger support for Objective-C subscripting and literals">;
	// __DEPRECATED macro definition (positive/negative pair).
	def fdeprecated_macro : Flag<["-"], "fdeprecated-macro">,
	HelpText<"Defines the __DEPRECATED macro">;
	def fno_deprecated_macro : Flag<["-"], "fno-deprecated-macro">,
	HelpText<"Undefines the __DEPRECATED macro">;
	def fobjc_subscripting_legacy_runtime : Flag<["-"], "fobjc-subscripting-legacy-runtime">,
	HelpText<"Allow Objective-C array and dictionary subscripting in legacy runtime">;
	// Target of the clang-cl /vd alias defined later in this file.
	def vtordisp_mode_EQ : Joined<["-"], "vtordisp-mode=">,
	HelpText<"Control vtordisp placement on win32 targets">;
	// Half-precision (__fp16 / half) handling.
	def fnative_half_type: Flag<["-"], "fnative-half-type">,
	HelpText<"Use the native half type for __fp16 instead of promoting to float">;
	def fnative_half_arguments_and_returns : Flag<["-"], "fnative-half-arguments-and-returns">,
	HelpText<"Use the native __fp16 type for arguments and returns (and skip ABI-specific lowering)">;
	def fallow_half_arguments_and_returns : Flag<["-"], "fallow-half-arguments-and-returns">,
	HelpText<"Allow function arguments and returns of type half">;
	// Values<> lists the calling conventions that may be named.
	def fdefault_calling_conv_EQ : Joined<["-"], "fdefault-calling-conv=">,
	HelpText<"Set default calling convention">, Values<"cdecl,fastcall,stdcall,vectorcall,regcall">;
	// OpenCL conveniences: default header, builtin declarations, vec3 types.
	def finclude_default_header : Flag<["-"], "finclude-default-header">,
	HelpText<"Include default header file for OpenCL">;
	def fdeclare_opencl_builtins : Flag<["-"], "fdeclare-opencl-builtins">,
	HelpText<"Add OpenCL builtin function declarations (experimental)">;
	def fpreserve_vec3_type : Flag<["-"], "fpreserve-vec3-type">,
	HelpText<"Preserve 3-component vector type">;
	// wchar_t underlying type and signedness.
	def fwchar_type_EQ : Joined<["-"], "fwchar-type=">,
	HelpText<"Select underlying type for wchar_t">, Values<"char,short,int">;
	def fsigned_wchar : Flag<["-"], "fsigned-wchar">,
	HelpText<"Use a signed type for wchar_t">;
	def fno_signed_wchar : Flag<["-"], "fno-signed-wchar">,
	HelpText<"Use an unsigned type for wchar_t">;
	// Note: the record name contains "_param_" but the spelling does not.
	def fcompatibility_qualified_id_block_param_type_checking : Flag<["-"], "fcompatibility-qualified-id-block-type-checking">,
	HelpText<"Allow using blocks with parameters of more specific type than "
	"the type system guarantees when a parameter is qualified id">;

	// FIXME: Remove these entirely once functionality/tests have been excised.
	// Objective-C garbage collection switches; unlike their neighbours both are
	// explicitly placed in f_Group.
	def fobjc_gc_only : Flag<["-"], "fobjc-gc-only">, Group<f_Group>,
	HelpText<"Use GC exclusively for Objective-C related memory management">;
	def fobjc_gc : Flag<["-"], "fobjc-gc">, Group<f_Group>,
	HelpText<"Enable Objective-C garbage collection">;

	//===----------------------------------------------------------------------===//
	// Header Search Options
	//===----------------------------------------------------------------------===//

	def nostdsysteminc : Flag<["-"], "nostdsysteminc">,
	HelpText<"Disable standard system #include directories">;
	// Module hashing controls.
	def fdisable_module_hash : Flag<["-"], "fdisable-module-hash">,
	HelpText<"Disable the module hash">;
	def fmodules_hash_content : Flag<["-"], "fmodules-hash-content">,
	HelpText<"Enable hashing the content of a module file">;
	// The only option in this stretch with marshalling info: it is tied to
	// HeaderSearchOpts->ModulesStrictContextHash with a default of "false".
	def fmodules_strict_context_hash : Flag<["-"], "fmodules-strict-context-hash">,
	HelpText<"Enable hashing of all compiler options that could impact the "
	"semantics of a module in an implicit build">,
	MarshallingInfoFlag<"HeaderSearchOpts->ModulesStrictContextHash", "false">;
	// Per-language SYSTEM include directories (C, ObjC, ObjC++). The directory
	// may be joined to the option or given as the next argument.
	def c_isystem : JoinedOrSeparate<["-"], "c-isystem">, MetaVarName<"<directory>">,
	HelpText<"Add directory to the C SYSTEM include search path">;
	def objc_isystem : JoinedOrSeparate<["-"], "objc-isystem">,
	MetaVarName<"<directory>">,
	HelpText<"Add directory to the ObjC SYSTEM include search path">;
	def objcxx_isystem : JoinedOrSeparate<["-"], "objcxx-isystem">,
	MetaVarName<"<directory>">,
	HelpText<"Add directory to the ObjC++ SYSTEM include search path">;
	// Internal system include directories, assumed not to be user-provided;
	// the "externc" variant adds implicit extern "C" semantics.
	def internal_isystem : JoinedOrSeparate<["-"], "internal-isystem">,
	MetaVarName<"<directory>">,
	HelpText<"Add directory to the internal system include search path; these "
	"are assumed to not be user-provided and are used to model system "
	"and standard headers' paths.">;
	def internal_externc_isystem : JoinedOrSeparate<["-"], "internal-externc-isystem">,
	MetaVarName<"<directory>">,
	HelpText<"Add directory to the internal system include search path with "
	"implicit extern \"C\" semantics; these are assumed to not be "
	"user-provided and are used to model system and standard headers' "
	"paths.">;

	//===----------------------------------------------------------------------===//
	// Preprocessor Options
	//===----------------------------------------------------------------------===//

	// PCH chaining and precompiled-preamble size.
	def chain_include : Separate<["-"], "chain-include">, MetaVarName<"<file>">,
	HelpText<"Include and chain a header file after turning it into PCH">;
	def preamble_bytes_EQ : Joined<["-"], "preamble-bytes=">,
	HelpText<"Assume that the precompiled header is a precompiled preamble "
	"covering the first N bytes of the main file">;
	def detailed_preprocessing_record : Flag<["-"], "detailed-preprocessing-record">,
	HelpText<"include a detailed record of preprocessing actions">;
	// Preprocessor setup for the static analyzer; per the help text this
	// happens automatically when the analyzer itself is run.
	def setup_static_analyzer : Flag<["-"], "setup-static-analyzer">,
	HelpText<"Set up preprocessor for static analyzer (done automatically when static analyzer is run).">;
	// Testing aid that disables crash-inducing "#pragma clang __debug" forms.
	def disable_pragma_debug_crash : Flag<["-"], "disable-pragma-debug-crash">,
	HelpText<"Disable any #pragma clang __debug that can lead to crashing behavior. This is meant for testing.">;

	//===----------------------------------------------------------------------===//
	// OpenCL Options
	//===----------------------------------------------------------------------===//

	// Comma-separated list of OpenCL extension names, each prefixed by '+'
	// (enable) or '-' (disable).
	def cl_ext_EQ : CommaJoined<["-"], "cl-ext=">,
	HelpText<"OpenCL only. Enable or disable OpenCL extensions. The argument is a comma-separated sequence of one or more extension names, each prefixed by '+' or '-'.">;

	//===----------------------------------------------------------------------===//
	// CUDA Options
	//===----------------------------------------------------------------------===//

	// CUDA compilation mode, embedding of the device binary in the host
	// object, and device-code language relaxations.
	def fcuda_is_device : Flag<["-"], "fcuda-is-device">,
	HelpText<"Generate code for CUDA device">;
	def fcuda_include_gpubinary : Separate<["-"], "fcuda-include-gpubinary">,
	HelpText<"Incorporate CUDA device-side binary into host object file.">;
	def fcuda_allow_variadic_functions : Flag<["-"], "fcuda-allow-variadic-functions">,
	HelpText<"Allow variadic functions in CUDA device code.">;
	def fno_cuda_host_device_constexpr : Flag<["-"], "fno-cuda-host-device-constexpr">,
	HelpText<"Don't treat unattributed constexpr functions as __host__ __device__.">;

	//===----------------------------------------------------------------------===//
	// OpenMP Options
	//===----------------------------------------------------------------------===//

	// OpenMP target-device compilation and the path to the host-side IR file.
	def fopenmp_is_device : Flag<["-"], "fopenmp-is-device">,
	HelpText<"Generate code only for an OpenMP target device.">;
	def fopenmp_host_ir_file_path : Separate<["-"], "fopenmp-host-ir-file-path">,
	HelpText<"Path to the IR file produced by the frontend for the host.">;

	//===----------------------------------------------------------------------===//
	// SYCL Options
	//===----------------------------------------------------------------------===//

	// SYCL device-side compilation switch.
	def fsycl_is_device : Flag<["-"], "fsycl-is-device">,
	HelpText<"Generate code for SYCL device.">;

	} // let Flags = [CC1Option]

	//===----------------------------------------------------------------------===//
	// cc1as-only Options
	//===----------------------------------------------------------------------===//

	// Options in this scope carry CC1AsOption and NoDriverOption (no CC1Option).
	let Flags = [CC1AsOption, NoDriverOption] in {

	// Language Options
	def n : Flag<["-"], "n">,
	HelpText<"Don't automatically start assembly file with a text section">;

	// Frontend Options
	// The help text lists the accepted file types: 'asm', 'null', 'obj'.
	def filetype : Separate<["-"], "filetype">,
	HelpText<"Specify the output file type ('asm', 'null', or 'obj')">;

	// Transliterate Options
	def output_asm_variant : Separate<["-"], "output-asm-variant">,
	HelpText<"Select the asm variant index to use for output">;
	def show_encoding : Flag<["-"], "show-encoding">,
	HelpText<"Show instruction encoding information in transliterate mode">;
	def show_inst : Flag<["-"], "show-inst">,
	HelpText<"Show internal instruction representation in transliterate mode">;

	// Assemble Options
	def dwarf_debug_producer : Separate<["-"], "dwarf-debug-producer">,
	HelpText<"The string to embed in the Dwarf debug AT_producer record.">;

	def defsym : Separate<["-"], "defsym">,
	HelpText<"Define a value for a symbol">;

	} // let Flags = [CC1AsOption, NoDriverOption]

	//===----------------------------------------------------------------------===//
	// clang-cl Options
	//===----------------------------------------------------------------------===//

	// Root group for clang-cl options; flagged CLOption. Its HelpText is the
	// heading text for the group.
	def cl_Group : OptionGroup<"<clang-cl options>">, Flags<[CLOption]>,
	HelpText<"CL.EXE COMPATIBILITY OPTIONS">;

	// Sub-group of cl_Group for compile-only options.
	def cl_compile_Group : OptionGroup<"<clang-cl compile-only options>">,
	Group<cl_Group>;

	// Sub-group of cl_Group for options that are accepted but ignored.
	def cl_ignored_Group : OptionGroup<"<clang-cl ignored options>">,
	Group<cl_Group>;

	// Helper classes for declaring clang-cl options. Every class accepts both
	// "/" and "-" as the option prefix and carries the CLOption and
	// DriverOption flags; the classes differ only in option kind and group.

	// Valueless flag in cl_Group.
	class CLFlag<string name> : Option<["/", "-"], name, KIND_FLAG>,
	Group<cl_Group>, Flags<[CLOption, DriverOption]>;

	// Valueless flag in cl_compile_Group.
	class CLCompileFlag<string name> : Option<["/", "-"], name, KIND_FLAG>,
	Group<cl_compile_Group>, Flags<[CLOption, DriverOption]>;

	// Valueless flag in cl_ignored_Group. Unlike CLIgnoredJoined below, this
	// one is not marked HelpHidden.
	class CLIgnoredFlag<string name> : Option<["/", "-"], name, KIND_FLAG>,
	Group<cl_ignored_Group>, Flags<[CLOption, DriverOption]>;

	// Joined-value option in cl_Group.
	class CLJoined<string name> : Option<["/", "-"], name, KIND_JOINED>,
	Group<cl_Group>, Flags<[CLOption, DriverOption]>;

	// Joined-value option in cl_compile_Group.
	class CLCompileJoined<string name> : Option<["/", "-"], name, KIND_JOINED>,
	Group<cl_compile_Group>, Flags<[CLOption, DriverOption]>;

	// Joined-value option in cl_ignored_Group; additionally HelpHidden.
	class CLIgnoredJoined<string name> : Option<["/", "-"], name, KIND_JOINED>,
	Group<cl_ignored_Group>, Flags<[CLOption, DriverOption, HelpHidden]>;

	// Joined-or-separate option in cl_Group.
	class CLJoinedOrSeparate<string name> : Option<["/", "-"], name,
	KIND_JOINED_OR_SEPARATE>, Group<cl_Group>, Flags<[CLOption, DriverOption]>;

	// Joined-or-separate option in cl_compile_Group.
	class CLCompileJoinedOrSeparate<string name> : Option<["/", "-"], name,
	KIND_JOINED_OR_SEPARATE>, Group<cl_compile_Group>,
	Flags<[CLOption, DriverOption]>;

	// Remaining-args-joined option in cl_Group.
	class CLRemainingArgsJoined<string name> : Option<["/", "-"], name,
	KIND_REMAINING_ARGS_JOINED>, Group<cl_Group>, Flags<[CLOption, DriverOption]>;

	// Aliases:
	// (We don't put any of these in cl_compile_Group as the options they alias are
	// already in the right group.)

	// /Brepro and /Brepro- map onto the (no-)incremental-linker-compatible pair.
	def _SLASH_Brepro : CLFlag<"Brepro">,
	HelpText<"Do not write current time into COFF output (breaks link.exe /incremental)">,
	Alias<mno_incremental_linker_compatible>;
	def _SLASH_Brepro_ : CLFlag<"Brepro-">,
	HelpText<"Write current time into COFF output (default)">,
	Alias<mincremental_linker_compatible>;
	def _SLASH_C : CLFlag<"C">,
	HelpText<"Do not discard comments when preprocessing">, Alias<C>;
	def _SLASH_c : CLFlag<"c">, HelpText<"Compile only">, Alias<c>;
	def _SLASH_d1PP : CLFlag<"d1PP">,
	HelpText<"Retain macro definitions in /E mode">, Alias<dD>;
	// Forwarded as "-Xclang -fdump-record-layouts" through AliasArgs.
	def _SLASH_d1reportAllClassLayout : CLFlag<"d1reportAllClassLayout">,
	HelpText<"Dump record layout information">,
	Alias<Xclang>, AliasArgs<["-fdump-record-layouts"]>;
	// The /diagnostics:* flags are not aliases; no Alias<> is attached here.
	def _SLASH_diagnostics_caret : CLFlag<"diagnostics:caret">,
	HelpText<"Enable caret and column diagnostics (default)">;
	def _SLASH_diagnostics_column : CLFlag<"diagnostics:column">,
	HelpText<"Disable caret diagnostics but keep column info">;
	def _SLASH_diagnostics_classic : CLFlag<"diagnostics:classic">,
	HelpText<"Disable column and caret diagnostics">;
	def _SLASH_D : CLJoinedOrSeparate<"D">, HelpText<"Define macro">,
	MetaVarName<"<macro[=value]>">, Alias<D>;
	def _SLASH_E : CLFlag<"E">, HelpText<"Preprocess to stdout">, Alias<E>;
	// /fp:* aliases. All carry an empty HelpText. Note that /fp:precise and
	// /fp:strict both alias fno_fast_math.
	def _SLASH_fp_except : CLFlag<"fp:except">, HelpText<"">, Alias<ftrapping_math>;
	def _SLASH_fp_except_ : CLFlag<"fp:except-">,
	HelpText<"">, Alias<fno_trapping_math>;
	def _SLASH_fp_fast : CLFlag<"fp:fast">, HelpText<"">, Alias<ffast_math>;
	def _SLASH_fp_precise : CLFlag<"fp:precise">,
	HelpText<"">, Alias<fno_fast_math>;
	def _SLASH_fp_strict : CLFlag<"fp:strict">, HelpText<"">, Alias<fno_fast_math>;
	// /GA forwards to ftlsmodel_EQ with the fixed argument "local-exec".
	def _SLASH_GA : CLFlag<"GA">, Alias<ftlsmodel_EQ>, AliasArgs<["local-exec"]>,
	HelpText<"Assume thread-local variables are defined in the executable">;
	// /GR and /GR- have no Alias<>; they are not forwarded from this file.
	def _SLASH_GR : CLFlag<"GR">, HelpText<"Emit RTTI data (default)">;
	def _SLASH_GR_ : CLFlag<"GR-">, HelpText<"Do not emit RTTI data">;
	// /GF is declared with CLIgnoredFlag (accepted, in the ignored group),
	// while /GF- aliases fwritable_strings.
	def _SLASH_GF : CLIgnoredFlag<"GF">,
	HelpText<"Enable string pooling (default)">;
	def _SLASH_GF_ : CLFlag<"GF-">, HelpText<"Disable string pooling">,
	Alias<fwritable_strings>;
	def _SLASH_GS : CLFlag<"GS">,
	HelpText<"Enable buffer security check (default)">;
	def _SLASH_GS_ : CLFlag<"GS-">, HelpText<"Disable buffer security check">;
	// Anonymous record: bare /Gs aliases mstack_probe_size with a fixed "4096".
	// The named _SLASH_Gs that follows is the joined form taking an explicit
	// size.
	def : CLFlag<"Gs">, HelpText<"Use stack probes (default)">,
	Alias<mstack_probe_size>, AliasArgs<["4096"]>;
	def _SLASH_Gs : CLJoined<"Gs">,
	HelpText<"Set stack probe size (default 4096)">, Alias<mstack_probe_size>;
	// /Gy[-] and /Gw[-] map to function-sections and data-sections.
	def _SLASH_Gy : CLFlag<"Gy">, HelpText<"Put each function in its own section">,
	Alias<ffunction_sections>;
	def _SLASH_Gy_ : CLFlag<"Gy-">,
	HelpText<"Do not put each function in its own section (default)">,
	Alias<fno_function_sections>;
	def _SLASH_Gw : CLFlag<"Gw">, HelpText<"Put each data item in its own section">,
	Alias<fdata_sections>;
	def _SLASH_Gw_ : CLFlag<"Gw-">,
	HelpText<"Do not put each data item in its own section (default)">,
	Alias<fno_data_sections>;
	// /help and /HELP both alias help; only the lowercase spelling has HelpText.
	def _SLASH_help : CLFlag<"help">, Alias<help>,
	HelpText<"Display available options">;
	def _SLASH_HELP : CLFlag<"HELP">, Alias<help>;
	def _SLASH_I : CLJoinedOrSeparate<"I">,
	HelpText<"Add directory to include search path">, MetaVarName<"<dir>">,
	Alias<I>;
	def _SLASH_J : CLFlag<"J">, HelpText<"Make char type unsigned">,
	Alias<funsigned_char>;

	// The _SLASH_O option handles all the /O flags, but we also provide separate
	// aliased options to provide separate help messages.
	// Each anonymous record below is a fixed spelling that aliases _SLASH_O and
	// passes the text after "O" as its argument via AliasArgs.
	def _SLASH_O : CLJoined<"O">,
	HelpText<"Set multiple /O flags at once; e.g. '/O2y-' for '/O2 /Oy-'">,
	MetaVarName<"<flags>">;
	// Optimization level presets.
	def : CLFlag<"O1">, Alias<_SLASH_O>, AliasArgs<["1"]>,
	HelpText<"Optimize for size (like /Og /Os /Oy /Ob2 /GF /Gy)">;
	def : CLFlag<"O2">, Alias<_SLASH_O>, AliasArgs<["2"]>,
	HelpText<"Optimize for speed (like /Og /Oi /Ot /Oy /Ob2 /GF /Gy)">;
	// Inlining control.
	def : CLFlag<"Ob0">, Alias<_SLASH_O>, AliasArgs<["b0"]>,
	HelpText<"Disable function inlining">;
	def : CLFlag<"Ob1">, Alias<_SLASH_O>, AliasArgs<["b1"]>,
	HelpText<"Only inline functions explicitly or implicitly marked inline">;
	def : CLFlag<"Ob2">, Alias<_SLASH_O>, AliasArgs<["b2"]>,
	HelpText<"Inline functions as deemed beneficial by the compiler">;
	def : CLFlag<"Od">, Alias<_SLASH_O>, AliasArgs<["d"]>,
	HelpText<"Disable optimization">;
	// Accepted for compatibility; documented as having no effect.
	def : CLFlag<"Og">, Alias<_SLASH_O>, AliasArgs<["g"]>,
	HelpText<"No effect">;
	// Builtin function use.
	def : CLFlag<"Oi">, Alias<_SLASH_O>, AliasArgs<["i"]>,
	HelpText<"Enable use of builtin functions">;
	def : CLFlag<"Oi-">, Alias<_SLASH_O>, AliasArgs<["i-"]>,
	HelpText<"Disable use of builtin functions">;
	// Size versus speed preference.
	def : CLFlag<"Os">, Alias<_SLASH_O>, AliasArgs<["s"]>,
	HelpText<"Optimize for size">;
	def : CLFlag<"Ot">, Alias<_SLASH_O>, AliasArgs<["t"]>,
	HelpText<"Optimize for speed">;
	def : CLFlag<"Ox">, Alias<_SLASH_O>, AliasArgs<["x"]>,
	HelpText<"Deprecated (like /Og /Oi /Ot /Oy /Ob2); use /O2">;
	// Frame pointer omission (documented as x86 only).
	def : CLFlag<"Oy">, Alias<_SLASH_O>, AliasArgs<["y"]>,
	HelpText<"Enable frame pointer omission (x86 only)">;
	def : CLFlag<"Oy-">, Alias<_SLASH_O>, AliasArgs<["y-"]>,
	HelpText<"Disable frame pointer omission (x86 only, default)">;

	// "/?" is a third spelling that aliases help.
	def _SLASH_QUESTION : CLFlag<"?">, Alias<help>,
	HelpText<"Display available options">;
	def _SLASH_Qvec : CLFlag<"Qvec">,
	HelpText<"Enable the loop vectorization passes">, Alias<fvectorize>;
	def _SLASH_Qvec_ : CLFlag<"Qvec-">,
	HelpText<"Disable the loop vectorization passes">, Alias<fno_vectorize>;
	// /showIncludes and /showFilenames families; no Alias<> is attached to
	// these records.
	def _SLASH_showIncludes : CLFlag<"showIncludes">,
	HelpText<"Print info about included files to stderr">;
	def _SLASH_showIncludes_user : CLFlag<"showIncludes:user">,
	HelpText<"Like /showIncludes but omit system headers">;
	def _SLASH_showFilenames : CLFlag<"showFilenames">,
	HelpText<"Print the name of each compiled file">;
	def _SLASH_showFilenames_ : CLFlag<"showFilenames-">,
	HelpText<"Do not print the name of each compiled file (default)">;
	// Charset options use the compile-only joined class; the help text states
	// only UTF-8 is supported.
	def _SLASH_source_charset : CLCompileJoined<"source-charset:">,
	HelpText<"Set source encoding, supports only UTF-8">,
	Alias<finput_charset_EQ>;
	def _SLASH_execution_charset : CLCompileJoined<"execution-charset:">,
	HelpText<"Set runtime encoding, supports only UTF-8">,
	Alias<fexec_charset_EQ>;
	// Not an alias; the help text lists c++14, c++17 and c++latest.
	def _SLASH_std : CLCompileJoined<"std:">,
	HelpText<"Set C++ version (c++14,c++17,c++latest)">;
	def _SLASH_U : CLJoinedOrSeparate<"U">, HelpText<"Undefine macro">,
	MetaVarName<"<macro>">, Alias<U>;
	// /validate-charset[-] toggle the invalid-source-encoding warning through
	// W_Joined; neither record has HelpText.
	def _SLASH_validate_charset : CLFlag<"validate-charset">,
	Alias<W_Joined>, AliasArgs<["invalid-source-encoding"]>;
	def _SLASH_validate_charset_ : CLFlag<"validate-charset-">,
	Alias<W_Joined>, AliasArgs<["no-invalid-source-encoding"]>;
	// Warning levels: /W0 aliases w, /W1 through /W3 all alias Wall, /W4
	// aliases WCL4, and /Wall maps to "everything" through W_Joined.
	def _SLASH_W0 : CLFlag<"W0">, HelpText<"Disable all warnings">, Alias<w>;
	def _SLASH_W1 : CLFlag<"W1">, HelpText<"Enable -Wall">, Alias<Wall>;
	def _SLASH_W2 : CLFlag<"W2">, HelpText<"Enable -Wall">, Alias<Wall>;
	def _SLASH_W3 : CLFlag<"W3">, HelpText<"Enable -Wall">, Alias<Wall>;
	def _SLASH_W4 : CLFlag<"W4">, HelpText<"Enable -Wall and -Wextra">, Alias<WCL4>;
	def _SLASH_Wall : CLFlag<"Wall">, HelpText<"Enable -Weverything">,
	Alias<W_Joined>, AliasArgs<["everything"]>;
	// Warnings-as-errors toggles, forwarded as "error" / "no-error".
	def _SLASH_WX : CLFlag<"WX">, HelpText<"Treat warnings as errors">,
	Alias<W_Joined>, AliasArgs<["error"]>;
	def _SLASH_WX_ : CLFlag<"WX-">,
	HelpText<"Do not treat warnings as errors (default)">,
	Alias<W_Joined>, AliasArgs<["no-error"]>;
	def _SLASH_w_flag : CLFlag<"w">, HelpText<"Disable all warnings">, Alias<w>;
	// Individual /wdNNNN spellings, each forwarded to W_Joined with the
	// "no-<warning>" name given in AliasArgs.
	def _SLASH_wd4005 : CLFlag<"wd4005">, Alias<W_Joined>,
	AliasArgs<["no-macro-redefined"]>;
	def _SLASH_wd4018 : CLFlag<"wd4018">, Alias<W_Joined>,
	AliasArgs<["no-sign-compare"]>;
	def _SLASH_wd4100 : CLFlag<"wd4100">, Alias<W_Joined>,
	AliasArgs<["no-unused-parameter"]>;
	def _SLASH_wd4910 : CLFlag<"wd4910">, Alias<W_Joined>,
	AliasArgs<["no-dllexport-explicit-instantiation-decl"]>;
	def _SLASH_wd4996 : CLFlag<"wd4996">, Alias<W_Joined>,
	AliasArgs<["no-deprecated-declarations"]>;
	// /vd<value> forwards its joined value to the vtordisp mode option.
	def _SLASH_vd : CLJoined<"vd">, HelpText<"Control vtordisp placement">,
	Alias<vtordisp_mode_EQ>;
	// /X forwards to nostdlibinc.
	def _SLASH_X : CLFlag<"X">,
	HelpText<"Do not add %INCLUDE% to include search path">, Alias<nostdlibinc>;
	// /Zc:<feature> and /Zc:<feature>- pairs: the positive spelling forwards to
	// the f<feature> option and the trailing-dash spelling to the matching
	// fno_<feature> option.
	def _SLASH_Zc_sizedDealloc : CLFlag<"Zc:sizedDealloc">,
	HelpText<"Enable C++14 sized global deallocation functions">,
	Alias<fsized_deallocation>;
	def _SLASH_Zc_sizedDealloc_ : CLFlag<"Zc:sizedDealloc-">,
	HelpText<"Disable C++14 sized global deallocation functions">,
	Alias<fno_sized_deallocation>;
	def _SLASH_Zc_alignedNew : CLFlag<"Zc:alignedNew">,
	HelpText<"Enable C++17 aligned allocation functions">,
	Alias<faligned_allocation>;
	def _SLASH_Zc_alignedNew_ : CLFlag<"Zc:alignedNew-">,
	HelpText<"Disable C++17 aligned allocation functions">,
	Alias<fno_aligned_allocation>;
	def _SLASH_Zc_char8_t : CLFlag<"Zc:char8_t">,
	HelpText<"Enable char8_t from C++2a">,
	Alias<fchar8__t>;
	// Negative form of /Zc:char8_t; forwards to fno_char8__t. The help text
	// spells the standard as "C++2a", matching the positive option.
	def _SLASH_Zc_char8_t_ : CLFlag<"Zc:char8_t-">,
	HelpText<"Disable char8_t from C++2a">,
	Alias<fno_char8__t>;
	// /Zc:strictStrings is expressed as a warning promoted to an error: it
	// expands to the joined -W option with
	// "error=c++11-compat-deprecated-writable-strings".
	def _SLASH_Zc_strictStrings : CLFlag<"Zc:strictStrings">,
	HelpText<"Treat string literals as const">, Alias<W_Joined>,
	AliasArgs<["error=c++11-compat-deprecated-writable-strings"]>;
	def _SLASH_Zc_threadSafeInit : CLFlag<"Zc:threadSafeInit">,
	HelpText<"Enable thread-safe initialization of static variables">,
	Alias<fthreadsafe_statics>;
	def _SLASH_Zc_threadSafeInit_ : CLFlag<"Zc:threadSafeInit-">,
	HelpText<"Disable thread-safe initialization of static variables">,
	Alias<fno_threadsafe_statics>;
	def _SLASH_Zc_trigraphs : CLFlag<"Zc:trigraphs">,
	HelpText<"Enable trigraphs">, Alias<ftrigraphs>;
	def _SLASH_Zc_trigraphs_off : CLFlag<"Zc:trigraphs-">,
	HelpText<"Disable trigraphs (default)">, Alias<fno_trigraphs>;
	// Note the inverted mapping: enabling two-phase lookup forwards to the
	// option that turns delayed template parsing OFF, and vice versa.
	def _SLASH_Zc_twoPhase : CLFlag<"Zc:twoPhase">,
	HelpText<"Enable two-phase name lookup in templates">,
	Alias<fno_delayed_template_parsing>;
	def _SLASH_Zc_twoPhase_ : CLFlag<"Zc:twoPhase-">,
	HelpText<"Disable two-phase name lookup in templates (default)">,
	Alias<fdelayed_template_parsing>;
	// Debug-info flags. /Z7 and /Zd have no Alias; /Zi is an alias of /Z7.
	def _SLASH_Z7 : CLFlag<"Z7">,
	HelpText<"Enable CodeView debug information in object files">;
	def _SLASH_Zd : CLFlag<"Zd">,
	HelpText<"Emit debug line number tables only">;
	def _SLASH_Zi : CLFlag<"Zi">, Alias<_SLASH_Z7>,
	HelpText<"Like /Z7">;
	// /Zp<n> forwards its joined value to the pack-struct option; a bare /Zp
	// is a separate flag record that supplies the fixed argument "1".
	def _SLASH_Zp : CLJoined<"Zp">,
	HelpText<"Set default maximum struct packing alignment">,
	Alias<fpack_struct_EQ>;
	def _SLASH_Zp_flag : CLFlag<"Zp">,
	HelpText<"Set default maximum struct packing alignment to 1">,
	Alias<fpack_struct_EQ>, AliasArgs<["1"]>;
	def _SLASH_Zs : CLFlag<"Zs">, HelpText<"Syntax-check only">,
	Alias<fsyntax_only>;
	// OpenMP spellings. /openmp and /openmp:experimental both forward to the
	// same fopenmp option; /openmp- forwards to fno_openmp.
	def _SLASH_openmp_ : CLFlag<"openmp-">,
	HelpText<"Disable OpenMP support">, Alias<fno_openmp>;
	def _SLASH_openmp : CLFlag<"openmp">, HelpText<"Enable OpenMP support">,
	Alias<fopenmp>;
	def _SLASH_openmp_experimental : CLFlag<"openmp:experimental">,
	HelpText<"Enable OpenMP support with experimental SIMD support">,
	Alias<fopenmp>;

	// Non-aliases:

	// The records below carry no Alias (with the exception of /FI, which
	// forwards to include_).
	def _SLASH_arch : CLCompileJoined<"arch:">,
	HelpText<"Set architecture for code generation">;

	// Option groups used by the /M* run-time selectors and the /volatile:*
	// options further down; both groups are themselves members of
	// cl_compile_Group.
	def _SLASH_M_Group : OptionGroup<"</M group>">, Group<cl_compile_Group>;
	def _SLASH_volatile_Group : OptionGroup<"</volatile group>">,
	Group<cl_compile_Group>;

	def _SLASH_EH : CLJoined<"EH">, HelpText<"Set exception handling model">;
	def _SLASH_EP : CLFlag<"EP">,
	HelpText<"Disable linemarker output and preprocess to stdout">;
	// /FA is a plain flag; /Fa takes a joined file or directory name.
	def _SLASH_FA : CLFlag<"FA">,
	HelpText<"Output assembly code file during compilation">;
	def _SLASH_Fa : CLJoined<"Fa">,
	HelpText<"Set assembly output file name (with /FA)">,
	MetaVarName<"<file or dir/>">;
	def _SLASH_fallback : CLCompileFlag<"fallback">,
	HelpText<"Fall back to cl.exe if clang-cl fails to compile">;
	def _SLASH_FI : CLJoinedOrSeparate<"FI">,
	HelpText<"Include file before parsing">, Alias<include_>;
	// Output-name options: /Fe (executable), /Fi (preprocessor output) and
	// /Fo (object file), each taking a joined value.
	def _SLASH_Fe : CLJoined<"Fe">,
	HelpText<"Set output executable file name">,
	MetaVarName<"<file or dir/>">;
	def _SLASH_Fi : CLCompileJoined<"Fi">,
	HelpText<"Set preprocess output file name (with /P)">,
	MetaVarName<"<file>">;
	def _SLASH_Fo : CLCompileJoined<"Fo">,
	HelpText<"Set output object file (with /c)">,
	MetaVarName<"<file or dir/>">;
	def _SLASH_guard : CLJoined<"guard:">,
	HelpText<"Enable Control Flow Guard with /guard:cf, or only the table with /guard:cf,nochecks">;
	def _SLASH_GX : CLFlag<"GX">,
	HelpText<"Deprecated; use /EHsc">;
	def _SLASH_GX_ : CLFlag<"GX-">,
	HelpText<"Deprecated (like not passing /EH)">;
	def _SLASH_imsvc : CLJoinedOrSeparate<"imsvc">,
	HelpText<"Add <dir> to system include search path, as if in %INCLUDE%">,
	MetaVarName<"<dir>">;
	def _SLASH_LD : CLFlag<"LD">, HelpText<"Create DLL">;
	def _SLASH_LDd : CLFlag<"LDd">, HelpText<"Create debug DLL">;
	// /link takes all remaining arguments.
	def _SLASH_link : CLRemainingArgsJoined<"link">,
	HelpText<"Forward options to the linker">, MetaVarName<"<options>">;
	// Run-time library selectors. These are written with the raw Option class
	// (accepting both "/" and "-" prefixes) so that they can be placed in
	// _SLASH_M_Group.
	def _SLASH_MD : Option<["/", "-"], "MD", KIND_FLAG>, Group<_SLASH_M_Group>,
	Flags<[CLOption, DriverOption]>, HelpText<"Use DLL run-time">;
	def _SLASH_MDd : Option<["/", "-"], "MDd", KIND_FLAG>, Group<_SLASH_M_Group>,
	Flags<[CLOption, DriverOption]>, HelpText<"Use DLL debug run-time">;
	def _SLASH_MT : Option<["/", "-"], "MT", KIND_FLAG>, Group<_SLASH_M_Group>,
	Flags<[CLOption, DriverOption]>, HelpText<"Use static run-time">;
	def _SLASH_MTd : Option<["/", "-"], "MTd", KIND_FLAG>, Group<_SLASH_M_Group>,
	Flags<[CLOption, DriverOption]>, HelpText<"Use static debug run-time">;
	// /o is the deprecated spelling for naming the output. Its help text points
	// at the two options that replace it: /Fe ("Set output executable file
	// name") and /Fo ("Set output object file").
	def _SLASH_o : CLJoinedOrSeparate<"o">,
	HelpText<"Deprecated (set output file name); use /Fe or /Fo">,
	MetaVarName<"<file or dir/>">;
	def _SLASH_P : CLFlag<"P">, HelpText<"Preprocess to file">;
	// Source-language selection: /Tc and /Tp take a file name (joined or
	// separate); /TC and /TP are plain flags that apply to all source files.
	def _SLASH_Tc : CLCompileJoinedOrSeparate<"Tc">,
	HelpText<"Treat <file> as C source file">, MetaVarName<"<file>">;
	def _SLASH_TC : CLCompileFlag<"TC">, HelpText<"Treat all source files as C">;
	def _SLASH_Tp : CLCompileJoinedOrSeparate<"Tp">,
	HelpText<"Treat <file> as C++ source file">, MetaVarName<"<file>">;
	def _SLASH_TP : CLCompileFlag<"TP">, HelpText<"Treat all source files as C++">;
	// /volatile:iso and /volatile:ms (below) both belong to
	// _SLASH_volatile_Group and accept "/" or "-" as prefix.
	def _SLASH_volatile_iso : Option<["/", "-"], "volatile:iso", KIND_FLAG>,
	Group<_SLASH_volatile_Group>, Flags<[CLOption, DriverOption]>,
	HelpText<"Volatile loads and stores have standard semantics">;
	// Member-pointer representation flags (/vmb, /vmg, /vms, /vmm, /vmv).
	def _SLASH_vmb : CLFlag<"vmb">,
	HelpText<"Use a best-case representation method for member pointers">;
	def _SLASH_vmg : CLFlag<"vmg">,
	HelpText<"Use a most-general representation for member pointers">;
	def _SLASH_vms : CLFlag<"vms">,
	HelpText<"Set the default most-general representation to single inheritance">;
	def _SLASH_vmm : CLFlag<"vmm">,
	HelpText<"Set the default most-general representation to "
	"multiple inheritance">;
	def _SLASH_vmv : CLFlag<"vmv">,
	HelpText<"Set the default most-general representation to "
	"virtual inheritance">;
	def _SLASH_volatile_ms : Option<["/", "-"], "volatile:ms", KIND_FLAG>,
	Group<_SLASH_volatile_Group>, Flags<[CLOption, DriverOption]>,
	HelpText<"Volatile loads and stores have acquire and release semantics">;
	// /clang:<arg> carries a joined argument destined for the clang driver.
	def _SLASH_clang : CLJoined<"clang:">,
	HelpText<"Pass <arg> to the clang driver">, MetaVarName<"<arg>">;
	def _SLASH_Zl : CLFlag<"Zl">,
	HelpText<"Do not let object file auto-link default libraries">;

	// Precompiled-header options: /Yc (create), /Yu (use), /Y- (disable both)
	// and, further down, /Fp (pch file name).
	def _SLASH_Yc : CLJoined<"Yc">,
	HelpText<"Generate a pch file for all code up to and including <filename>">,
	MetaVarName<"<filename>">;
	def _SLASH_Yu : CLJoined<"Yu">,
	HelpText<"Load a pch file and use it instead of all code up to "
	"and including <filename>">,
	MetaVarName<"<filename>">;
	def _SLASH_Y_ : CLFlag<"Y-">,
	HelpText<"Disable precompiled headers, overrides /Yc and /Yu">;
	def _SLASH_Zc_dllexportInlines : CLFlag<"Zc:dllexportInlines">,
	HelpText<"dllexport/dllimport inline member functions of dllexport/import classes (default)">;
	def _SLASH_Zc_dllexportInlines_ : CLFlag<"Zc:dllexportInlines-">,
	HelpText<"Do not dllexport/dllimport inline member functions of dllexport/import classes">;
	def _SLASH_Fp : CLJoined<"Fp">,
	HelpText<"Set pch file name (with /Yc and /Yu)">, MetaVarName<"<file>">;

	// Default calling-convention selectors, one flag per convention.
	def _SLASH_Gd : CLFlag<"Gd">,
	HelpText<"Set __cdecl as a default calling convention">;
	def _SLASH_Gr : CLFlag<"Gr">,
	HelpText<"Set __fastcall as a default calling convention">;
	def _SLASH_Gz : CLFlag<"Gz">,
	HelpText<"Set __stdcall as a default calling convention">;
	def _SLASH_Gv : CLFlag<"Gv">,
	HelpText<"Set __vectorcall as a default calling convention">;
	def _SLASH_Gregcall : CLFlag<"Gregcall">,
	HelpText<"Set __regcall as a default calling convention">;

	// Ignored:

	// Every record in this section is built from CLIgnoredFlag (no value) or
	// CLIgnoredJoined (value attached to the option name). Only /utf-8 carries
	// help text. NOTE(review): the CLIgnored* classes are defined outside this
	// excerpt; presumably they put the option in an "ignored" group -- confirm.
	def _SLASH_analyze_ : CLIgnoredFlag<"analyze-">;
	def _SLASH_bigobj : CLIgnoredFlag<"bigobj">;
	def _SLASH_cgthreads : CLIgnoredJoined<"cgthreads">;
	def _SLASH_d2FastFail : CLIgnoredFlag<"d2FastFail">;
	def _SLASH_d2Zi_PLUS : CLIgnoredFlag<"d2Zi+">;
	def _SLASH_errorReport : CLIgnoredJoined<"errorReport">;
	def _SLASH_FC : CLIgnoredFlag<"FC">;
	def _SLASH_Fd : CLIgnoredJoined<"Fd">;
	def _SLASH_FS : CLIgnoredFlag<"FS">;
	def _SLASH_JMC : CLIgnoredFlag<"JMC">;
	def _SLASH_kernel_ : CLIgnoredFlag<"kernel-">;
	def _SLASH_nologo : CLIgnoredFlag<"nologo">;
	def _SLASH_permissive_ : CLIgnoredFlag<"permissive-">;
	def _SLASH_RTC : CLIgnoredJoined<"RTC">;
	def _SLASH_sdl : CLIgnoredFlag<"sdl">;
	def _SLASH_sdl_ : CLIgnoredFlag<"sdl-">;
	def _SLASH_utf8 : CLIgnoredFlag<"utf-8">,
	HelpText<"Set source and runtime encoding to UTF-8 (default)">;
	// Joined "w..." spellings; distinct from the exact /w flag record
	// (_SLASH_w_flag) and the specific /wdNNNN records declared earlier.
	def _SLASH_w : CLIgnoredJoined<"w">;
	def _SLASH_Zc___cplusplus : CLIgnoredFlag<"Zc:__cplusplus">;
	def _SLASH_Zc_auto : CLIgnoredFlag<"Zc:auto">;
	def _SLASH_Zc_forScope : CLIgnoredFlag<"Zc:forScope">;
	def _SLASH_Zc_inline : CLIgnoredFlag<"Zc:inline">;
	def _SLASH_Zc_rvalueCast : CLIgnoredFlag<"Zc:rvalueCast">;
	def _SLASH_Zc_ternary : CLIgnoredFlag<"Zc:ternary">;
	def _SLASH_Zc_wchar_t : CLIgnoredFlag<"Zc:wchar_t">;
	def _SLASH_ZH_MD5 : CLIgnoredFlag<"ZH:MD5">;
	def _SLASH_ZH_SHA1 : CLIgnoredFlag<"ZH:SHA1">;
	def _SLASH_ZH_SHA_256 : CLIgnoredFlag<"ZH:SHA_256">;
	def _SLASH_Zm : CLIgnoredJoined<"Zm">;
	def _SLASH_Zo : CLIgnoredFlag<"Zo">;
	def _SLASH_Zo_ : CLIgnoredFlag<"Zo-">;


	// Unsupported:

	// The records in this section only declare the option spelling: none of
	// them has an Alias or HelpText. Several are joined catch-alls sharing a
	// prefix with more specific records declared earlier (e.g. "FA", "Zc:",
	// "d2").
	def _SLASH_await : CLFlag<"await">;
	def _SLASH_constexpr : CLJoined<"constexpr:">;
	def _SLASH_AI : CLJoinedOrSeparate<"AI">;
	def _SLASH_Bt : CLFlag<"Bt">;
	def _SLASH_Bt_plus : CLFlag<"Bt+">;
	def _SLASH_clr : CLJoined<"clr">;
	def _SLASH_d2 : CLJoined<"d2">;
	def _SLASH_doc : CLJoined<"doc">;
	def _SLASH_FA_joined : CLJoined<"FA">;
	def _SLASH_favor : CLJoined<"favor">;
	def _SLASH_F : CLJoinedOrSeparate<"F">;
	def _SLASH_Fm : CLJoined<"Fm">;
	def _SLASH_Fr : CLJoined<"Fr">;
	def _SLASH_FR : CLJoined<"FR">;
	def _SLASH_FU : CLJoinedOrSeparate<"FU">;
	def _SLASH_Fx : CLFlag<"Fx">;
	def _SLASH_G1 : CLFlag<"G1">;
	def _SLASH_G2 : CLFlag<"G2">;
	def _SLASH_Ge : CLFlag<"Ge">;
	def _SLASH_Gh : CLFlag<"Gh">;
	def _SLASH_GH : CLFlag<"GH">;
	def _SLASH_GL : CLFlag<"GL">;
	def _SLASH_GL_ : CLFlag<"GL-">;
	def _SLASH_Gm : CLFlag<"Gm">;
	def _SLASH_Gm_ : CLFlag<"Gm-">;
	def _SLASH_GT : CLFlag<"GT">;
	def _SLASH_GZ : CLFlag<"GZ">;
	def _SLASH_H : CLFlag<"H">;
	def _SLASH_homeparams : CLFlag<"homeparams">;
	def _SLASH_hotpatch : CLFlag<"hotpatch">;
	def _SLASH_kernel : CLFlag<"kernel">;
	def _SLASH_LN : CLFlag<"LN">;
	def _SLASH_MP : CLJoined<"MP">;
	def _SLASH_Qfast_transcendentals : CLFlag<"Qfast_transcendentals">;
	def _SLASH_QIfist : CLFlag<"QIfist">;
	def _SLASH_QIntel_jcc_erratum : CLFlag<"QIntel-jcc-erratum">;
	def _SLASH_Qimprecise_fwaits : CLFlag<"Qimprecise_fwaits">;
	def _SLASH_Qpar : CLFlag<"Qpar">;
	def _SLASH_Qpar_report : CLJoined<"Qpar-report">;
	def _SLASH_Qsafe_fp_loads : CLFlag<"Qsafe_fp_loads">;
	def _SLASH_Qspectre : CLFlag<"Qspectre">;
	def _SLASH_Qspectre_load : CLFlag<"Qspectre-load">;
	def _SLASH_Qspectre_load_cf : CLFlag<"Qspectre-load-cf">;
	def _SLASH_Qvec_report : CLJoined<"Qvec-report">;
	def _SLASH_u : CLFlag<"u">;
	def _SLASH_V : CLFlag<"V">;
	def _SLASH_WL : CLFlag<"WL">;
	def _SLASH_Wp64 : CLFlag<"Wp64">;
	def _SLASH_Yd : CLFlag<"Yd">;
	def _SLASH_Yl : CLJoined<"Yl">;
	def _SLASH_Za : CLFlag<"Za">;
	def _SLASH_Zc : CLJoined<"Zc:">;
	def _SLASH_Ze : CLFlag<"Ze">;
	def _SLASH_Zg : CLFlag<"Zg">;
	def _SLASH_ZI : CLFlag<"ZI">;
	def _SLASH_ZW : CLJoined<"ZW">;
	diff --git a/contrib/llvm-project/clang/lib/AST/ASTContext.cpp b/contrib/llvm-project/clang/lib/AST/ASTContext.cpp
	index 2ba643f12a82..e3798bb46e86 100644
	--- a/contrib/llvm-project/clang/lib/AST/ASTContext.cpp
	+++ b/contrib/llvm-project/clang/lib/AST/ASTContext.cpp
	@@ -1,11166 +1,11165 @@
	//===- ASTContext.cpp - Context to hold long-lived AST nodes --------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the ASTContext interface.
	//
	//===----------------------------------------------------------------------===//

	#include "clang/AST/ASTContext.h"
	#include "CXXABI.h"
	#include "Interp/Context.h"
	#include "clang/AST/APValue.h"
	#include "clang/AST/ASTConcept.h"
	#include "clang/AST/ASTMutationListener.h"
	#include "clang/AST/ASTTypeTraits.h"
	#include "clang/AST/Attr.h"
	#include "clang/AST/AttrIterator.h"
	#include "clang/AST/CharUnits.h"
	#include "clang/AST/Comment.h"
	#include "clang/AST/Decl.h"
	#include "clang/AST/DeclBase.h"
	#include "clang/AST/DeclCXX.h"
	#include "clang/AST/DeclContextInternals.h"
	#include "clang/AST/DeclObjC.h"
	#include "clang/AST/DeclOpenMP.h"
	#include "clang/AST/DeclTemplate.h"
	#include "clang/AST/DeclarationName.h"
	#include "clang/AST/DependenceFlags.h"
	#include "clang/AST/Expr.h"
	#include "clang/AST/ExprCXX.h"
	#include "clang/AST/ExprConcepts.h"
	#include "clang/AST/ExternalASTSource.h"
	#include "clang/AST/Mangle.h"
	#include "clang/AST/MangleNumberingContext.h"
	#include "clang/AST/NestedNameSpecifier.h"
	#include "clang/AST/ParentMapContext.h"
	#include "clang/AST/RawCommentList.h"
	#include "clang/AST/RecordLayout.h"
	#include "clang/AST/Stmt.h"
	#include "clang/AST/TemplateBase.h"
	#include "clang/AST/TemplateName.h"
	#include "clang/AST/Type.h"
	#include "clang/AST/TypeLoc.h"
	#include "clang/AST/UnresolvedSet.h"
	#include "clang/AST/VTableBuilder.h"
	#include "clang/Basic/AddressSpaces.h"
	#include "clang/Basic/Builtins.h"
	#include "clang/Basic/CommentOptions.h"
	#include "clang/Basic/ExceptionSpecificationType.h"
	#include "clang/Basic/FixedPoint.h"
	#include "clang/Basic/IdentifierTable.h"
	#include "clang/Basic/LLVM.h"
	#include "clang/Basic/LangOptions.h"
	#include "clang/Basic/Linkage.h"
	#include "clang/Basic/Module.h"
	#include "clang/Basic/ObjCRuntime.h"
	#include "clang/Basic/SanitizerBlacklist.h"
	#include "clang/Basic/SourceLocation.h"
	#include "clang/Basic/SourceManager.h"
	#include "clang/Basic/Specifiers.h"
	#include "clang/Basic/TargetCXXABI.h"
	#include "clang/Basic/TargetInfo.h"
	#include "clang/Basic/XRayLists.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/APSInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/FoldingSet.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/PointerUnion.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/Support/Capacity.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <cstdlib>
	#include <map>
	#include <memory>
	#include <string>
	#include <tuple>
	#include <utility>

	using namespace clang;

	/// Rank tags for the floating-point kinds. The enumerators take consecutive
	/// values starting at zero, in the order written below.
	enum FloatingRank {
	  BFloat16Rank,
	  Float16Rank,
	  HalfRank,
	  FloatRank,
	  DoubleRank,
	  LongDoubleRank,
	  Float128Rank
	};

	/// \returns location that is relevant when searching for Doc comments related
	/// to \p D. An invalid (default-constructed) location is returned for the
	/// kinds of declarations filtered out below, which never get a comment of
	/// their own.
	///
	/// \param D the declaration to inspect; must be non-null.
	/// \param SourceMgr used only to resolve macro-argument expansions for tag
	/// declarations.
	static SourceLocation getDeclLocForCommentSearch(const Decl *D,
	                                                 SourceManager &SourceMgr) {
	  assert(D);

	  // User can not attach documentation to implicit declarations.
	  if (D->isImplicit())
	    return {};

	  // User can not attach documentation to implicit instantiations.
	  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
	    if (FD->getTemplateSpecializationKind() == TSK_ImplicitInstantiation)
	      return {};
	  }

	  if (const auto *VD = dyn_cast<VarDecl>(D)) {
	    if (VD->isStaticDataMember() &&
	        VD->getTemplateSpecializationKind() == TSK_ImplicitInstantiation)
	      return {};
	  }

	  if (const auto *CRD = dyn_cast<CXXRecordDecl>(D)) {
	    if (CRD->getTemplateSpecializationKind() == TSK_ImplicitInstantiation)
	      return {};
	  }

	  if (const auto *CTSD = dyn_cast<ClassTemplateSpecializationDecl>(D)) {
	    TemplateSpecializationKind TSK = CTSD->getSpecializationKind();
	    if (TSK == TSK_ImplicitInstantiation ||
	        TSK == TSK_Undeclared)
	      return {};
	  }

	  if (const auto *ED = dyn_cast<EnumDecl>(D)) {
	    if (ED->getTemplateSpecializationKind() == TSK_ImplicitInstantiation)
	      return {};
	  }
	  if (const auto *TD = dyn_cast<TagDecl>(D)) {
	    // When tag declaration (but not definition!) is part of the
	    // decl-specifier-seq of some other declaration, it doesn't get comment
	    if (TD->isEmbeddedInDeclarator() && !TD->isCompleteDefinition())
	      return {};
	  }
	  // TODO: handle comments for function parameters properly.
	  if (isa<ParmVarDecl>(D))
	    return {};

	  // TODO: we could look up template parameter documentation in the template
	  // documentation.
	  if (isa<TemplateTypeParmDecl>(D) ||
	      isa<NonTypeTemplateParmDecl>(D) ||
	      isa<TemplateTemplateParmDecl>(D))
	    return {};

	  // Find declaration location.
	  // For Objective-C declarations we generally don't expect to have multiple
	  // declarators, thus use declaration starting location as the "declaration
	  // location".
	  // For all other declarations multiple declarators are used quite frequently,
	  // so we use the location of the identifier as the "declaration location".
	  if (isa<ObjCMethodDecl>(D) || isa<ObjCContainerDecl>(D) ||
	      isa<ObjCPropertyDecl>(D) ||
	      isa<RedeclarableTemplateDecl>(D) ||
	      isa<ClassTemplateSpecializationDecl>(D) ||
	      // Allow association with Y across {} in `typedef struct X {} Y`.
	      isa<TypedefDecl>(D))
	    return D->getBeginLoc();

	  const SourceLocation DeclLoc = D->getLocation();
	  if (DeclLoc.isMacroID()) {
	    // Typedefs have already returned above, so only tag declarations need
	    // macro-specific handling here.
	    if (const auto *TD = dyn_cast<TagDecl>(D)) {
	      // If location of the tag decl is inside a macro, but the spelling of
	      // the tag name comes from a macro argument, it looks like a special
	      // macro like NS_ENUM is being used to define the tag decl. In that
	      // case, adjust the source location to the expansion loc so that we can
	      // attach the comment to the tag decl.
	      if (SourceMgr.isMacroArgExpansion(DeclLoc) &&
	          TD->isCompleteDefinition())
	        return SourceMgr.getExpansionLoc(DeclLoc);
	    }
	  }
	  return DeclLoc;
	}

	/// Searches the comments of one file for the raw comment that belongs to
	/// \p D, without consulting or updating any cache.
	///
	/// \param D the declaration; only its kind is inspected here, to decide
	/// whether a trailing comment may be attached to it.
	/// \param RepresentativeLocForDecl the location around which to search. It
	/// must be a valid file location, otherwise nullptr is returned.
	/// \param CommentsInTheFile the comments of the file, keyed by an unsigned
	/// value that is compared against the declaration's offset in the file.
	/// \returns a trailing comment that starts on the declaration's line, else
	/// the comment immediately preceding the declaration when nothing but
	/// "harmless" text separates them, else nullptr.
	RawComment *ASTContext::getRawCommentForDeclNoCacheImpl(
	    const Decl *D, const SourceLocation RepresentativeLocForDecl,
	    const std::map<unsigned, RawComment *> &CommentsInTheFile) const {
	  // If the declaration doesn't map directly to a location in a file, we
	  // can't find the comment.
	  if (RepresentativeLocForDecl.isInvalid() ||
	      !RepresentativeLocForDecl.isFileID())
	    return nullptr;

	  // If there are no comments anywhere, we won't find anything.
	  if (CommentsInTheFile.empty())
	    return nullptr;

	  // Decompose the location for the declaration and find the beginning of the
	  // file buffer.
	  const std::pair<FileID, unsigned> DeclLocDecomp =
	      SourceMgr.getDecomposedLoc(RepresentativeLocForDecl);

	  // Find the first comment whose key is not less than the declaration's
	  // offset, i.e. the first candidate for a trailing comment.
	  auto OffsetCommentBehindDecl =
	      CommentsInTheFile.lower_bound(DeclLocDecomp.second);

	  // First check whether we have a trailing comment.
	  if (OffsetCommentBehindDecl != CommentsInTheFile.end()) {
	    RawComment *CommentBehindDecl = OffsetCommentBehindDecl->second;
	    if ((CommentBehindDecl->isDocumentation() ||
	         LangOpts.CommentOpts.ParseAllComments) &&
	        CommentBehindDecl->isTrailingComment() &&
	        (isa<FieldDecl>(D) || isa<EnumConstantDecl>(D) || isa<VarDecl>(D) ||
	         isa<ObjCMethodDecl>(D) || isa<ObjCPropertyDecl>(D))) {

	      // Check that Doxygen trailing comment comes after the declaration, starts
	      // on the same line and in the same file as the declaration.
	      if (SourceMgr.getLineNumber(DeclLocDecomp.first, DeclLocDecomp.second) ==
	          Comments.getCommentBeginLine(CommentBehindDecl, DeclLocDecomp.first,
	                                       OffsetCommentBehindDecl->first)) {
	        return CommentBehindDecl;
	      }
	    }
	  }

	  // The comment just after the declaration was not a trailing comment.
	  // Let's look at the previous comment.
	  if (OffsetCommentBehindDecl == CommentsInTheFile.begin())
	    return nullptr;

	  auto OffsetCommentBeforeDecl = --OffsetCommentBehindDecl;
	  RawComment *CommentBeforeDecl = OffsetCommentBeforeDecl->second;

	  // Check that we actually have a non-member Doxygen comment.
	  if (!(CommentBeforeDecl->isDocumentation() ||
	        LangOpts.CommentOpts.ParseAllComments) ||
	      CommentBeforeDecl->isTrailingComment())
	    return nullptr;

	  // Decompose the end of the comment.
	  const unsigned CommentEndOffset =
	      Comments.getCommentEndOffset(CommentBeforeDecl);

	  // Get the corresponding buffer.
	  bool Invalid = false;
	  const char *Buffer = SourceMgr.getBufferData(DeclLocDecomp.first,
	                                               &Invalid).data();
	  if (Invalid)
	    return nullptr;

	  // Extract text between the comment and declaration.
	  StringRef Text(Buffer + CommentEndOffset,
	                 DeclLocDecomp.second - CommentEndOffset);

	  // There should be no other declarations or preprocessor directives between
	  // comment and declaration.
	  if (Text.find_first_of(";{}#@") != StringRef::npos)
	    return nullptr;

	  return CommentBeforeDecl;
	}

	/// Finds the raw comment for \p D by searching the comments of the file that
	/// contains the declaration's comment-search location. The result is not
	/// stored in any of the comment caches.
	///
	/// Comments are pulled from the external source on first use (guarded by
	/// CommentsLoaded).
	///
	/// \returns the comment found, or nullptr when the declaration has no usable
	/// file location or its file has no comments.
	RawComment *ASTContext::getRawCommentForDeclNoCache(const Decl *D) const {
	  const SourceLocation DeclLoc = getDeclLocForCommentSearch(D, SourceMgr);

	  // If the declaration doesn't map directly to a location in a file, we
	  // can't find the comment.
	  if (DeclLoc.isInvalid() || !DeclLoc.isFileID())
	    return nullptr;

	  if (ExternalSource && !CommentsLoaded) {
	    ExternalSource->ReadComments();
	    CommentsLoaded = true;
	  }

	  if (Comments.empty())
	    return nullptr;

	  const FileID File = SourceMgr.getDecomposedLoc(DeclLoc).first;
	  const auto CommentsInThisFile = Comments.getCommentsInFile(File);
	  if (!CommentsInThisFile || CommentsInThisFile->empty())
	    return nullptr;

	  return getRawCommentForDeclNoCacheImpl(D, DeclLoc, *CommentsInThisFile);
	}

	/// Registers \p RC in the context's comment list.
	///
	/// Asserts that a comment located in a system header is only added when
	/// LangOpts.RetainCommentsFromSystemHeaders is set.
	void ASTContext::addComment(const RawComment &RC) {
	  assert(LangOpts.RetainCommentsFromSystemHeaders ||
	         !SourceMgr.isInSystemHeader(RC.getSourceRange().getBegin()));
	  Comments.addComment(RC, LangOpts.CommentOpts, BumpAlloc);
	}

	/// If we have a 'templated' declaration for a template, adjust 'D' to
	/// refer to the actual template.
	/// If we have an implicit instantiation, adjust 'D' to refer to template.
	///
	/// \returns the adjusted declaration, or \p D itself when no adjustment
	/// applies.
	static const Decl &adjustDeclToTemplate(const Decl &D) {
	  if (const auto *FD = dyn_cast<FunctionDecl>(&D)) {
	    // Is this function declaration part of a function template?
	    if (const FunctionTemplateDecl *FTD = FD->getDescribedFunctionTemplate())
	      return *FTD;

	    // Nothing to do if function is not an implicit instantiation.
	    if (FD->getTemplateSpecializationKind() != TSK_ImplicitInstantiation)
	      return D;

	    // Function is an implicit instantiation of a function template?
	    if (const FunctionTemplateDecl *FTD = FD->getPrimaryTemplate())
	      return *FTD;

	    // Function is instantiated from a member definition of a class template?
	    if (const FunctionDecl *MemberDecl =
	            FD->getInstantiatedFromMemberFunction())
	      return *MemberDecl;

	    return D;
	  }
	  if (const auto *VD = dyn_cast<VarDecl>(&D)) {
	    // Static data member is instantiated from a member definition of a class
	    // template?
	    if (VD->isStaticDataMember())
	      if (const VarDecl *MemberDecl = VD->getInstantiatedFromStaticDataMember())
	        return *MemberDecl;

	    return D;
	  }
	  if (const auto *CRD = dyn_cast<CXXRecordDecl>(&D)) {
	    // Is this class declaration part of a class template?
	    if (const ClassTemplateDecl *CTD = CRD->getDescribedClassTemplate())
	      return *CTD;

	    // Class is an implicit instantiation of a class template or partial
	    // specialization?
	    if (const auto *CTSD = dyn_cast<ClassTemplateSpecializationDecl>(CRD)) {
	      if (CTSD->getSpecializationKind() != TSK_ImplicitInstantiation)
	        return D;
	      llvm::PointerUnion<ClassTemplateDecl *,
	                         ClassTemplatePartialSpecializationDecl *>
	          PU = CTSD->getSpecializedTemplateOrPartial();
	      // Both alternatives are converted to a Decl pointer and dereferenced
	      // so that the result can bind to the reference return type.
	      return PU.is<ClassTemplateDecl *>()
	                 ? *static_cast<const Decl *>(PU.get<ClassTemplateDecl *>())
	                 : *static_cast<const Decl *>(
	                       PU.get<ClassTemplatePartialSpecializationDecl *>());
	    }

	    // Class is instantiated from a member definition of a class template?
	    if (const MemberSpecializationInfo *Info =
	            CRD->getMemberSpecializationInfo())
	      return *Info->getInstantiatedFrom();

	    return D;
	  }
	  if (const auto *ED = dyn_cast<EnumDecl>(&D)) {
	    // Enum is instantiated from a member definition of a class template?
	    if (const EnumDecl *MemberDecl = ED->getInstantiatedFromMemberEnum())
	      return *MemberDecl;

	    return D;
	  }
	  // FIXME: Adjust alias templates?
	  return D;
	}

	/// Returns the raw comment attached to \p D or to any of its redeclarations,
	/// using and updating the per-declaration and per-redeclaration-chain caches.
	///
	/// \param D the declaration to look up; may be null.
	/// \param OriginalDecl optional out-parameter. When non-null it receives the
	/// declaration the returned comment is attached to, or nullptr when \p D is
	/// null or no redeclaration has a comment.
	/// \returns the comment, or nullptr if none was found.
	const RawComment *ASTContext::getRawCommentForAnyRedecl(
	    const Decl *D,
	    const Decl **OriginalDecl) const {
	  if (!D) {
	    // Clear the caller's out-parameter (not the local pointer to it).
	    if (OriginalDecl)
	      *OriginalDecl = nullptr;
	    return nullptr;
	  }

	  D = &adjustDeclToTemplate(*D);

	  // Any comment directly attached to D?
	  {
	    auto DeclComment = DeclRawComments.find(D);
	    if (DeclComment != DeclRawComments.end()) {
	      if (OriginalDecl)
	        *OriginalDecl = D;
	      return DeclComment->second;
	    }
	  }

	  // Any comment attached to any redeclaration of D?
	  const Decl *CanonicalD = D->getCanonicalDecl();
	  if (!CanonicalD)
	    return nullptr;

	  {
	    auto RedeclComment = RedeclChainComments.find(CanonicalD);
	    if (RedeclComment != RedeclChainComments.end()) {
	      if (OriginalDecl)
	        *OriginalDecl = RedeclComment->second;
	      auto CommentAtRedecl = DeclRawComments.find(RedeclComment->second);
	      assert(CommentAtRedecl != DeclRawComments.end() &&
	             "This decl is supposed to have comment attached.");
	      return CommentAtRedecl->second;
	    }
	  }

	  // Any redeclarations of D that we haven't checked for comments yet?
	  // We can't use DenseMap::iterator directly since it'd get invalid.
	  auto LastCheckedRedecl = [this, CanonicalD]() -> const Decl * {
	    auto LookupRes = CommentlessRedeclChains.find(CanonicalD);
	    if (LookupRes != CommentlessRedeclChains.end())
	      return LookupRes->second;
	    return nullptr;
	  }();

	  for (const auto *Redecl : D->redecls()) {
	    assert(Redecl);
	    // Skip all redeclarations that have been checked previously. The
	    // remembered redeclaration itself is skipped too; checking resumes with
	    // the one after it.
	    if (LastCheckedRedecl) {
	      if (LastCheckedRedecl == Redecl) {
	        LastCheckedRedecl = nullptr;
	      }
	      continue;
	    }
	    const RawComment *RedeclComment = getRawCommentForDeclNoCache(Redecl);
	    if (RedeclComment) {
	      cacheRawCommentForDecl(*Redecl, *RedeclComment);
	      if (OriginalDecl)
	        *OriginalDecl = Redecl;
	      return RedeclComment;
	    }
	    // Remember how far along the chain we got without finding a comment.
	    CommentlessRedeclChains[CanonicalD] = Redecl;
	  }

	  if (OriginalDecl)
	    *OriginalDecl = nullptr;
	  return nullptr;
	}

	/// Records that \p Comment is attached to \p OriginalD.
	///
	/// Updates three caches: the declaration-to-comment map, the map from the
	/// canonical declaration to the redeclaration that carries the comment, and
	/// the "commentless chain" bookkeeping (whose entry for this chain is
	/// removed). Both map insertions use try_emplace, so an existing entry is
	/// left unchanged.
	void ASTContext::cacheRawCommentForDecl(const Decl &OriginalD,
	                                        const RawComment &Comment) const {
	  assert(Comment.isDocumentation() || LangOpts.CommentOpts.ParseAllComments);
	  DeclRawComments.try_emplace(&OriginalD, &Comment);
	  const Decl *const CanonicalDecl = OriginalD.getCanonicalDecl();
	  RedeclChainComments.try_emplace(CanonicalDecl, &OriginalD);
	  CommentlessRedeclChains.erase(CanonicalDecl);
	}

	static void addRedeclaredMethods(const ObjCMethodDecl *ObjCMethod,
	SmallVectorImpl<const NamedDecl *> &Redeclared) {
	const DeclContext *DC = ObjCMethod->getDeclContext();
	if (const auto *IMD = dyn_cast<ObjCImplDecl>(DC)) {
	const ObjCInterfaceDecl *ID = IMD->getClassInterface();
	if (!ID)
	return;
	// Add redeclared method here.
	for (const auto *Ext : ID->known_extensions()) {
	if (ObjCMethodDecl *RedeclaredMethod =
	Ext->getMethod(ObjCMethod->getSelector(),
	ObjCMethod->isInstanceMethod()))
	Redeclared.push_back(RedeclaredMethod);
	}
	}
	}

	/// Tries to attach not-yet-attached comments to the declarations in
	/// \p Decls, caching both the raw comment and its parsed form.
	///
	/// The file to search is taken from the first declaration with a valid
	/// location. Nothing is done when that file has no comments or when its last
	/// comment is already attached.
	///
	/// \param Decls the declarations to process; entries must be non-null.
	/// \param PP forwarded to the comment parser.
	void ASTContext::attachCommentsToJustParsedDecls(ArrayRef<Decl *> Decls,
	                                                 const Preprocessor *PP) {
	  if (Comments.empty() || Decls.empty())
	    return;

	  FileID File;
	  for (Decl *D : Decls) {
	    SourceLocation Loc = D->getLocation();
	    if (Loc.isValid()) {
	      // See if there are any new comments that are not attached to a decl.
	      // The location doesn't have to be precise - we care only about the file.
	      File = SourceMgr.getDecomposedLoc(Loc).first;
	      break;
	    }
	  }

	  if (File.isInvalid())
	    return;

	  auto CommentsInThisFile = Comments.getCommentsInFile(File);
	  if (!CommentsInThisFile || CommentsInThisFile->empty() ||
	      CommentsInThisFile->rbegin()->second->isAttached())
	    return;

	  // There is at least one comment not attached to a decl.
	  // Maybe it should be attached to one of Decls?
	  //
	  // Note that this way we pick up not only comments that precede the
	  // declaration, but also comments that follow the declaration -- thanks to
	  // the lookahead in the lexer: we've consumed the semicolon and looked
	  // ahead through comments.

	  for (const Decl *D : Decls) {
	    assert(D);
	    if (D->isInvalidDecl())
	      continue;

	    D = &adjustDeclToTemplate(*D);

	    const SourceLocation DeclLoc = getDeclLocForCommentSearch(D, SourceMgr);

	    if (DeclLoc.isInvalid() || !DeclLoc.isFileID())
	      continue;

	    // Skip declarations that already have a cached comment.
	    if (DeclRawComments.count(D) > 0)
	      continue;

	    if (RawComment *const DocComment =
	            getRawCommentForDeclNoCacheImpl(D, DeclLoc, *CommentsInThisFile)) {
	      cacheRawCommentForDecl(*D, *DocComment);
	      comments::FullComment *FC = DocComment->parse(*this, PP, D);
	      ParsedComments[D->getCanonicalDecl()] = FC;
	    }
	  }
	}

	/// Creates a new FullComment that shares the blocks of \p FC but carries
	/// declaration info computed for \p D.
	///
	/// The DeclInfo is filled from \p D, after which its CommentDecl is pointed
	/// at the declaration \p FC belongs to. If filling yields no template
	/// parameters, those of \p FC's DeclInfo are used instead. Both objects are
	/// allocated through the context's placement new.
	///
	/// \returns the newly created comment.
	comments::FullComment *ASTContext::cloneFullComment(comments::FullComment *FC,
	                                                    const Decl *D) const {
	  auto *ThisDeclInfo = new (*this) comments::DeclInfo;
	  ThisDeclInfo->CommentDecl = D;
	  ThisDeclInfo->IsFilled = false;
	  ThisDeclInfo->fill();
	  ThisDeclInfo->CommentDecl = FC->getDecl();
	  if (!ThisDeclInfo->TemplateParameters)
	    ThisDeclInfo->TemplateParameters = FC->getDeclInfo()->TemplateParameters;
	  comments::FullComment *CFC =
	      new (*this) comments::FullComment(FC->getBlocks(),
	                                        ThisDeclInfo);
	  return CFC;
	}

	/// Looks up the raw comment for \p D without using the caches and parses it
	/// (with no preprocessor).
	///
	/// \returns the parsed comment, or nullptr when \p D has no raw comment.
	comments::FullComment *ASTContext::getLocalCommentForDeclUncached(const Decl *D) const {
	  const RawComment *RC = getRawCommentForDeclNoCache(D);
	  return RC ? RC->parse(*this, nullptr, D) : nullptr;
	}

	/// Return the parsed documentation comment for \p D, or nullptr.
	///
	/// Results are cached in ParsedComments, keyed by the canonical declaration.
	/// When \p D has no raw comment on any redeclaration, a comment is borrowed
	/// (as a clone re-targeted at \p D) from a related declaration:
	///  - Objective-C methods / functions: the property of an accessor, then any
	///    redeclared or overridden method;
	///  - typedefs: the underlying tag declaration;
	///  - Objective-C interfaces: the superclass chain;
	///  - Objective-C categories: the class interface;
	///  - C++ records: public non-virtual bases, then public virtual bases.
	///
	/// \param D  declaration to look up; null or invalid yields nullptr.
	/// \param PP preprocessor forwarded to RawComment::parse; may be null.
	comments::FullComment *ASTContext::getCommentForDecl(
	    const Decl *D,
	    const Preprocessor *PP) const {
	  if (!D || D->isInvalidDecl())
	    return nullptr;
	  D = &adjustDeclToTemplate(*D);

	  const Decl *Canonical = D->getCanonicalDecl();
	  auto Pos = ParsedComments.find(Canonical);

	  if (Pos != ParsedComments.end()) {
	    // The cache entry belongs to the canonical declaration; any other
	    // redeclaration gets a clone whose DeclInfo describes D itself.
	    if (Canonical != D)
	      return cloneFullComment(Pos->second, D);
	    return Pos->second;
	  }

	  const Decl *OriginalDecl = nullptr;

	  const RawComment *RC = getRawCommentForAnyRedecl(D, &OriginalDecl);
	  if (!RC) {
	    if (isa<ObjCMethodDecl>(D) || isa<FunctionDecl>(D)) {
	      SmallVector<const NamedDecl*, 8> Overridden;
	      const auto *OMD = dyn_cast<ObjCMethodDecl>(D);
	      if (OMD && OMD->isPropertyAccessor())
	        if (const ObjCPropertyDecl *PDecl = OMD->findPropertyDecl())
	          if (comments::FullComment *FC = getCommentForDecl(PDecl, PP))
	            return cloneFullComment(FC, D);
	      if (OMD)
	        addRedeclaredMethods(OMD, Overridden);
	      getOverriddenMethods(dyn_cast<NamedDecl>(D), Overridden);
	      for (const NamedDecl *OverriddenDecl : Overridden)
	        if (comments::FullComment *FC = getCommentForDecl(OverriddenDecl, PP))
	          return cloneFullComment(FC, D);
	    }
	    else if (const auto *TD = dyn_cast<TypedefNameDecl>(D)) {
	      // Attach any tag type's documentation to its typedef if latter
	      // does not have one of its own.
	      QualType QT = TD->getUnderlyingType();
	      if (const auto *TT = QT->getAs<TagType>())
	        if (const Decl *TagD = TT->getDecl())
	          if (comments::FullComment *FC = getCommentForDecl(TagD, PP))
	            return cloneFullComment(FC, D);
	    }
	    else if (const auto *IC = dyn_cast<ObjCInterfaceDecl>(D)) {
	      // Walk up the superclass chain until some class has a comment.
	      while (IC->getSuperClass()) {
	        IC = IC->getSuperClass();
	        if (comments::FullComment *FC = getCommentForDecl(IC, PP))
	          return cloneFullComment(FC, D);
	      }
	    }
	    else if (const auto *CD = dyn_cast<ObjCCategoryDecl>(D)) {
	      if (const ObjCInterfaceDecl *IC = CD->getClassInterface())
	        if (comments::FullComment *FC = getCommentForDecl(IC, PP))
	          return cloneFullComment(FC, D);
	    }
	    else if (const auto *RD = dyn_cast<CXXRecordDecl>(D)) {
	      if (!(RD = RD->getDefinition()))
	        return nullptr;
	      // Check non-virtual bases.
	      for (const auto &I : RD->bases()) {
	        if (I.isVirtual() || (I.getAccessSpecifier() != AS_public))
	          continue;
	        QualType Ty = I.getType();
	        if (Ty.isNull())
	          continue;
	        if (const CXXRecordDecl *NonVirtualBase = Ty->getAsCXXRecordDecl()) {
	          if (!(NonVirtualBase = NonVirtualBase->getDefinition()))
	            continue;

	          if (comments::FullComment *FC = getCommentForDecl((NonVirtualBase), PP))
	            return cloneFullComment(FC, D);
	        }
	      }
	      // Check virtual bases.
	      for (const auto &I : RD->vbases()) {
	        if (I.getAccessSpecifier() != AS_public)
	          continue;
	        QualType Ty = I.getType();
	        if (Ty.isNull())
	          continue;
	        if (const CXXRecordDecl *VirtualBase = Ty->getAsCXXRecordDecl()) {
	          if (!(VirtualBase = VirtualBase->getDefinition()))
	            continue;
	          if (comments::FullComment *FC = getCommentForDecl((VirtualBase), PP))
	            return cloneFullComment(FC, D);
	        }
	      }
	    }
	    return nullptr;
	  }

	  // If the RawComment was attached to other redeclaration of this Decl, we
	  // should parse the comment in context of that other Decl. This is important
	  // because comments can contain references to parameter names which can be
	  // different across redeclarations.
	  if (D != OriginalDecl && OriginalDecl)
	    return getCommentForDecl(OriginalDecl, PP);

	  comments::FullComment *FC = RC->parse(*this, PP, D);
	  ParsedComments[Canonical] = FC;
	  return FC;
	}

	/// Add the structural identity of template template parameter \p Parm to
	/// \p ID.
	///
	/// The profile consists of the parameter's depth, position and pack-ness,
	/// followed by one record per entry of its template parameter list. Each
	/// record starts with a kind tag (0 = type parameter, 1 = non-type
	/// parameter, 2 = nested template template parameter, profiled
	/// recursively). The list's requires-clause, if any, is profiled last.
	void
	ASTContext::CanonicalTemplateTemplateParm::Profile(llvm::FoldingSetNodeID &ID,
	                                                   const ASTContext &C,
	                                                   TemplateTemplateParmDecl *Parm) {
	  ID.AddInteger(Parm->getDepth());
	  ID.AddInteger(Parm->getPosition());
	  ID.AddBoolean(Parm->isParameterPack());

	  TemplateParameterList *Params = Parm->getTemplateParameters();
	  ID.AddInteger(Params->size());
	  for (TemplateParameterList::const_iterator P = Params->begin(),
	                                             PEnd = Params->end();
	       P != PEnd; ++P) {
	    if (const auto *TTP = dyn_cast<TemplateTypeParmDecl>(*P)) {
	      ID.AddInteger(0);
	      ID.AddBoolean(TTP->isParameterPack());
	      const TypeConstraint *TC = TTP->getTypeConstraint();
	      ID.AddBoolean(TC != nullptr);
	      if (TC)
	        TC->getImmediatelyDeclaredConstraint()->Profile(ID, C,
	                                                        /*Canonical=*/true);
	      if (TTP->isExpandedParameterPack()) {
	        ID.AddBoolean(true);
	        ID.AddInteger(TTP->getNumExpansionParameters());
	      } else
	        ID.AddBoolean(false);
	      continue;
	    }

	    if (const auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(*P)) {
	      ID.AddInteger(1);
	      ID.AddBoolean(NTTP->isParameterPack());
	      ID.AddPointer(NTTP->getType().getCanonicalType().getAsOpaquePtr());
	      if (NTTP->isExpandedParameterPack()) {
	        ID.AddBoolean(true);
	        ID.AddInteger(NTTP->getNumExpansionTypes());
	        for (unsigned I = 0, N = NTTP->getNumExpansionTypes(); I != N; ++I) {
	          QualType T = NTTP->getExpansionType(I);
	          ID.AddPointer(T.getCanonicalType().getAsOpaquePtr());
	        }
	      } else
	        ID.AddBoolean(false);
	      continue;
	    }

	    // Anything else must be a nested template template parameter.
	    auto *TTP = cast<TemplateTemplateParmDecl>(*P);
	    ID.AddInteger(2);
	    Profile(ID, C, TTP);
	  }
	  Expr *RequiresClause = Parm->getTemplateParameters()->getRequiresClause();
	  ID.AddBoolean(RequiresClause != nullptr);
	  if (RequiresClause)
	    RequiresClause->Profile(ID, C, /*Canonical=*/true);
	}

	/// Rebuild the immediately-declared constraint \p IDC so that its first
	/// template argument is \p ConstrainedType.
	///
	/// \p IDC is either a ConceptSpecializationExpr or a CXXFoldExpr whose LHS
	/// is one. The result has the same shape: a new ConceptSpecializationExpr,
	/// wrapped in a new '&&' fold expression if the input was a fold.
	static Expr *
	canonicalizeImmediatelyDeclaredConstraint(const ASTContext &C, Expr *IDC,
	                                          QualType ConstrainedType) {
	  // This is a bit ugly - we need to form a new immediately-declared
	  // constraint that references the new parameter; this would ideally
	  // require semantic analysis (e.g. template<C T> struct S {}; - the
	  // converted arguments of C<T> could be an argument pack if C is
	  // declared as template<typename... T> concept C = ...).
	  // We don't have semantic analysis here so we dig deep into the
	  // ready-made constraint expr and change the thing manually.
	  ConceptSpecializationExpr *CSE;
	  if (const auto *Fold = dyn_cast<CXXFoldExpr>(IDC))
	    CSE = cast<ConceptSpecializationExpr>(Fold->getLHS());
	  else
	    CSE = cast<ConceptSpecializationExpr>(IDC);
	  ArrayRef<TemplateArgument> OldConverted = CSE->getTemplateArguments();
	  SmallVector<TemplateArgument, 3> NewConverted;
	  NewConverted.reserve(OldConverted.size());
	  if (OldConverted.front().getKind() == TemplateArgument::Pack) {
	    // The case:
	    // template<typename... T> concept C = true;
	    // template<C<int> T> struct S; -> constraint is C<{T, int}>
	    NewConverted.push_back(ConstrainedType);
	    for (auto &Arg : OldConverted.front().pack_elements().drop_front(1))
	      NewConverted.push_back(Arg);
	    // The pack must own its element storage: NewConverted is cleared and
	    // reused right below, so a pack that merely pointed into it would
	    // dangle. Copy the elements into context-allocated memory instead.
	    TemplateArgument NewPack = TemplateArgument::CreatePackCopy(
	        const_cast<ASTContext &>(C), NewConverted);

	    NewConverted.clear();
	    NewConverted.push_back(NewPack);
	    assert(OldConverted.size() == 1 &&
	           "Template parameter pack should be the last parameter");
	  } else {
	    assert(OldConverted.front().getKind() == TemplateArgument::Type &&
	           "Unexpected first argument kind for immediately-declared "
	           "constraint");
	    NewConverted.push_back(ConstrainedType);
	    for (auto &Arg : OldConverted.drop_front(1))
	      NewConverted.push_back(Arg);
	  }
	  Expr *NewIDC = ConceptSpecializationExpr::Create(
	      C, CSE->getNamedConcept(), NewConverted, nullptr,
	      CSE->isInstantiationDependent(), CSE->containsUnexpandedParameterPack());

	  if (auto *OrigFold = dyn_cast<CXXFoldExpr>(IDC))
	    NewIDC = new (C) CXXFoldExpr(OrigFold->getType(), SourceLocation(), NewIDC,
	                                 BinaryOperatorKind::BO_LAnd,
	                                 SourceLocation(), /*RHS=*/nullptr,
	                                 SourceLocation(), /*NumExpansions=*/None);
	  return NewIDC;
	}

	/// Return the canonical template template parameter equivalent to \p TTP.
	///
	/// Canonical parameters are uniqued in CanonTemplateTemplateParms by their
	/// Profile. On a miss, a new parameter list is built from location-free
	/// copies of each parameter (type parameters, non-type parameters with
	/// canonical types, and recursively canonicalized template template
	/// parameters), a new TemplateTemplateParmDecl is created in the translation
	/// unit, and the result is inserted into the set.
	TemplateTemplateParmDecl *
	ASTContext::getCanonicalTemplateTemplateParmDecl(
	    TemplateTemplateParmDecl *TTP) const {
	  // Check if we already have a canonical template template parameter.
	  llvm::FoldingSetNodeID ID;
	  CanonicalTemplateTemplateParm::Profile(ID, *this, TTP);
	  void *InsertPos = nullptr;
	  CanonicalTemplateTemplateParm *Canonical
	    = CanonTemplateTemplateParms.FindNodeOrInsertPos(ID, InsertPos);
	  if (Canonical)
	    return Canonical->getParam();

	  // Build a canonical template parameter list.
	  TemplateParameterList *Params = TTP->getTemplateParameters();
	  SmallVector<NamedDecl *, 4> CanonParams;
	  CanonParams.reserve(Params->size());
	  for (TemplateParameterList::const_iterator P = Params->begin(),
	                                             PEnd = Params->end();
	       P != PEnd; ++P) {
	    if (const auto *TypeParm = dyn_cast<TemplateTypeParmDecl>(*P)) {
	      TemplateTypeParmDecl *NewTTP = TemplateTypeParmDecl::Create(*this,
	          getTranslationUnitDecl(), SourceLocation(), SourceLocation(),
	          TypeParm->getDepth(), TypeParm->getIndex(), nullptr, false,
	          TypeParm->isParameterPack(), TypeParm->hasTypeConstraint(),
	          TypeParm->isExpandedParameterPack() ?
	          llvm::Optional<unsigned>(TypeParm->getNumExpansionParameters()) : None);
	      if (const auto *TC = TypeParm->getTypeConstraint()) {
	        QualType ParamAsArgument(NewTTP->getTypeForDecl(), 0);
	        Expr *NewIDC = canonicalizeImmediatelyDeclaredConstraint(
	            *this, TC->getImmediatelyDeclaredConstraint(),
	            ParamAsArgument);
	        NewTTP->setTypeConstraint(
	            NestedNameSpecifierLoc(),
	            DeclarationNameInfo(TC->getNamedConcept()->getDeclName(),
	                                SourceLocation()), /*FoundDecl=*/nullptr,
	            // Actually canonicalizing a TemplateArgumentLoc is difficult so we
	            // simply omit the ArgsAsWritten
	            TC->getNamedConcept(), /*ArgsAsWritten=*/nullptr, NewIDC);
	      }
	      CanonParams.push_back(NewTTP);
	    } else if (const auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(*P)) {
	      QualType T = getCanonicalType(NTTP->getType());
	      TypeSourceInfo *TInfo = getTrivialTypeSourceInfo(T);
	      NonTypeTemplateParmDecl *Param;
	      if (NTTP->isExpandedParameterPack()) {
	        SmallVector<QualType, 2> ExpandedTypes;
	        SmallVector<TypeSourceInfo *, 2> ExpandedTInfos;
	        for (unsigned I = 0, N = NTTP->getNumExpansionTypes(); I != N; ++I) {
	          ExpandedTypes.push_back(getCanonicalType(NTTP->getExpansionType(I)));
	          ExpandedTInfos.push_back(
	              getTrivialTypeSourceInfo(ExpandedTypes.back()));
	        }

	        Param = NonTypeTemplateParmDecl::Create(*this, getTranslationUnitDecl(),
	                                                SourceLocation(),
	                                                SourceLocation(),
	                                                NTTP->getDepth(),
	                                                NTTP->getPosition(), nullptr,
	                                                T,
	                                                TInfo,
	                                                ExpandedTypes,
	                                                ExpandedTInfos);
	      } else {
	        Param = NonTypeTemplateParmDecl::Create(*this, getTranslationUnitDecl(),
	                                                SourceLocation(),
	                                                SourceLocation(),
	                                                NTTP->getDepth(),
	                                                NTTP->getPosition(), nullptr,
	                                                T,
	                                                NTTP->isParameterPack(),
	                                                TInfo);
	      }
	      // A constrained placeholder type carries its own immediately-declared
	      // constraint, which must be rebuilt against the canonical type.
	      if (AutoType *AT = T->getContainedAutoType()) {
	        if (AT->isConstrained()) {
	          Param->setPlaceholderTypeConstraint(
	              canonicalizeImmediatelyDeclaredConstraint(
	                  *this, NTTP->getPlaceholderTypeConstraint(), T));
	        }
	      }
	      CanonParams.push_back(Param);

	    } else
	      CanonParams.push_back(getCanonicalTemplateTemplateParmDecl(
	                                 cast<TemplateTemplateParmDecl>(*P)));
	  }

	  Expr *CanonRequiresClause = nullptr;
	  if (Expr *RequiresClause = TTP->getTemplateParameters()->getRequiresClause())
	    CanonRequiresClause = RequiresClause;

	  TemplateTemplateParmDecl *CanonTTP
	    = TemplateTemplateParmDecl::Create(*this, getTranslationUnitDecl(),
	                                       SourceLocation(), TTP->getDepth(),
	                                       TTP->getPosition(),
	                                       TTP->isParameterPack(),
	                                       nullptr,
	                         TemplateParameterList::Create(*this, SourceLocation(),
	                                                       SourceLocation(),
	                                                       CanonParams,
	                                                       SourceLocation(),
	                                                       CanonRequiresClause));

	  // Get the new insert position for the node we care about.
	  Canonical = CanonTemplateTemplateParms.FindNodeOrInsertPos(ID, InsertPos);
	  assert(!Canonical && "Shouldn't be in the map!");
	  (void)Canonical;

	  // Create the canonical template template parameter entry.
	  Canonical = new (*this) CanonicalTemplateTemplateParm(CanonTTP);
	  CanonTemplateTemplateParms.InsertNode(Canonical, InsertPos);
	  return CanonTTP;
	}

	/// Create the C++ ABI helper matching the target's C++ ABI kind.
	///
	/// Returns nullptr when not compiling C++. The Microsoft ABI gets its own
	/// implementation; every other known kind uses the Itanium implementation.
	CXXABI *ASTContext::createCXXABI(const TargetInfo &T) {
	  if (!LangOpts.CPlusPlus)
	    return nullptr;

	  const auto ABIKind = T.getCXXABI().getKind();
	  switch (ABIKind) {
	  case TargetCXXABI::Microsoft:
	    return CreateMicrosoftCXXABI(*this);
	  case TargetCXXABI::GenericItanium:
	  case TargetCXXABI::GenericARM: // Same as Itanium at this level
	  case TargetCXXABI::GenericAArch64:
	  case TargetCXXABI::GenericMIPS:
	  case TargetCXXABI::iOS:
	  case TargetCXXABI::iOS64:
	  case TargetCXXABI::WatchOS:
	  case TargetCXXABI::Fuchsia:
	  case TargetCXXABI::WebAssembly:
	  case TargetCXXABI::XL:
	    return CreateItaniumCXXABI(*this);
	  }
	  llvm_unreachable("Invalid CXXABI type!");
	}

	/// Return the interp::Context for this AST context, constructing it on the
	/// first call.
	interp::Context &ASTContext::getInterpContext() {
	  if (InterpContext)
	    return *InterpContext.get();

	  InterpContext.reset(new interp::Context(*this));
	  return *InterpContext.get();
	}

	/// Return the ParentMapContext for this AST context, constructing it on the
	/// first call.
	ParentMapContext &ASTContext::getParentMapContext() {
	  if (ParentMapCtx)
	    return *ParentMapCtx.get();

	  ParentMapCtx.reset(new ParentMapContext(*this));
	  return *ParentMapCtx.get();
	}

	/// Pick the address space map to use: the target's own map, or a fixed fake
	/// map when LangOptions::FakeAddressSpaceMap is set.
	static const LangASMap *getAddressSpaceMap(const TargetInfo &T,
	                                           const LangOptions &LOpts) {
	  if (!LOpts.FakeAddressSpaceMap)
	    return &T.getAddressSpaceMap();

	  // The fake address space map must have a distinct entry for each
	  // language-specific address space.
	  static const unsigned FakeAddrSpaceMap[] = {
	      0,  // Default
	      1,  // opencl_global
	      3,  // opencl_local
	      2,  // opencl_constant
	      0,  // opencl_private
	      4,  // opencl_generic
	      5,  // cuda_device
	      6,  // cuda_constant
	      7,  // cuda_shared
	      8,  // ptr32_sptr
	      9,  // ptr32_uptr
	      10  // ptr64
	  };
	  return &FakeAddrSpaceMap;
	}

	/// Decide whether address-space-map mangling is on: forced on or off by the
	/// language options, or deferred to the target when set to ASMM_Target.
	static bool isAddrSpaceMapManglingEnabled(const TargetInfo &TI,
	                                          const LangOptions &LangOpts) {
	  const auto Mode = LangOpts.getAddressSpaceMapMangling();
	  switch (Mode) {
	  case LangOptions::ASMM_On:
	    return true;
	  case LangOptions::ASMM_Off:
	    return false;
	  case LangOptions::ASMM_Target:
	    return TI.useAddressSpaceMapMangling();
	  }
	  llvm_unreachable("getAddressSpaceMapMangling() doesn't cover anything.");
	}

	/// Construct an AST context over the given language options, source manager,
	/// identifier and selector tables, and builtin context.
	///
	/// Besides initializing members, the constructor creates the
	/// TranslationUnitDecl and makes it the sole entry of the traversal scope.
	/// The sanitizer blacklist and XRay filter are built from the file lists in
	/// the language options.
	ASTContext::ASTContext(LangOptions &LOpts, SourceManager &SM,
	IdentifierTable &idents, SelectorTable &sels,
	Builtin::Context &builtins)
	: ConstantArrayTypes(this_()), FunctionProtoTypes(this_()),
	TemplateSpecializationTypes(this_()),
	DependentTemplateSpecializationTypes(this_()), AutoTypes(this_()),
	SubstTemplateTemplateParmPacks(this_()),
	CanonTemplateTemplateParms(this_()), SourceMgr(SM), LangOpts(LOpts),
	SanitizerBL(new SanitizerBlacklist(LangOpts.SanitizerBlacklistFiles, SM)),
	XRayFilter(new XRayFunctionFilter(LangOpts.XRayAlwaysInstrumentFiles,
	LangOpts.XRayNeverInstrumentFiles,
	LangOpts.XRayAttrListFiles, SM)),
	PrintingPolicy(LOpts), Idents(idents), Selectors(sels),
	BuiltinInfo(builtins), DeclarationNames(*this), Comments(SM),
	CommentCommandTraits(BumpAlloc, LOpts.CommentOpts),
	CompCategories(this_()), LastSDM(nullptr, 0) {
	// The translation unit is the root of the AST and the default traversal
	// scope.
	TUDecl = TranslationUnitDecl::Create(*this);
	TraversalScope = {TUDecl};
	}

	/// Tear down the context: release DeclContext maps, run registered
	/// deallocation callbacks, and explicitly destroy the record layouts,
	/// attribute vectors, per-module initializer lists and APValues that are
	/// tracked by this context.
	ASTContext::~ASTContext() {
	  // Release the DenseMaps associated with DeclContext objects.
	  // FIXME: Is this the ideal solution?
	  ReleaseDeclContextMaps();

	  // Call all of the deallocation functions on all of their targets.
	  for (auto &Pair : Deallocations)
	    (Pair.first)(Pair.second);

	  // ASTRecordLayout objects in ASTRecordLayouts must always be destroyed
	  // because they can contain DenseMaps.
	  for (auto I = ObjCLayouts.begin(), E = ObjCLayouts.end(); I != E; )
	    // Increment in loop to prevent using deallocated memory.
	    if (auto *R = const_cast<ASTRecordLayout *>((I++)->second))
	      R->Destroy(*this);

	  for (auto I = ASTRecordLayouts.begin(), E = ASTRecordLayouts.end();
	       I != E; ) {
	    // Increment in loop to prevent using deallocated memory.
	    if (auto *R = const_cast<ASTRecordLayout *>((I++)->second))
	      R->Destroy(*this);
	  }

	  // Only the destructors are run here; the storage itself is not freed by
	  // this function.
	  for (auto A = DeclAttrs.begin(), AEnd = DeclAttrs.end(); A != AEnd; ++A)
	    A->second->~AttrVec();

	  for (const auto &Value : ModuleInitializers)
	    Value.second->~PerModuleInitializers();

	  for (APValue *Value : APValueCleanups)
	    Value->~APValue();
	}

	/// Replace the traversal scope with \p TopLevelDecls and clear the parent
	/// map context.
	void ASTContext::setTraversalScope(const std::vector<Decl *> &TopLevelDecls) {
	  TraversalScope = TopLevelDecls;

	  ParentMapContext &ParentMaps = getParentMapContext();
	  ParentMaps.clear();
	}

	void ASTContext::AddDeallocation(void (Callback)(void ), void *Data) const {
	Deallocations.push_back({Callback, Data});
	}

	/// Install \p Source as this context's external AST source, taking over the
	/// passed-in reference.
	void ASTContext::setExternalSource(
	    IntrusiveRefCntPtr<ExternalASTSource> Source) {
	  ExternalSource = std::move(Source);
	}

	/// Print statistics about this context to llvm::errs(): the number of types
	/// in total and per type class, their byte totals, counts of implicit
	/// special member functions, the external source's statistics (if any), and
	/// the bump allocator's statistics.
	void ASTContext::PrintStats() const {
	llvm::errs() << "\n*** AST Context Stats:\n";
	llvm::errs() << " " << Types.size() << " types total.\n";

	// One zero-initialized counter per concrete type class listed in
	// TypeNodes.inc, plus one trailing extra slot.
	unsigned counts[] = {
	#define TYPE(Name, Parent) 0,
	#define ABSTRACT_TYPE(Name, Parent)
	#include "clang/AST/TypeNodes.inc"
	0 // Extra
	};

	// Tally every allocated type under its type class.
	for (unsigned i = 0, e = Types.size(); i != e; ++i) {
	Type *T = Types[i];
	counts[(unsigned)T->getTypeClass()]++;
	}

	// Walk the type classes again in TypeNodes.inc order. A line is printed only
	// for classes with a non-zero count; TotalBytes and Idx advance for every
	// class.
	// NOTE(review): this presumably relies on getTypeClass() values following
	// the TYPE() order in TypeNodes.inc -- confirm.
	unsigned Idx = 0;
	unsigned TotalBytes = 0;
	#define TYPE(Name, Parent) \
	if (counts[Idx]) \
	llvm::errs() << " " << counts[Idx] << " " << #Name \
	<< " types, " << sizeof(Name##Type) << " each " \
	<< "(" << counts[Idx] * sizeof(Name##Type) \
	<< " bytes)\n"; \
	TotalBytes += counts[Idx] * sizeof(Name##Type); \
	++Idx;
	#define ABSTRACT_TYPE(Name, Parent)
	#include "clang/AST/TypeNodes.inc"

	llvm::errs() << "Total bytes = " << TotalBytes << "\n";

	// Implicit special member functions.
	llvm::errs() << NumImplicitDefaultConstructorsDeclared << "/"
	<< NumImplicitDefaultConstructors
	<< " implicit default constructors created\n";
	llvm::errs() << NumImplicitCopyConstructorsDeclared << "/"
	<< NumImplicitCopyConstructors
	<< " implicit copy constructors created\n";
	// Move operations are only reported when compiling C++.
	if (getLangOpts().CPlusPlus)
	llvm::errs() << NumImplicitMoveConstructorsDeclared << "/"
	<< NumImplicitMoveConstructors
	<< " implicit move constructors created\n";
	llvm::errs() << NumImplicitCopyAssignmentOperatorsDeclared << "/"
	<< NumImplicitCopyAssignmentOperators
	<< " implicit copy assignment operators created\n";
	if (getLangOpts().CPlusPlus)
	llvm::errs() << NumImplicitMoveAssignmentOperatorsDeclared << "/"
	<< NumImplicitMoveAssignmentOperators
	<< " implicit move assignment operators created\n";
	llvm::errs() << NumImplicitDestructorsDeclared << "/"
	<< NumImplicitDestructors
	<< " implicit destructors created\n";

	if (ExternalSource) {
	llvm::errs() << "\n";
	ExternalSource->PrintStats();
	}

	BumpAlloc.PrintStats();
	}

	void ASTContext::mergeDefinitionIntoModule(NamedDecl ND, Module M,
	bool NotifyListeners) {
	if (NotifyListeners)
	if (auto *Listener = getASTMutationListener())
	Listener->RedefinedHiddenDefinition(ND, M);

	MergedDefModules[cast<NamedDecl>(ND->getCanonicalDecl())].push_back(M);
	}

	/// Remove repeated modules from the merged-definition list of \p ND's
	/// canonical declaration, keeping the first occurrence of each module.
	/// Does nothing if no list is recorded for \p ND.
	void ASTContext::deduplicateMergedDefinitonsFor(NamedDecl *ND) {
	  auto Entry = MergedDefModules.find(cast<NamedDecl>(ND->getCanonicalDecl()));
	  if (Entry == MergedDefModules.end())
	    return;

	  auto &Modules = Entry->second;
	  llvm::DenseSet<Module*> Seen;
	  // First pass: overwrite every repeat with nullptr.
	  for (Module *&Slot : Modules) {
	    const bool FirstOccurrence = Seen.insert(Slot).second;
	    if (!FirstOccurrence)
	      Slot = nullptr;
	  }
	  // Second pass: compact the list by dropping the nullptr entries.
	  auto NewEnd = std::remove(Modules.begin(), Modules.end(), nullptr);
	  Modules.erase(NewEnd, Modules.end());
	}

	/// Return the modules recorded as having a merged definition of \p Def's
	/// canonical declaration, or an empty list if none are recorded.
	ArrayRef<Module *>
	ASTContext::getModulesWithMergedDefinition(const NamedDecl *Def) {
	  const auto *CanonicalDef = cast<NamedDecl>(Def->getCanonicalDecl());
	  auto Found = MergedDefModules.find(CanonicalDef);
	  if (Found != MergedDefModules.end())
	    return Found->second;
	  return None;
	}

	/// Turn every pending lazy initializer ID into a declaration by asking the
	/// context's external source, and append the results to Initializers.
	/// No-op when there are no lazy initializers.
	void ASTContext::PerModuleInitializers::resolve(ASTContext &Ctx) {
	  if (LazyInitializers.empty())
	    return;

	  auto *ExternalSrc = Ctx.getExternalSource();
	  assert(ExternalSrc && "lazy initializers but no external source");

	  // Take the pending IDs out of the member before loading anything.
	  auto PendingIDs = std::move(LazyInitializers);
	  LazyInitializers.clear();

	  for (auto PendingID : PendingIDs) {
	    Initializers.push_back(ExternalSrc->GetExternalDecl(PendingID));
	  }

	  assert(LazyInitializers.empty() &&
	         "GetExternalDecl for lazy module initializer added more inits");
	}

	void ASTContext::addModuleInitializer(Module M, Decl D) {
	// One special case: if we add a module initializer that imports another
	// module, and that module's only initializer is an ImportDecl, simplify.
	if (const auto *ID = dyn_cast<ImportDecl>(D)) {
	auto It = ModuleInitializers.find(ID->getImportedModule());

	// Maybe the ImportDecl does nothing at all. (Common case.)
	if (It == ModuleInitializers.end())
	return;

	// Maybe the ImportDecl only imports another ImportDecl.
	auto &Imported = *It->second;
	if (Imported.Initializers.size() + Imported.LazyInitializers.size() == 1) {
	Imported.resolve(*this);
	auto *OnlyDecl = Imported.Initializers.front();
	if (isa<ImportDecl>(OnlyDecl))
	D = OnlyDecl;
	}
	}

	auto *&Inits = ModuleInitializers[M];
	if (!Inits)
	Inits = new (*this) PerModuleInitializers;
	Inits->Initializers.push_back(D);
	}

	/// Append \p IDs to the lazy initializer list of module \p M, creating the
	/// per-module record if it does not exist yet.
	void ASTContext::addLazyModuleInitializers(Module *M, ArrayRef<uint32_t> IDs) {
	  PerModuleInitializers *&Record = ModuleInitializers[M];
	  if (Record == nullptr)
	    Record = new (*this) PerModuleInitializers;

	  auto &Lazy = Record->LazyInitializers;
	  Lazy.insert(Lazy.end(), IDs.begin(), IDs.end());
	}

	/// Return the initializers recorded for module \p M, resolving any lazy
	/// ones first. Returns an empty list if \p M has no record.
	ArrayRef<Decl *> ASTContext::getModuleInitializers(Module *M) {
	  auto It = ModuleInitializers.find(M);
	  if (It == ModuleInitializers.end())
	    return None;

	  auto *Inits = It->second;
	  Inits->resolve(*this);
	  return Inits->Initializers;
	}

	/// Return the ExternCContextDecl of this context, creating it under the
	/// translation unit on the first call.
	ExternCContextDecl *ASTContext::getExternCContextDecl() const {
	  if (ExternCContext)
	    return ExternCContext;

	  ExternCContext = ExternCContextDecl::Create(*this, getTranslationUnitDecl());
	  return ExternCContext;
	}

	/// Create an implicit builtin template declaration of kind \p BTK named
	/// \p II and add it to the translation unit.
	BuiltinTemplateDecl *
	ASTContext::buildBuiltinTemplateDecl(BuiltinTemplateKind BTK,
	                                     const IdentifierInfo *II) const {
	  auto *BuiltinTemplate = BuiltinTemplateDecl::Create(*this, TUDecl, II, BTK);
	  BuiltinTemplate->setImplicit();
	  TUDecl->addDecl(BuiltinTemplate);

	  return BuiltinTemplate;
	}

	/// Return the declaration of the __make_integer_seq builtin template,
	/// building it on the first call.
	BuiltinTemplateDecl *
	ASTContext::getMakeIntegerSeqDecl() const {
	  if (MakeIntegerSeqDecl)
	    return MakeIntegerSeqDecl;

	  MakeIntegerSeqDecl =
	      buildBuiltinTemplateDecl(BTK__make_integer_seq, getMakeIntegerSeqName());
	  return MakeIntegerSeqDecl;
	}

	/// Return the declaration of the __type_pack_element builtin template,
	/// building it on the first call.
	BuiltinTemplateDecl *
	ASTContext::getTypePackElementDecl() const {
	  if (TypePackElementDecl)
	    return TypePackElementDecl;

	  TypePackElementDecl = buildBuiltinTemplateDecl(BTK__type_pack_element,
	                                                 getTypePackElementName());
	  return TypePackElementDecl;
	}

	/// Create an implicit record named \p Name with tag kind \p TK in the
	/// translation unit. A CXXRecordDecl is created when compiling C++, a plain
	/// RecordDecl otherwise. The record is given default type visibility.
	RecordDecl *ASTContext::buildImplicitRecord(StringRef Name,
	                                            RecordDecl::TagKind TK) const {
	  // Implicit declarations have no source position.
	  SourceLocation NoLoc;
	  RecordDecl *Record;
	  if (!getLangOpts().CPlusPlus)
	    Record = RecordDecl::Create(*this, TK, getTranslationUnitDecl(), NoLoc,
	                                NoLoc, &Idents.get(Name));
	  else
	    Record = CXXRecordDecl::Create(*this, TK, getTranslationUnitDecl(), NoLoc,
	                                   NoLoc, &Idents.get(Name));

	  Record->setImplicit();
	  auto &MutableThis = const_cast<ASTContext &>(*this);
	  Record->addAttr(TypeVisibilityAttr::CreateImplicit(
	      MutableThis, TypeVisibilityAttr::Default));
	  return Record;
	}

	/// Create an implicit typedef named \p Name for type \p T in the translation
	/// unit, using trivial type source info and empty source locations.
	TypedefDecl *ASTContext::buildImplicitTypedef(QualType T,
	                                              StringRef Name) const {
	  TypeSourceInfo *TrivialInfo = getTrivialTypeSourceInfo(T);
	  auto &MutableThis = const_cast<ASTContext &>(*this);
	  TypedefDecl *Typedef =
	      TypedefDecl::Create(MutableThis, getTranslationUnitDecl(),
	                          SourceLocation(), SourceLocation(),
	                          &Idents.get(Name), TrivialInfo);
	  Typedef->setImplicit();
	  return Typedef;
	}

	/// Return the implicit typedef __int128_t, building it on the first call.
	TypedefDecl *ASTContext::getInt128Decl() const {
	  if (Int128Decl)
	    return Int128Decl;

	  Int128Decl = buildImplicitTypedef(Int128Ty, "__int128_t");
	  return Int128Decl;
	}

	/// Return the implicit typedef __uint128_t, building it on the first call.
	TypedefDecl *ASTContext::getUInt128Decl() const {
	  if (UInt128Decl)
	    return UInt128Decl;

	  UInt128Decl = buildImplicitTypedef(UnsignedInt128Ty, "__uint128_t");
	  return UInt128Decl;
	}

	/// Allocate the builtin type of kind \p K in this context, store it in \p R
	/// and register it in the Types list.
	void ASTContext::InitBuiltinType(CanQualType &R, BuiltinType::Kind K) {
	  auto *Ty = new (*this, TypeAlignment) BuiltinType(K);
	  R = CanQualType::CreateUnsafe(QualType(Ty, 0));
	  Types.push_back(Ty);
	}

	/// Bind this context to \p Target (and optionally \p AuxTarget) and create
	/// all builtin types.
	///
	/// Must be called once per context: it asserts that the target is not being
	/// changed and that VoidTy has not been initialized yet. It also creates the
	/// C++ ABI object and selects the address space map and its mangling mode.
	/// Several types depend on language options or on the target (char and
	/// wchar_t signedness, char16/32 in C, OpenCL/OpenMP/matrix/SVE types).
	void ASTContext::InitBuiltinTypes(const TargetInfo &Target,
	                                  const TargetInfo *AuxTarget) {
	  assert((!this->Target || this->Target == &Target) &&
	         "Incorrect target reinitialization");
	  assert(VoidTy.isNull() && "Context reinitialized?");

	  this->Target = &Target;
	  this->AuxTarget = AuxTarget;

	  ABI.reset(createCXXABI(Target));
	  AddrSpaceMap = getAddressSpaceMap(Target, LangOpts);
	  AddrSpaceMapMangling = isAddrSpaceMapManglingEnabled(Target, LangOpts);

	  // C99 6.2.5p19.
	  InitBuiltinType(VoidTy, BuiltinType::Void);

	  // C99 6.2.5p2.
	  InitBuiltinType(BoolTy, BuiltinType::Bool);
	  // C99 6.2.5p3.
	  if (LangOpts.CharIsSigned)
	    InitBuiltinType(CharTy, BuiltinType::Char_S);
	  else
	    InitBuiltinType(CharTy, BuiltinType::Char_U);
	  // C99 6.2.5p4.
	  InitBuiltinType(SignedCharTy, BuiltinType::SChar);
	  InitBuiltinType(ShortTy, BuiltinType::Short);
	  InitBuiltinType(IntTy, BuiltinType::Int);
	  InitBuiltinType(LongTy, BuiltinType::Long);
	  InitBuiltinType(LongLongTy, BuiltinType::LongLong);

	  // C99 6.2.5p6.
	  InitBuiltinType(UnsignedCharTy, BuiltinType::UChar);
	  InitBuiltinType(UnsignedShortTy, BuiltinType::UShort);
	  InitBuiltinType(UnsignedIntTy, BuiltinType::UInt);
	  InitBuiltinType(UnsignedLongTy, BuiltinType::ULong);
	  InitBuiltinType(UnsignedLongLongTy, BuiltinType::ULongLong);

	  // C99 6.2.5p10.
	  InitBuiltinType(FloatTy, BuiltinType::Float);
	  InitBuiltinType(DoubleTy, BuiltinType::Double);
	  InitBuiltinType(LongDoubleTy, BuiltinType::LongDouble);

	  // GNU extension, __float128 for IEEE quadruple precision
	  InitBuiltinType(Float128Ty, BuiltinType::Float128);

	  // C11 extension ISO/IEC TS 18661-3
	  InitBuiltinType(Float16Ty, BuiltinType::Float16);

	  // ISO/IEC JTC1 SC22 WG14 N1169 Extension
	  InitBuiltinType(ShortAccumTy, BuiltinType::ShortAccum);
	  InitBuiltinType(AccumTy, BuiltinType::Accum);
	  InitBuiltinType(LongAccumTy, BuiltinType::LongAccum);
	  InitBuiltinType(UnsignedShortAccumTy, BuiltinType::UShortAccum);
	  InitBuiltinType(UnsignedAccumTy, BuiltinType::UAccum);
	  InitBuiltinType(UnsignedLongAccumTy, BuiltinType::ULongAccum);
	  InitBuiltinType(ShortFractTy, BuiltinType::ShortFract);
	  InitBuiltinType(FractTy, BuiltinType::Fract);
	  InitBuiltinType(LongFractTy, BuiltinType::LongFract);
	  InitBuiltinType(UnsignedShortFractTy, BuiltinType::UShortFract);
	  InitBuiltinType(UnsignedFractTy, BuiltinType::UFract);
	  InitBuiltinType(UnsignedLongFractTy, BuiltinType::ULongFract);
	  InitBuiltinType(SatShortAccumTy, BuiltinType::SatShortAccum);
	  InitBuiltinType(SatAccumTy, BuiltinType::SatAccum);
	  InitBuiltinType(SatLongAccumTy, BuiltinType::SatLongAccum);
	  InitBuiltinType(SatUnsignedShortAccumTy, BuiltinType::SatUShortAccum);
	  InitBuiltinType(SatUnsignedAccumTy, BuiltinType::SatUAccum);
	  InitBuiltinType(SatUnsignedLongAccumTy, BuiltinType::SatULongAccum);
	  InitBuiltinType(SatShortFractTy, BuiltinType::SatShortFract);
	  InitBuiltinType(SatFractTy, BuiltinType::SatFract);
	  InitBuiltinType(SatLongFractTy, BuiltinType::SatLongFract);
	  InitBuiltinType(SatUnsignedShortFractTy, BuiltinType::SatUShortFract);
	  InitBuiltinType(SatUnsignedFractTy, BuiltinType::SatUFract);
	  InitBuiltinType(SatUnsignedLongFractTy, BuiltinType::SatULongFract);

	  // GNU extension, 128-bit integers.
	  InitBuiltinType(Int128Ty, BuiltinType::Int128);
	  InitBuiltinType(UnsignedInt128Ty, BuiltinType::UInt128);

	  // C++ 3.9.1p5
	  if (TargetInfo::isTypeSigned(Target.getWCharType()))
	    InitBuiltinType(WCharTy, BuiltinType::WChar_S);
	  else // -fshort-wchar makes wchar_t be unsigned.
	    InitBuiltinType(WCharTy, BuiltinType::WChar_U);
	  if (LangOpts.CPlusPlus && LangOpts.WChar)
	    WideCharTy = WCharTy;
	  else {
	    // C99 (or C++ using -fno-wchar).
	    WideCharTy = getFromTargetType(Target.getWCharType());
	  }

	  WIntTy = getFromTargetType(Target.getWIntType());

	  // C++20 (proposed)
	  InitBuiltinType(Char8Ty, BuiltinType::Char8);

	  if (LangOpts.CPlusPlus) // C++0x 3.9.1p5, extension for C++
	    InitBuiltinType(Char16Ty, BuiltinType::Char16);
	  else // C99
	    Char16Ty = getFromTargetType(Target.getChar16Type());

	  if (LangOpts.CPlusPlus) // C++0x 3.9.1p5, extension for C++
	    InitBuiltinType(Char32Ty, BuiltinType::Char32);
	  else // C99
	    Char32Ty = getFromTargetType(Target.getChar32Type());

	  // Placeholder type for type-dependent expressions whose type is
	  // completely unknown. No code should ever check a type against
	  // DependentTy and users should never see it; however, it is here to
	  // help diagnose failures to properly check for type-dependent
	  // expressions.
	  InitBuiltinType(DependentTy, BuiltinType::Dependent);

	  // Placeholder type for functions.
	  InitBuiltinType(OverloadTy, BuiltinType::Overload);

	  // Placeholder type for bound members.
	  InitBuiltinType(BoundMemberTy, BuiltinType::BoundMember);

	  // Placeholder type for pseudo-objects.
	  InitBuiltinType(PseudoObjectTy, BuiltinType::PseudoObject);

	  // "any" type; useful for debugger-like clients.
	  InitBuiltinType(UnknownAnyTy, BuiltinType::UnknownAny);

	  // Placeholder type for unbridged ARC casts.
	  InitBuiltinType(ARCUnbridgedCastTy, BuiltinType::ARCUnbridgedCast);

	  // Placeholder type for builtin functions.
	  InitBuiltinType(BuiltinFnTy, BuiltinType::BuiltinFn);

	  // Placeholder type for OMP array sections.
	  if (LangOpts.OpenMP) {
	    InitBuiltinType(OMPArraySectionTy, BuiltinType::OMPArraySection);
	    InitBuiltinType(OMPArrayShapingTy, BuiltinType::OMPArrayShaping);
	    InitBuiltinType(OMPIteratorTy, BuiltinType::OMPIterator);
	  }
	  if (LangOpts.MatrixTypes)
	    InitBuiltinType(IncompleteMatrixIdxTy, BuiltinType::IncompleteMatrixIdx);

	  // C99 6.2.5p11.
	  FloatComplexTy = getComplexType(FloatTy);
	  DoubleComplexTy = getComplexType(DoubleTy);
	  LongDoubleComplexTy = getComplexType(LongDoubleTy);
	  Float128ComplexTy = getComplexType(Float128Ty);

	  // Builtin types for 'id', 'Class', and 'SEL'.
	  InitBuiltinType(ObjCBuiltinIdTy, BuiltinType::ObjCId);
	  InitBuiltinType(ObjCBuiltinClassTy, BuiltinType::ObjCClass);
	  InitBuiltinType(ObjCBuiltinSelTy, BuiltinType::ObjCSel);

	  if (LangOpts.OpenCL) {
	#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
	    InitBuiltinType(SingletonId, BuiltinType::Id);
	#include "clang/Basic/OpenCLImageTypes.def"

	    InitBuiltinType(OCLSamplerTy, BuiltinType::OCLSampler);
	    InitBuiltinType(OCLEventTy, BuiltinType::OCLEvent);
	    InitBuiltinType(OCLClkEventTy, BuiltinType::OCLClkEvent);
	    InitBuiltinType(OCLQueueTy, BuiltinType::OCLQueue);
	    InitBuiltinType(OCLReserveIDTy, BuiltinType::OCLReserveID);

	#define EXT_OPAQUE_TYPE(ExtType, Id, Ext) \
	    InitBuiltinType(Id##Ty, BuiltinType::Id);
	#include "clang/Basic/OpenCLExtensionTypes.def"
	  }

	  if (Target.hasAArch64SVETypes()) {
	#define SVE_TYPE(Name, Id, SingletonId) \
	    InitBuiltinType(SingletonId, BuiltinType::Id);
	#include "clang/Basic/AArch64SVEACLETypes.def"
	  }

	  // Builtin type for __objc_yes and __objc_no
	  ObjCBuiltinBoolTy = (Target.useSignedCharForObjCBool() ?
	                       SignedCharTy : BoolTy);

	  ObjCConstantStringType = QualType();

	  ObjCSuperType = QualType();

	  // void * type
	  if (LangOpts.OpenCLVersion >= 200) {
	    // From OpenCL 2.0 on, 'void *' points into the generic address space.
	    auto Q = VoidTy.getQualifiers();
	    Q.setAddressSpace(LangAS::opencl_generic);
	    VoidPtrTy = getPointerType(getCanonicalType(
	        getQualifiedType(VoidTy.getUnqualifiedType(), Q)));
	  } else {
	    VoidPtrTy = getPointerType(VoidTy);
	  }

	  // nullptr type (C++0x 2.14.7)
	  InitBuiltinType(NullPtrTy, BuiltinType::NullPtr);

	  // half type (OpenCL 6.1.1.1) / ARM NEON __fp16
	  InitBuiltinType(HalfTy, BuiltinType::Half);

	  InitBuiltinType(BFloat16Ty, BuiltinType::BFloat16);

	  // Builtin type used to help define __builtin_va_list.
	  VaListTagDecl = nullptr;

	  // MSVC predeclares struct _GUID, and we need it to create MSGuidDecls.
	  if (LangOpts.MicrosoftExt || LangOpts.Borland) {
	    MSGuidTagDecl = buildImplicitRecord("_GUID");
	    TUDecl->addDecl(MSGuidTagDecl);
	  }
	}

	/// Return the diagnostics engine of the source manager this context uses.
	DiagnosticsEngine &ASTContext::getDiagnostics() const {
	  DiagnosticsEngine &Diags = SourceMgr.getDiagnostics();
	  return Diags;
	}

	/// Return the attribute vector for \p D, allocating an empty one in this
	/// context's memory the first time \p D is seen.
	AttrVec& ASTContext::getDeclAttrs(const Decl *D) {
	  AttrVec *&Slot = DeclAttrs[D];
	  if (Slot)
	    return *Slot;

	  // Placement-new into context-owned storage.
	  void *Storage = Allocate(sizeof(AttrVec));
	  Slot = new (Storage) AttrVec;
	  return *Slot;
	}

	/// Erase the attributes corresponding to the given declaration.
	///
	/// The attribute vector's destructor is run explicitly before its map entry
	/// is removed. Does nothing if \p D has no entry.
	void ASTContext::eraseDeclAttrs(const Decl *D) {
	  auto Pos = DeclAttrs.find(D);
	  if (Pos != DeclAttrs.end()) {
	    Pos->second->~AttrVec();
	    DeclAttrs.erase(Pos);
	  }
	}

	// FIXME: Remove ?
	/// Return the MemberSpecializationInfo recorded for static data member
	/// \p Var, or null if the recorded info is of a different kind or absent.
	MemberSpecializationInfo *
	ASTContext::getInstantiatedFromStaticDataMember(const VarDecl *Var) {
	  assert(Var->isStaticDataMember() && "Not a static data member");
	  TemplateOrSpecializationInfo Info = getTemplateOrSpecializationInfo(Var);
	  return Info.dyn_cast<MemberSpecializationInfo *>();
	}

	/// Look up what is recorded in TemplateOrInstantiation for \p Var; returns a
	/// value-initialized result when there is no entry.
	ASTContext::TemplateOrSpecializationInfo
	ASTContext::getTemplateOrSpecializationInfo(const VarDecl *Var) {
	  auto Found = TemplateOrInstantiation.find(Var);
	  if (Found != TemplateOrInstantiation.end())
	    return Found->second;

	  return {};
	}

	void
	ASTContext::setInstantiatedFromStaticDataMember(VarDecl Inst, VarDecl Tmpl,
	TemplateSpecializationKind TSK,
	SourceLocation PointOfInstantiation) {
	assert(Inst->isStaticDataMember() && "Not a static data member");
	assert(Tmpl->isStaticDataMember() && "Not a static data member");
	setTemplateOrSpecializationInfo(Inst, new (*this) MemberSpecializationInfo(
	Tmpl, TSK, PointOfInstantiation));
	}

	/// Store \p TSI as the template/specialization info for \p Inst. Asserts
	/// that nothing has been recorded for \p Inst yet.
	void
	ASTContext::setTemplateOrSpecializationInfo(VarDecl *Inst,
	                                            TemplateOrSpecializationInfo TSI) {
	  TemplateOrSpecializationInfo &Slot = TemplateOrInstantiation[Inst];
	  assert(!Slot &&
	         "Already noted what the variable was instantiated from");
	  Slot = TSI;
	}

	/// Return the declaration recorded for \p UUD in InstantiatedFromUsingDecl,
	/// or null when there is no entry.
	NamedDecl *
	ASTContext::getInstantiatedFromUsingDecl(NamedDecl *UUD) {
	  auto Found = InstantiatedFromUsingDecl.find(UUD);
	  if (Found != InstantiatedFromUsingDecl.end())
	    return Found->second;

	  return nullptr;
	}

	void
	ASTContext::setInstantiatedFromUsingDecl(NamedDecl Inst, NamedDecl Pattern) {
	assert((isa<UsingDecl>(Pattern) \|\|
	isa<UnresolvedUsingValueDecl>(Pattern) \|\|
	isa<UnresolvedUsingTypenameDecl>(Pattern)) &&
	"pattern decl is not a using decl");
	assert((isa<UsingDecl>(Inst) \|\|
	isa<UnresolvedUsingValueDecl>(Inst) \|\|
	isa<UnresolvedUsingTypenameDecl>(Inst)) &&
	"instantiation did not produce a using decl");
	assert(!InstantiatedFromUsingDecl[Inst] && "pattern already exists");
	InstantiatedFromUsingDecl[Inst] = Pattern;
	}

	/// Return the pattern recorded for the using-shadow declaration \p Inst,
	/// or null when there is no entry.
	UsingShadowDecl *
	ASTContext::getInstantiatedFromUsingShadowDecl(UsingShadowDecl *Inst) {
	  // The map is keyed and valued by UsingShadowDecl pointers; deduce the
	  // iterator type instead of repeating the map type.
	  auto Pos = InstantiatedFromUsingShadowDecl.find(Inst);
	  if (Pos == InstantiatedFromUsingShadowDecl.end())
	    return nullptr;

	  return Pos->second;
	}

	/// Record \p Pattern as the declaration \p Inst was instantiated from.
	/// Asserts that no pattern has been recorded for \p Inst yet.
	void
	ASTContext::setInstantiatedFromUsingShadowDecl(UsingShadowDecl *Inst,
	                                               UsingShadowDecl *Pattern) {
	  UsingShadowDecl *&Slot = InstantiatedFromUsingShadowDecl[Inst];
	  assert(!Slot && "pattern already exists");
	  Slot = Pattern;
	}

	/// Return the field recorded for \p Field in
	/// InstantiatedFromUnnamedFieldDecl, or null when there is no entry.
	FieldDecl *ASTContext::getInstantiatedFromUnnamedFieldDecl(FieldDecl *Field) {
	  // Keys and values are FieldDecl pointers (the setter stores pointers and
	  // this function returns nullptr on a miss).
	  auto Pos = InstantiatedFromUnnamedFieldDecl.find(Field);
	  if (Pos == InstantiatedFromUnnamedFieldDecl.end())
	    return nullptr;

	  return Pos->second;
	}

	/// Record that the unnamed field \p Inst was instantiated from the unnamed
	/// field \p Tmpl. Asserts that both fields are unnamed and that nothing
	/// has been recorded for \p Inst yet.
	void ASTContext::setInstantiatedFromUnnamedFieldDecl(FieldDecl *Inst,
	                                                     FieldDecl *Tmpl) {
	  assert(!Inst->getDeclName() && "Instantiated field decl is not unnamed");
	  assert(!Tmpl->getDeclName() && "Template field decl is not unnamed");

	  FieldDecl *&Slot = InstantiatedFromUnnamedFieldDecl[Inst];
	  assert(!Slot &&
	         "Already noted what unnamed field was instantiated from");
	  Slot = Tmpl;
	}

	/// Iterator to the first method recorded as overridden by \p Method.
	ASTContext::overridden_cxx_method_iterator
	ASTContext::overridden_methods_begin(const CXXMethodDecl *Method) const {
	  auto Overridden = overridden_methods(Method);
	  return Overridden.begin();
	}

	/// Past-the-end iterator for the methods recorded as overridden by
	/// \p Method.
	ASTContext::overridden_cxx_method_iterator
	ASTContext::overridden_methods_end(const CXXMethodDecl *Method) const {
	  auto Overridden = overridden_methods(Method);
	  return Overridden.end();
	}

	/// Number of methods recorded as overridden by \p Method.
	unsigned
	ASTContext::overridden_methods_size(const CXXMethodDecl *Method) const {
	  auto Overridden = overridden_methods(Method);
	  auto First = Overridden.begin();
	  auto Last = Overridden.end();
	  return Last - First;
	}

	/// Return the range of methods recorded as overridden by the canonical
	/// declaration of \p Method; the range is empty when nothing is recorded.
	ASTContext::overridden_method_range
	ASTContext::overridden_methods(const CXXMethodDecl *Method) const {
	  auto Found = OverriddenMethods.find(Method->getCanonicalDecl());
	  if (Found != OverriddenMethods.end())
	    return overridden_method_range(Found->second.begin(),
	                                   Found->second.end());
	  return overridden_method_range(nullptr, nullptr);
	}

	void ASTContext::addOverriddenMethod(const CXXMethodDecl *Method,
	const CXXMethodDecl *Overridden) {
	assert(Method->isCanonicalDecl() && Overridden->isCanonicalDecl());
	OverriddenMethods[Method].push_back(Overridden);
	}

	void ASTContext::getOverriddenMethods(
	const NamedDecl *D,
	SmallVectorImpl<const NamedDecl *> &Overridden) const {
	assert(D);

	if (const auto *CXXMethod = dyn_cast<CXXMethodDecl>(D)) {
	Overridden.append(overridden_methods_begin(CXXMethod),
	overridden_methods_end(CXXMethod));
	return;
	}

	const auto *Method = dyn_cast<ObjCMethodDecl>(D);
	if (!Method)
	return;

	SmallVector<const ObjCMethodDecl *, 8> OverDecls;
	Method->getOverriddenMethods(OverDecls);
	Overridden.append(OverDecls.begin(), OverDecls.end());
	}

	/// Append \p Import to the chain of local import declarations.
	///
	/// \p Import must not already be linked into a chain and must not come
	/// from an AST file (both asserted).
	void ASTContext::addedLocalImportDecl(ImportDecl *Import) {
	  assert(!Import->getNextLocalImport() &&
	         "Import declaration already in the chain");
	  assert(!Import->isFromASTFile() && "Non-local import declaration");

	  // An empty chain gets a new head; otherwise link after the current tail.
	  if (FirstLocalImport)
	    LastLocalImport->setNextLocalImport(Import);
	  else
	    FirstLocalImport = Import;

	  LastLocalImport = Import;
	}

	//===----------------------------------------------------------------------===//
	// Type Sizing and Analysis
	//===----------------------------------------------------------------------===//

	/// getFloatTypeSemantics - Map a scalar floating point builtin type to the
	/// APFloat semantics the target uses for it.
	///
	/// For long double and __float128, the auxiliary target's format is used
	/// when both the OpenMP and OpenMPIsDevice language options are set.
	const llvm::fltSemantics &ASTContext::getFloatTypeSemantics(QualType T) const {
	  const BuiltinType::Kind Kind = T->castAs<BuiltinType>()->getKind();
	  switch (Kind) {
	  case BuiltinType::BFloat16:
	    return Target->getBFloat16Format();
	  case BuiltinType::Float16:
	  case BuiltinType::Half:
	    return Target->getHalfFormat();
	  case BuiltinType::Float:
	    return Target->getFloatFormat();
	  case BuiltinType::Double:
	    return Target->getDoubleFormat();
	  case BuiltinType::LongDouble:
	    return (getLangOpts().OpenMP && getLangOpts().OpenMPIsDevice)
	               ? AuxTarget->getLongDoubleFormat()
	               : Target->getLongDoubleFormat();
	  case BuiltinType::Float128:
	    return (getLangOpts().OpenMP && getLangOpts().OpenMPIsDevice)
	               ? AuxTarget->getFloat128Format()
	               : Target->getFloat128Format();
	  default:
	    llvm_unreachable("Not a floating point type!");
	  }
	}

	/// Compute the alignment of the declaration \p D.
	///
	/// The computation is done in bits and converted to CharUnits on return.
	///
	/// \param D the declaration to query.
	/// \param ForAlignof when true, a reference is measured as its pointee
	///        type (rather than as a pointer), and the large-array and
	///        minimum-global-alignment adjustments are skipped.
	CharUnits ASTContext::getDeclAlign(const Decl *D, bool ForAlignof) const {
	  unsigned Align = Target->getCharWidth();

	  bool UseAlignAttrOnly = false;
	  if (unsigned AlignFromAttr = D->getMaxAlignment()) {
	    Align = AlignFromAttr;

	    // __attribute__((aligned)) can increase or decrease alignment
	    // *except* on a struct or struct member, where it only increases
	    // alignment unless 'packed' is also specified.
	    //
	    // It is an error for alignas to decrease alignment, so we can
	    // ignore that possibility; Sema should diagnose it.
	    if (isa<FieldDecl>(D)) {
	      UseAlignAttrOnly = D->hasAttr<PackedAttr>() ||
	        cast<FieldDecl>(D)->getParent()->hasAttr<PackedAttr>();
	    } else {
	      UseAlignAttrOnly = true;
	    }
	  }
	  else if (isa<FieldDecl>(D))
	      UseAlignAttrOnly =
	        D->hasAttr<PackedAttr>() ||
	        cast<FieldDecl>(D)->getParent()->hasAttr<PackedAttr>();

	  // If we're using the align attribute only, just ignore everything
	  // else about the declaration and its type.
	  if (UseAlignAttrOnly) {
	    // do nothing
	  } else if (const auto *VD = dyn_cast<ValueDecl>(D)) {
	    QualType T = VD->getType();
	    if (const auto *RT = T->getAs<ReferenceType>()) {
	      if (ForAlignof)
	        T = RT->getPointeeType();
	      else
	        T = getPointerType(RT->getPointeeType());
	    }
	    QualType BaseT = getBaseElementType(T);
	    if (T->isFunctionType())
	      Align = getTypeInfoImpl(T.getTypePtr()).Align;
	    else if (!BaseT->isIncompleteType()) {
	      // Adjust alignments of declarations with array type by the
	      // large-array alignment on the target.
	      if (const ArrayType *arrayType = getAsArrayType(T)) {
	        unsigned MinWidth = Target->getLargeArrayMinWidth();
	        if (!ForAlignof && MinWidth) {
	          if (isa<VariableArrayType>(arrayType))
	            Align = std::max(Align, Target->getLargeArrayAlign());
	          else if (isa<ConstantArrayType>(arrayType) &&
	                   MinWidth <= getTypeSize(cast<ConstantArrayType>(arrayType)))
	            Align = std::max(Align, Target->getLargeArrayAlign());
	        }
	      }
	      Align = std::max(Align, getPreferredTypeAlign(T.getTypePtr()));
	      if (BaseT.getQualifiers().hasUnaligned())
	        Align = Target->getCharWidth();
	      // Named 'Var' so it does not shadow the enclosing ValueDecl 'VD'.
	      if (const auto *Var = dyn_cast<VarDecl>(D)) {
	        if (Var->hasGlobalStorage() && !ForAlignof) {
	          uint64_t TypeSize = getTypeSize(T.getTypePtr());
	          Align = std::max(Align, getTargetInfo().getMinGlobalAlign(TypeSize));
	        }
	      }
	    }

	    // Fields can be subject to extra alignment constraints, like if
	    // the field is packed, the struct is packed, or the struct has a
	    // max-field-alignment constraint (#pragma pack).  So calculate
	    // the actual alignment of the field within the struct, and then
	    // (as we're expected to) constrain that by the alignment of the type.
	    if (const auto *Field = dyn_cast<FieldDecl>(VD)) {
	      const RecordDecl *Parent = Field->getParent();
	      // We can only produce a sensible answer if the record is valid.
	      if (!Parent->isInvalidDecl()) {
	        const ASTRecordLayout &Layout = getASTRecordLayout(Parent);

	        // Start with the record's overall alignment.
	        unsigned FieldAlign = toBits(Layout.getAlignment());

	        // Use the GCD of that and the offset within the record.
	        uint64_t Offset = Layout.getFieldOffset(Field->getFieldIndex());
	        if (Offset > 0) {
	          // Alignment is always a power of 2, so the GCD will be a power of 2,
	          // which means we get to do this crazy thing instead of Euclid's.
	          uint64_t LowBitOfOffset = Offset & (~Offset + 1);
	          if (LowBitOfOffset < FieldAlign)
	            FieldAlign = static_cast<unsigned>(LowBitOfOffset);
	        }

	        Align = std::min(Align, FieldAlign);
	      }
	    }
	  }

	  return toCharUnitsFromBits(Align);
	}

	/// Return the target's exception-object alignment, converted from bits to
	/// CharUnits.
	CharUnits ASTContext::getExnObjectAlignment() const {
	  const auto AlignInBits = Target->getExnObjectAlignment();
	  return toCharUnitsFromBits(AlignInBits);
	}

	// getTypeInfoDataSizeInChars - Return the (size, alignment) of a type in
	// chars, substituting the data size for the size when the type is a record
	// and the language is C++.  The data size is the size of the memcpy that's
	// performed when assigning this type using a trivial copy/move assignment
	// operator.
	std::pair<CharUnits, CharUnits>
	ASTContext::getTypeInfoDataSizeInChars(QualType T) const {
	  std::pair<CharUnits, CharUnits> Info = getTypeInfoInChars(T);

	  // Outside C++ the plain size/alignment pair is the answer.
	  if (!getLangOpts().CPlusPlus)
	    return Info;

	  // In C++, objects can sometimes be allocated into the tail padding
	  // of a base-class subobject.  That is decided during class layout, so
	  // the layout's data size is taken as-is.
	  if (const auto *RT = T->getAs<RecordType>())
	    Info.first = getASTRecordLayout(RT->getDecl()).getDataSize();

	  return Info;
	}

	/// getConstantArrayInfoInChars - Performing the computation in CharUnits
	/// instead of in bits prevents overflowing the uint64_t for some large arrays.
	///
	/// \returns the (size, alignment) of \p CAT in CharUnits. The size is
	/// element size times element count, rounded up to the element alignment
	/// unless the target uses the Microsoft C++ ABI with a pointer width other
	/// than 64 bits.
	static std::pair<CharUnits, CharUnits>
	getConstantArrayInfoInChars(const ASTContext &Context,
	                            const ConstantArrayType *CAT) {
	  std::pair<CharUnits, CharUnits> EltInfo =
	      Context.getTypeInfoInChars(CAT->getElementType());
	  uint64_t Size = CAT->getSize().getZExtValue();
	  assert((Size == 0 || static_cast<uint64_t>(EltInfo.first.getQuantity()) <=
	                           static_cast<uint64_t>(-1) / Size) &&
	         "Overflow in array type char size evaluation");
	  uint64_t Width = EltInfo.first.getQuantity() * Size;
	  unsigned Align = EltInfo.second.getQuantity();
	  if (!Context.getTargetInfo().getCXXABI().isMicrosoft() ||
	      Context.getTargetInfo().getPointerWidth(0) == 64)
	    Width = llvm::alignTo(Width, Align);
	  return std::make_pair(CharUnits::fromQuantity(Width),
	                        CharUnits::fromQuantity(Align));
	}

	std::pair<CharUnits, CharUnits>
	ASTContext::getTypeInfoInChars(const Type *T) const {
	if (const auto *CAT = dyn_cast<ConstantArrayType>(T))
	return getConstantArrayInfoInChars(*this, CAT);
	TypeInfo Info = getTypeInfo(T);
	return std::make_pair(toCharUnitsFromBits(Info.Width),
	toCharUnitsFromBits(Info.Align));
	}

	/// QualType overload: forwards the underlying type pointer to the
	/// `const Type *` overload.
	std::pair<CharUnits, CharUnits>
	ASTContext::getTypeInfoInChars(QualType T) const {
	  const Type *Ty = T.getTypePtr();
	  return getTypeInfoInChars(Ty);
	}

	bool ASTContext::isAlignmentRequired(const Type *T) const {
	return getTypeInfo(T).AlignIsRequired;
	}

	/// QualType overload: forwards the underlying type pointer to the
	/// `const Type *` overload.
	bool ASTContext::isAlignmentRequired(QualType T) const {
	  const Type *Ty = T.getTypePtr();
	  return isAlignmentRequired(Ty);
	}

	/// Return the alignment of \p T when it can be determined, or 0 otherwise.
	///
	/// A typedef alignment attribute wins; otherwise a complete base element
	/// type is measured with getTypeAlign; otherwise alignment attributes on
	/// the element typedef or on the tag declaration are consulted.
	unsigned ASTContext::getTypeAlignIfKnown(QualType T) const {
	  // Alignment attribute carried by a typedef, or 0 if Ty is not a typedef
	  // type or the typedef has no such attribute.
	  auto TypedefAlign = [](QualType Ty) -> unsigned {
	    if (const auto *TT = Ty->getAs<TypedefType>())
	      return TT->getDecl()->getMaxAlignment();
	    return 0;
	  };

	  // An alignment on a typedef overrides anything else.
	  if (unsigned Align = TypedefAlign(T))
	    return Align;

	  // If we have an (array of) complete type, we're done.
	  T = getBaseElementType(T);
	  if (!T->isIncompleteType())
	    return getTypeAlign(T);

	  // If we had an array type, its element type might be a typedef
	  // type with an alignment attribute.
	  if (unsigned Align = TypedefAlign(T))
	    return Align;

	  // Otherwise, see if the declaration of the type had an attribute.
	  const auto *Tag = T->getAs<TagType>();
	  return Tag ? Tag->getDecl()->getMaxAlignment() : 0;
	}

	/// Return the TypeInfo for \p T, computing it with getTypeInfoImpl on the
	/// first request and serving it from MemoizedTypeInfo afterwards.
	TypeInfo ASTContext::getTypeInfo(const Type *T) const {
	  auto Cached = MemoizedTypeInfo.find(T);
	  if (Cached != MemoizedTypeInfo.end())
	    return Cached->second;

	  // getTypeInfoImpl can invalidate the lookup above, so the result is
	  // stored through a fresh operator[] lookup rather than a saved iterator.
	  const TypeInfo Computed = getTypeInfoImpl(T);
	  MemoizedTypeInfo[T] = Computed;
	  return Computed;
	}

	/// getTypeInfoImpl - Return the size of the specified type, in bits.  This
	/// method does not work on incomplete types.
	///
	/// The result bundles the width, the alignment (both in bits) and whether
	/// the alignment is required. Sugar type classes are resolved by recursing
	/// through getTypeInfo on the underlying type; the remaining classes fill
	/// in Width/Align and fall out of the switch to the common return.
	///
	/// FIXME: Pointers into different addr spaces could have different sizes and
	/// alignment requirements: getPointerInfo should take an AddrSpace, this
	/// should take a QualType, &c.
	TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
	  uint64_t Width = 0;
	  unsigned Align = 8;
	  bool AlignIsRequired = false;
	  unsigned AS = 0;
	  switch (T->getTypeClass()) {
	#define TYPE(Class, Base)
	#define ABSTRACT_TYPE(Class, Base)
	#define NON_CANONICAL_TYPE(Class, Base)
	#define DEPENDENT_TYPE(Class, Base) case Type::Class:
	#define NON_CANONICAL_UNLESS_DEPENDENT_TYPE(Class, Base)                       \
	  case Type::Class:                                                            \
	  assert(!T->isDependentType() && "should not see dependent types here");      \
	  return getTypeInfo(cast<Class##Type>(T)->desugar().getTypePtr());
	#include "clang/AST/TypeNodes.inc"
	    llvm_unreachable("Should not see dependent types");

	  case Type::FunctionNoProto:
	  case Type::FunctionProto:
	    // GCC extension: alignof(function) = 32 bits
	    Width = 0;
	    Align = 32;
	    break;

	  case Type::IncompleteArray:
	  case Type::VariableArray:
	  case Type::ConstantArray: {
	    // Model non-constant sized arrays as size zero, but track the alignment.
	    uint64_t Size = 0;
	    if (const auto *CAT = dyn_cast<ConstantArrayType>(T))
	      Size = CAT->getSize().getZExtValue();

	    TypeInfo EltInfo = getTypeInfo(cast<ArrayType>(T)->getElementType());
	    assert((Size == 0 || EltInfo.Width <= (uint64_t)(-1) / Size) &&
	           "Overflow in array type bit size evaluation");
	    Width = EltInfo.Width * Size;
	    Align = EltInfo.Align;
	    AlignIsRequired = EltInfo.AlignIsRequired;
	    if (!getTargetInfo().getCXXABI().isMicrosoft() ||
	        getTargetInfo().getPointerWidth(0) == 64)
	      Width = llvm::alignTo(Width, Align);
	    break;
	  }

	  case Type::ExtVector:
	  case Type::Vector: {
	    const auto *VT = cast<VectorType>(T);
	    TypeInfo EltInfo = getTypeInfo(VT->getElementType());
	    Width = EltInfo.Width * VT->getNumElements();
	    Align = Width;
	    // If the alignment is not a power of 2, round up to the next power of 2.
	    // This happens for non-power-of-2 length vectors.
	    if (Align & (Align-1)) {
	      Align = llvm::NextPowerOf2(Align);
	      Width = llvm::alignTo(Width, Align);
	    }
	    // Adjust the alignment based on the target max.
	    uint64_t TargetVectorAlign = Target->getMaxVectorAlign();
	    if (TargetVectorAlign && TargetVectorAlign < Align)
	      Align = TargetVectorAlign;
	    break;
	  }

	  case Type::ConstantMatrix: {
	    const auto *MT = cast<ConstantMatrixType>(T);
	    TypeInfo ElementInfo = getTypeInfo(MT->getElementType());
	    // The internal layout of a matrix value is implementation defined.
	    // Initially be ABI compatible with arrays with respect to alignment and
	    // size.
	    Width = ElementInfo.Width * MT->getNumRows() * MT->getNumColumns();
	    Align = ElementInfo.Align;
	    break;
	  }

	  case Type::Builtin:
	    switch (cast<BuiltinType>(T)->getKind()) {
	    default: llvm_unreachable("Unknown builtin type!");
	    case BuiltinType::Void:
	      // GCC extension: alignof(void) = 8 bits.
	      Width = 0;
	      Align = 8;
	      break;
	    case BuiltinType::Bool:
	      Width = Target->getBoolWidth();
	      Align = Target->getBoolAlign();
	      break;
	    case BuiltinType::Char_S:
	    case BuiltinType::Char_U:
	    case BuiltinType::UChar:
	    case BuiltinType::SChar:
	    case BuiltinType::Char8:
	      Width = Target->getCharWidth();
	      Align = Target->getCharAlign();
	      break;
	    case BuiltinType::WChar_S:
	    case BuiltinType::WChar_U:
	      Width = Target->getWCharWidth();
	      Align = Target->getWCharAlign();
	      break;
	    case BuiltinType::Char16:
	      Width = Target->getChar16Width();
	      Align = Target->getChar16Align();
	      break;
	    case BuiltinType::Char32:
	      Width = Target->getChar32Width();
	      Align = Target->getChar32Align();
	      break;
	    case BuiltinType::UShort:
	    case BuiltinType::Short:
	      Width = Target->getShortWidth();
	      Align = Target->getShortAlign();
	      break;
	    case BuiltinType::UInt:
	    case BuiltinType::Int:
	      Width = Target->getIntWidth();
	      Align = Target->getIntAlign();
	      break;
	    case BuiltinType::ULong:
	    case BuiltinType::Long:
	      Width = Target->getLongWidth();
	      Align = Target->getLongAlign();
	      break;
	    case BuiltinType::ULongLong:
	    case BuiltinType::LongLong:
	      Width = Target->getLongLongWidth();
	      Align = Target->getLongLongAlign();
	      break;
	    case BuiltinType::Int128:
	    case BuiltinType::UInt128:
	      Width = 128;
	      Align = 128; // int128_t is 128-bit aligned on all targets.
	      break;
	    case BuiltinType::ShortAccum:
	    case BuiltinType::UShortAccum:
	    case BuiltinType::SatShortAccum:
	    case BuiltinType::SatUShortAccum:
	      Width = Target->getShortAccumWidth();
	      Align = Target->getShortAccumAlign();
	      break;
	    case BuiltinType::Accum:
	    case BuiltinType::UAccum:
	    case BuiltinType::SatAccum:
	    case BuiltinType::SatUAccum:
	      Width = Target->getAccumWidth();
	      Align = Target->getAccumAlign();
	      break;
	    case BuiltinType::LongAccum:
	    case BuiltinType::ULongAccum:
	    case BuiltinType::SatLongAccum:
	    case BuiltinType::SatULongAccum:
	      Width = Target->getLongAccumWidth();
	      Align = Target->getLongAccumAlign();
	      break;
	    case BuiltinType::ShortFract:
	    case BuiltinType::UShortFract:
	    case BuiltinType::SatShortFract:
	    case BuiltinType::SatUShortFract:
	      Width = Target->getShortFractWidth();
	      Align = Target->getShortFractAlign();
	      break;
	    case BuiltinType::Fract:
	    case BuiltinType::UFract:
	    case BuiltinType::SatFract:
	    case BuiltinType::SatUFract:
	      Width = Target->getFractWidth();
	      Align = Target->getFractAlign();
	      break;
	    case BuiltinType::LongFract:
	    case BuiltinType::ULongFract:
	    case BuiltinType::SatLongFract:
	    case BuiltinType::SatULongFract:
	      Width = Target->getLongFractWidth();
	      Align = Target->getLongFractAlign();
	      break;
	    case BuiltinType::BFloat16:
	      Width = Target->getBFloat16Width();
	      Align = Target->getBFloat16Align();
	      break;
	    case BuiltinType::Float16:
	    case BuiltinType::Half:
	      // Use the auxiliary target only for OpenMP device compilation on a
	      // target without a Float16 type.
	      if (Target->hasFloat16Type() || !getLangOpts().OpenMP ||
	          !getLangOpts().OpenMPIsDevice) {
	        Width = Target->getHalfWidth();
	        Align = Target->getHalfAlign();
	      } else {
	        assert(getLangOpts().OpenMP && getLangOpts().OpenMPIsDevice &&
	               "Expected OpenMP device compilation.");
	        Width = AuxTarget->getHalfWidth();
	        Align = AuxTarget->getHalfAlign();
	      }
	      break;
	    case BuiltinType::Float:
	      Width = Target->getFloatWidth();
	      Align = Target->getFloatAlign();
	      break;
	    case BuiltinType::Double:
	      Width = Target->getDoubleWidth();
	      Align = Target->getDoubleAlign();
	      break;
	    case BuiltinType::LongDouble:
	      // For OpenMP device compilation, prefer the auxiliary target's
	      // values when they differ from the target's.
	      if (getLangOpts().OpenMP && getLangOpts().OpenMPIsDevice &&
	          (Target->getLongDoubleWidth() != AuxTarget->getLongDoubleWidth() ||
	           Target->getLongDoubleAlign() != AuxTarget->getLongDoubleAlign())) {
	        Width = AuxTarget->getLongDoubleWidth();
	        Align = AuxTarget->getLongDoubleAlign();
	      } else {
	        Width = Target->getLongDoubleWidth();
	        Align = Target->getLongDoubleAlign();
	      }
	      break;
	    case BuiltinType::Float128:
	      // Use the auxiliary target only for OpenMP device compilation on a
	      // target without a Float128 type.
	      if (Target->hasFloat128Type() || !getLangOpts().OpenMP ||
	          !getLangOpts().OpenMPIsDevice) {
	        Width = Target->getFloat128Width();
	        Align = Target->getFloat128Align();
	      } else {
	        assert(getLangOpts().OpenMP && getLangOpts().OpenMPIsDevice &&
	               "Expected OpenMP device compilation.");
	        Width = AuxTarget->getFloat128Width();
	        Align = AuxTarget->getFloat128Align();
	      }
	      break;
	    case BuiltinType::NullPtr:
	      Width = Target->getPointerWidth(0); // C++ 3.9.1p11: sizeof(nullptr_t)
	      Align = Target->getPointerAlign(0); //   == sizeof(void*)
	      break;
	    case BuiltinType::ObjCId:
	    case BuiltinType::ObjCClass:
	    case BuiltinType::ObjCSel:
	      Width = Target->getPointerWidth(0);
	      Align = Target->getPointerAlign(0);
	      break;
	    case BuiltinType::OCLSampler:
	    case BuiltinType::OCLEvent:
	    case BuiltinType::OCLClkEvent:
	    case BuiltinType::OCLQueue:
	    case BuiltinType::OCLReserveID:
	#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
	    case BuiltinType::Id:
	#include "clang/Basic/OpenCLImageTypes.def"
	#define EXT_OPAQUE_TYPE(ExtType, Id, Ext) \
	  case BuiltinType::Id:
	#include "clang/Basic/OpenCLExtensionTypes.def"
	      // These are sized as pointers in the address space the target
	      // assigns to the OpenCL type kind.
	      AS = getTargetAddressSpace(
	          Target->getOpenCLTypeAddrSpace(getOpenCLTypeKind(T)));
	      Width = Target->getPointerWidth(AS);
	      Align = Target->getPointerAlign(AS);
	      break;
	    // The SVE types are effectively target-specific.  The length of an
	    // SVE_VECTOR_TYPE is only known at runtime, but it is always a multiple
	    // of 128 bits.  There is one predicate bit for each vector byte, so the
	    // length of an SVE_PREDICATE_TYPE is always a multiple of 16 bits.
	    //
	    // Because the length is only known at runtime, we use a dummy value
	    // of 0 for the static length.  The alignment values are those defined
	    // by the Procedure Call Standard for the Arm Architecture.
	#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId, NumEls, ElBits,    \
	                        IsSigned, IsFP, IsBF)                                  \
	  case BuiltinType::Id:                                                        \
	    Width = 0;                                                                 \
	    Align = 128;                                                               \
	    break;
	#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId, NumEls)         \
	  case BuiltinType::Id:                                                        \
	    Width = 0;                                                                 \
	    Align = 16;                                                                \
	    break;
	#include "clang/Basic/AArch64SVEACLETypes.def"
	    }
	    break;
	  case Type::ObjCObjectPointer:
	    Width = Target->getPointerWidth(0);
	    Align = Target->getPointerAlign(0);
	    break;
	  case Type::BlockPointer:
	    AS = getTargetAddressSpace(cast<BlockPointerType>(T)->getPointeeType());
	    Width = Target->getPointerWidth(AS);
	    Align = Target->getPointerAlign(AS);
	    break;
	  case Type::LValueReference:
	  case Type::RValueReference:
	    // alignof and sizeof should never enter this code path here, so we go
	    // the pointer route.
	    AS = getTargetAddressSpace(cast<ReferenceType>(T)->getPointeeType());
	    Width = Target->getPointerWidth(AS);
	    Align = Target->getPointerAlign(AS);
	    break;
	  case Type::Pointer:
	    AS = getTargetAddressSpace(cast<PointerType>(T)->getPointeeType());
	    Width = Target->getPointerWidth(AS);
	    Align = Target->getPointerAlign(AS);
	    break;
	  case Type::MemberPointer: {
	    const auto *MPT = cast<MemberPointerType>(T);
	    CXXABI::MemberPointerInfo MPI = ABI->getMemberPointerInfo(MPT);
	    Width = MPI.Width;
	    Align = MPI.Align;
	    break;
	  }
	  case Type::Complex: {
	    // Complex types have the same alignment as their elements, but twice the
	    // size.
	    TypeInfo EltInfo = getTypeInfo(cast<ComplexType>(T)->getElementType());
	    Width = EltInfo.Width * 2;
	    Align = EltInfo.Align;
	    break;
	  }
	  case Type::ObjCObject:
	    return getTypeInfo(cast<ObjCObjectType>(T)->getBaseType().getTypePtr());
	  case Type::Adjusted:
	  case Type::Decayed:
	    return getTypeInfo(cast<AdjustedType>(T)->getAdjustedType().getTypePtr());
	  case Type::ObjCInterface: {
	    const auto *ObjCI = cast<ObjCInterfaceType>(T);
	    // Invalid declarations get a placeholder size and alignment of 8 bits.
	    if (ObjCI->getDecl()->isInvalidDecl()) {
	      Width = 8;
	      Align = 8;
	      break;
	    }
	    const ASTRecordLayout &Layout = getASTObjCInterfaceLayout(ObjCI->getDecl());
	    Width = toBits(Layout.getSize());
	    Align = toBits(Layout.getAlignment());
	    break;
	  }
	  case Type::ExtInt: {
	    const auto *EIT = cast<ExtIntType>(T);
	    // Alignment is the bit count rounded up to a power of two, at least
	    // the char width and at most the long long alignment.
	    Align =
	        std::min(static_cast<unsigned>(std::max(
	                     getCharWidth(), llvm::PowerOf2Ceil(EIT->getNumBits()))),
	                 Target->getLongLongAlign());
	    Width = llvm::alignTo(EIT->getNumBits(), Align);
	    break;
	  }
	  case Type::Record:
	  case Type::Enum: {
	    const auto *TT = cast<TagType>(T);

	    // Invalid declarations get a placeholder size and alignment of 8 bits.
	    if (TT->getDecl()->isInvalidDecl()) {
	      Width = 8;
	      Align = 8;
	      break;
	    }

	    if (const auto *ET = dyn_cast<EnumType>(TT)) {
	      const EnumDecl *ED = ET->getDecl();
	      TypeInfo Info =
	          getTypeInfo(ED->getIntegerType()->getUnqualifiedDesugaredType());
	      if (unsigned AttrAlign = ED->getMaxAlignment()) {
	        Info.Align = AttrAlign;
	        Info.AlignIsRequired = true;
	      }
	      return Info;
	    }

	    const auto *RT = cast<RecordType>(TT);
	    const RecordDecl *RD = RT->getDecl();
	    const ASTRecordLayout &Layout = getASTRecordLayout(RD);
	    Width = toBits(Layout.getSize());
	    Align = toBits(Layout.getAlignment());
	    AlignIsRequired = RD->hasAttr<AlignedAttr>();
	    break;
	  }

	  case Type::SubstTemplateTypeParm:
	    return getTypeInfo(cast<SubstTemplateTypeParmType>(T)->
	                       getReplacementType().getTypePtr());

	  case Type::Auto:
	  case Type::DeducedTemplateSpecialization: {
	    const auto *A = cast<DeducedType>(T);
	    assert(!A->getDeducedType().isNull() &&
	           "cannot request the size of an undeduced or dependent auto type");
	    return getTypeInfo(A->getDeducedType().getTypePtr());
	  }

	  case Type::Paren:
	    return getTypeInfo(cast<ParenType>(T)->getInnerType().getTypePtr());

	  case Type::MacroQualified:
	    return getTypeInfo(
	        cast<MacroQualifiedType>(T)->getUnderlyingType().getTypePtr());

	  case Type::ObjCTypeParam:
	    return getTypeInfo(cast<ObjCTypeParamType>(T)->desugar().getTypePtr());

	  case Type::Typedef: {
	    const TypedefNameDecl *Typedef = cast<TypedefType>(T)->getDecl();
	    TypeInfo Info = getTypeInfo(Typedef->getUnderlyingType().getTypePtr());
	    // If the typedef has an aligned attribute on it, it overrides any computed
	    // alignment we have.  This violates the GCC documentation (which says that
	    // attribute(aligned) can only round up) but matches its implementation.
	    if (unsigned AttrAlign = Typedef->getMaxAlignment()) {
	      Align = AttrAlign;
	      AlignIsRequired = true;
	    } else {
	      Align = Info.Align;
	      AlignIsRequired = Info.AlignIsRequired;
	    }
	    Width = Info.Width;
	    break;
	  }

	  case Type::Elaborated:
	    return getTypeInfo(cast<ElaboratedType>(T)->getNamedType().getTypePtr());

	  case Type::Attributed:
	    return getTypeInfo(
	                  cast<AttributedType>(T)->getEquivalentType().getTypePtr());

	  case Type::Atomic: {
	    // Start with the base type information.
	    TypeInfo Info = getTypeInfo(cast<AtomicType>(T)->getValueType());
	    Width = Info.Width;
	    Align = Info.Align;

	    if (!Width) {
	      // An otherwise zero-sized type should still generate an
	      // atomic operation.
	      Width = Target->getCharWidth();
	      assert(Align);
	    } else if (Width <= Target->getMaxAtomicPromoteWidth()) {
	      // If the size of the type doesn't exceed the platform's max
	      // atomic promotion width, make the size and alignment more
	      // favorable to atomic operations:

	      // Round the size up to a power of 2.
	      if (!llvm::isPowerOf2_64(Width))
	        Width = llvm::NextPowerOf2(Width);

	      // Set the alignment equal to the size.
	      Align = static_cast<unsigned>(Width);
	    }
	  }
	  break;

	  case Type::Pipe:
	    Width = Target->getPointerWidth(getTargetAddressSpace(LangAS::opencl_global));
	    Align = Target->getPointerAlign(getTargetAddressSpace(LangAS::opencl_global));
	    break;
	  }

	  assert(llvm::isPowerOf2_32(Align) && "Alignment must be power of 2");
	  return TypeInfo(Width, Align, AlignIsRequired);
	}

	unsigned ASTContext::getTypeUnadjustedAlign(const Type *T) const {
	UnadjustedAlignMap::iterator I = MemoizedUnadjustedAlign.find(T);
	if (I != MemoizedUnadjustedAlign.end())
	return I->second;

	unsigned UnadjustedAlign;
	if (const auto *RT = T->getAs<RecordType>()) {
	const RecordDecl *RD = RT->getDecl();
	const ASTRecordLayout &Layout = getASTRecordLayout(RD);
	UnadjustedAlign = toBits(Layout.getUnadjustedAlignment());
	} else if (const auto *ObjCI = T->getAs<ObjCInterfaceType>()) {
	const ASTRecordLayout &Layout = getASTObjCInterfaceLayout(ObjCI->getDecl());
	UnadjustedAlign = toBits(Layout.getUnadjustedAlignment());
	} else {
	UnadjustedAlign = getTypeAlign(T->getUnqualifiedDesugaredType());
	}

	MemoizedUnadjustedAlign[T] = UnadjustedAlign;
	return UnadjustedAlign;
	}

	/// Return the default simd alignment for \p T.
	///
	/// This is the target's default, except that it is 256 for the builtin
	/// double type on ppc64/ppc64le with the "elfv1-qpx" ABI.
	unsigned ASTContext::getOpenMPDefaultSimdAlign(QualType T) const {
	  unsigned SimdAlign = getTargetInfo().getSimdDefaultAlign();
	  // Target ppc64 with QPX: simd default alignment for pointer to double is 32.
	  if ((getTargetInfo().getTriple().getArch() == llvm::Triple::ppc64 ||
	       getTargetInfo().getTriple().getArch() == llvm::Triple::ppc64le) &&
	      getTargetInfo().getABI() == "elfv1-qpx" &&
	      T->isSpecificBuiltinType(BuiltinType::Double))
	    SimdAlign = 256;
	  return SimdAlign;
	}

	/// toCharUnitsFromBits - Convert a size in bits to a size in characters by
	/// dividing it by the character width.
	CharUnits ASTContext::toCharUnitsFromBits(int64_t BitSize) const {
	  const auto Chars = BitSize / getCharWidth();
	  return CharUnits::fromQuantity(Chars);
	}

	/// toBits - Convert a size in characters to a size in bits.
	int64_t ASTContext::toBits(CharUnits CharSize) const {
	return CharSize.getQuantity() * getCharWidth();
	}

	/// getTypeSizeInChars - Return the size of the specified type, in
	/// characters (the first element of getTypeInfoInChars).
	/// This method does not work on incomplete types.
	CharUnits ASTContext::getTypeSizeInChars(QualType T) const {
	  const std::pair<CharUnits, CharUnits> SizeAndAlign = getTypeInfoInChars(T);
	  return SizeAndAlign.first;
	}
	/// `const Type *` overload: the first element of getTypeInfoInChars.
	CharUnits ASTContext::getTypeSizeInChars(const Type *T) const {
	  const std::pair<CharUnits, CharUnits> SizeAndAlign = getTypeInfoInChars(T);
	  return SizeAndAlign.first;
	}

	/// getTypeAlignInChars - Return the ABI-specified alignment of a type, in
	/// characters, converted from the bit value of getTypeAlign.
	/// This method does not work on incomplete types.
	CharUnits ASTContext::getTypeAlignInChars(QualType T) const {
	  const unsigned AlignInBits = getTypeAlign(T);
	  return toCharUnitsFromBits(AlignInBits);
	}
	/// `const Type *` overload: getTypeAlign converted from bits to characters.
	CharUnits ASTContext::getTypeAlignInChars(const Type *T) const {
	  const unsigned AlignInBits = getTypeAlign(T);
	  return toCharUnitsFromBits(AlignInBits);
	}

	/// getTypeUnadjustedAlignInChars - Return the ABI-specified alignment of a
	/// type, in characters, before alignment adjustments. This method does
	/// not work on incomplete types.
	CharUnits ASTContext::getTypeUnadjustedAlignInChars(QualType T) const {
	  const unsigned AlignInBits = getTypeUnadjustedAlign(T);
	  return toCharUnitsFromBits(AlignInBits);
	}
	/// `const Type *` overload: getTypeUnadjustedAlign converted from bits to
	/// characters.
	CharUnits ASTContext::getTypeUnadjustedAlignInChars(const Type *T) const {
	  const unsigned AlignInBits = getTypeUnadjustedAlign(T);
	  return toCharUnitsFromBits(AlignInBits);
	}

	/// getPreferredTypeAlign - Return the "preferred" alignment of the specified
	/// type for the current target in bits.  This can be different than the ABI
	/// alignment in cases where it is beneficial for performance to overalign
	/// a data type.
	///
	/// The ABI alignment is returned unchanged unless the target allows a
	/// larger preferred alignment, the base element type (looking through
	/// complex and enum types) is double, long long or unsigned long long, and
	/// the alignment is not marked as required.
	unsigned ASTContext::getPreferredTypeAlign(const Type *T) const {
	  TypeInfo TI = getTypeInfo(T);
	  unsigned ABIAlign = TI.Align;

	  T = T->getBaseElementTypeUnsafe();

	  // The preferred alignment of member pointers is that of a pointer.
	  if (T->isMemberPointerType())
	    return getPreferredTypeAlign(getPointerDiffType().getTypePtr());

	  if (!Target->allowsLargerPreferedTypeAlignment())
	    return ABIAlign;

	  // Double and long long should be naturally aligned if possible.
	  if (const auto *CT = T->getAs<ComplexType>())
	    T = CT->getElementType().getTypePtr();
	  if (const auto *ET = T->getAs<EnumType>())
	    T = ET->getDecl()->getIntegerType().getTypePtr();
	  if (T->isSpecificBuiltinType(BuiltinType::Double) ||
	      T->isSpecificBuiltinType(BuiltinType::LongLong) ||
	      T->isSpecificBuiltinType(BuiltinType::ULongLong))
	    // Don't increase the alignment if an alignment attribute was specified on a
	    // typedef declaration.
	    if (!TI.AlignIsRequired)
	      return std::max(ABIAlign, static_cast<unsigned>(getTypeSize(T)));

	  return ABIAlign;
	}

	/// getTargetDefaultAlignForAttributeAligned - Return the default alignment
	/// for __attribute__((aligned)) on this target, to be used if no alignment
	/// value is specified. The value comes straight from the target info.
	unsigned ASTContext::getTargetDefaultAlignForAttributeAligned() const {
	  const unsigned DefaultAlign =
	      getTargetInfo().getDefaultAlignForAttributeAligned();
	  return DefaultAlign;
	}

	/// getAlignOfGlobalVar - Return the alignment in bits that should be given
	/// to a global variable of the specified type: the larger of the type's
	/// alignment and the target's minimum global alignment for its size.
	unsigned ASTContext::getAlignOfGlobalVar(QualType T) const {
	  const uint64_t SizeInBits = getTypeSize(T.getTypePtr());
	  const unsigned TypeAlign = getTypeAlign(T);
	  const unsigned MinGlobalAlign = getTargetInfo().getMinGlobalAlign(SizeInBits);
	  return std::max(TypeAlign, MinGlobalAlign);
	}

	/// getAlignOfGlobalVarInChars - Return the alignment in characters that
	/// should be given to a global variable of the specified type, converted
	/// from the bit value of getAlignOfGlobalVar.
	CharUnits ASTContext::getAlignOfGlobalVarInChars(QualType T) const {
	  const unsigned AlignInBits = getAlignOfGlobalVar(T);
	  return toCharUnitsFromBits(AlignInBits);
	}

	/// Sum the base-class offsets along the chain of bases that share their
	/// vbptr, starting at \p RD and stopping at the first layout that reports
	/// no such base.
	CharUnits ASTContext::getOffsetOfBaseWithVBPtr(const CXXRecordDecl *RD) const {
	  CharUnits Total = CharUnits::Zero();
	  const ASTRecordLayout *Current = &getASTRecordLayout(RD);
	  for (;;) {
	    const CXXRecordDecl *SharingBase = Current->getBaseSharingVBPtr();
	    if (!SharingBase)
	      break;
	    Total += Current->getBaseClassOffset(SharingBase);
	    Current = &getASTRecordLayout(SharingBase);
	  }
	  return Total;
	}

	/// DeepCollectObjCIvars -
	/// This routine first collects all declared, but not synthesized, ivars in
	/// super class and then collects all ivars, including those synthesized for
	/// current class. This routine is used for implementation of current class
	/// when all ivars, declared and synthesized are known.
	void ASTContext::DeepCollectObjCIvars(const ObjCInterfaceDecl *OI,
	bool leafClass,
	SmallVectorImpl<const ObjCIvarDecl*> &Ivars) const {
	if (const ObjCInterfaceDecl *SuperClass = OI->getSuperClass())
	DeepCollectObjCIvars(SuperClass, false, Ivars);
	if (!leafClass) {
	for (const auto *I : OI->ivars())
	Ivars.push_back(I);
	} else {
	auto IDecl = const_cast<ObjCInterfaceDecl >(OI);
	for (const ObjCIvarDecl *Iv = IDecl->all_declared_ivar_begin(); Iv;
	Iv= Iv->getNextIvar())
	Ivars.push_back(Iv);
	}
	}

	/// CollectInheritedProtocols - Collect all protocols in current class and
	/// those inherited by it.
	void ASTContext::CollectInheritedProtocols(const Decl *CDecl,
	llvm::SmallPtrSet<ObjCProtocolDecl*, 8> &Protocols) {
	if (const auto *OI = dyn_cast<ObjCInterfaceDecl>(CDecl)) {
	// We can use protocol_iterator here instead of
	// all_referenced_protocol_iterator since we are walking all categories.
	for (auto *Proto : OI->all_referenced_protocols()) {
	CollectInheritedProtocols(Proto, Protocols);
	}

	// Categories of this Interface.
	for (const auto *Cat : OI->visible_categories())
	CollectInheritedProtocols(Cat, Protocols);

	if (ObjCInterfaceDecl *SD = OI->getSuperClass())
	while (SD) {
	CollectInheritedProtocols(SD, Protocols);
	SD = SD->getSuperClass();
	}
	} else if (const auto *OC = dyn_cast<ObjCCategoryDecl>(CDecl)) {
	for (auto *Proto : OC->protocols()) {
	CollectInheritedProtocols(Proto, Protocols);
	}
	} else if (const auto *OP = dyn_cast<ObjCProtocolDecl>(CDecl)) {
	// Insert the protocol.
	if (!Protocols.insert(
	const_cast<ObjCProtocolDecl *>(OP->getCanonicalDecl())).second)
	return;

	for (auto *Proto : OP->protocols())
	CollectInheritedProtocols(Proto, Protocols);
	}
	}

	static bool unionHasUniqueObjectRepresentations(const ASTContext &Context,
	const RecordDecl *RD) {
	assert(RD->isUnion() && "Must be union type");
	CharUnits UnionSize = Context.getTypeSizeInChars(RD->getTypeForDecl());

	for (const auto *Field : RD->fields()) {
	if (!Context.hasUniqueObjectRepresentations(Field->getType()))
	return false;
	CharUnits FieldSize = Context.getTypeSizeInChars(Field->getType());
	if (FieldSize != UnionSize)
	return false;
	}
	return !RD->field_empty();
	}

	/// Return true when the record type \p Ty has no fields and, if it is a C++
	/// class, CXXRecordDecl::isEmpty() also holds for it.
	static bool isStructEmpty(QualType Ty) {
	  const RecordDecl *Record = Ty->castAs<RecordType>()->getDecl();

	  if (Record->field_empty()) {
	    const auto *CXXRecord = dyn_cast<CXXRecordDecl>(Record);
	    return !CXXRecord || CXXRecord->isEmpty();
	  }

	  return false;
	}

	/// Check that the non-union record \p RD is laid out without gaps and that
	/// all of its pieces have unique object representations.
	///
	/// \returns the bit offset just past the last base/field when every non-empty
	/// base and every field starts exactly where the previous one ended, or
	/// llvm::None as soon as a gap, a dynamic class, or a non-unique member is
	/// found. The caller compares the returned value against the full type size.
	static llvm::Optional<int64_t>
	structHasUniqueObjectRepresentations(const ASTContext &Context,
	const RecordDecl *RD) {
	assert(!RD->isUnion() && "Must be struct/class type");
	const auto &Layout = Context.getASTRecordLayout(RD);

	// Running end offset (in bits) of the contiguous region verified so far.
	int64_t CurOffsetInBits = 0;
	if (const auto *ClassDecl = dyn_cast<CXXRecordDecl>(RD)) {
	// Dynamic classes are rejected outright.
	if (ClassDecl->isDynamicClass())
	return llvm::None;

	// Non-empty bases paired with the size (in bits) computed recursively.
	SmallVector<std::pair<QualType, int64_t>, 4> Bases;
	for (const auto &Base : ClassDecl->bases()) {
	// Empty types can be inherited from, and non-empty types can potentially
	// have tail padding, so just make sure there isn't an error.
	if (!isStructEmpty(Base.getType())) {
	llvm::Optional<int64_t> Size = structHasUniqueObjectRepresentations(
	Context, Base.getType()->castAs<RecordType>()->getDecl());
	if (!Size)
	return llvm::None;
	Bases.emplace_back(Base.getType(), Size.getValue());
	}
	}

	// Order the bases by their offset in this record's layout so that the
	// contiguity check below visits them in memory order.
	llvm::sort(Bases, [&](const std::pair<QualType, int64_t> &L,
	const std::pair<QualType, int64_t> &R) {
	return Layout.getBaseClassOffset(L.first->getAsCXXRecordDecl()) <
	Layout.getBaseClassOffset(R.first->getAsCXXRecordDecl());
	});

	// Each base must begin exactly where the previous one ended.
	for (const auto &Base : Bases) {
	int64_t BaseOffset = Context.toBits(
	Layout.getBaseClassOffset(Base.first->getAsCXXRecordDecl()));
	int64_t BaseSize = Base.second;
	if (BaseOffset != CurOffsetInBits)
	return llvm::None;
	CurOffsetInBits = BaseOffset + BaseSize;
	}
	}

	for (const auto *Field : RD->fields()) {
	// Reference-typed fields skip the uniqueness query; every other field
	// must have unique object representations.
	if (!Field->getType()->isReferenceType() &&
	!Context.hasUniqueObjectRepresentations(Field->getType()))
	return llvm::None;

	int64_t FieldSizeInBits =
	Context.toBits(Context.getTypeSizeInChars(Field->getType()));
	if (Field->isBitField()) {
	int64_t BitfieldSize = Field->getBitWidthValue(Context);

	// A bit-field wider than its declared type is rejected; otherwise only
	// the declared bit width counts towards the running offset.
	if (BitfieldSize > FieldSizeInBits)
	return llvm::None;
	FieldSizeInBits = BitfieldSize;
	}

	int64_t FieldOffsetInBits = Context.getFieldOffset(Field);

	// Any difference between the expected and actual offset means a gap.
	if (FieldOffsetInBits != CurOffsetInBits)
	return llvm::None;

	CurOffsetInBits = FieldSizeInBits + FieldOffsetInBits;
	}

	return CurOffsetInBits;
	}

	/// Implements the has_unique_object_representations predicate for \p Ty.
	bool ASTContext::hasUniqueObjectRepresentations(QualType Ty) const {
	  // C++17 [meta.unary.prop]:
	  //   The predicate condition for a template specialization
	  //   has_unique_object_representations<T> shall be satisfied if and only if:
	  //     (9.1) - T is trivially copyable, and
	  //     (9.2) - any two objects of type T with the same value have the same
	  //             object representation, where two objects of array or
	  //             non-union class type are considered to have the same value
	  //             if their respective sequences of direct subobjects have the
	  //             same values, and two objects of union type are considered to
	  //             have the same value if they have the same active member and
	  //             the corresponding members have the same value.
	  //   The set of scalar types for which this condition holds is
	  //   implementation-defined. [ Note: If a type has padding bits, the
	  //   condition does not hold; otherwise, the condition holds true for
	  //   unsigned integral types. -- end note ]
	  assert(!Ty.isNull() && "Null QualType sent to unique object rep check");

	  // An array qualifies exactly when its base element type does.
	  if (Ty->isArrayType())
	    return hasUniqueObjectRepresentations(getBaseElementType(Ty));

	  // (9.1) - everything else has to be trivially copyable.
	  if (!Ty.isTriviallyCopyableType(*this))
	    return false;

	  // Integrals, enums and pointers are all treated as unique.
	  if (Ty->isIntegralOrEnumerationType() || Ty->isPointerType())
	    return true;

	  // Member pointers qualify unless the ABI reports padding for them.
	  if (const auto *MPT = Ty->getAs<MemberPointerType>())
	    return !ABI->getMemberPointerInfo(MPT).HasPadding;

	  // FIXME: More cases to handle here (list by rsmith):
	  // vectors (careful about, eg, vector of 3 foo)
	  // _Complex int and friends
	  // _Atomic T
	  // Obj-C block pointers
	  // Obj-C object pointers
	  // and perhaps OpenCL's various builtin types (pipe, sampler_t, event_t,
	  // clk_event_t, queue_t, reserve_id_t)
	  // There're also Obj-C class types and the Obj-C selector type, but I think it
	  // makes sense for those to return false here.
	  if (!Ty->isRecordType())
	    return false;

	  const RecordDecl *Record = Ty->castAs<RecordType>()->getDecl();
	  if (Record->isInvalidDecl())
	    return false;

	  if (Record->isUnion())
	    return unionHasUniqueObjectRepresentations(*this, Record);

	  // For structs/classes the contiguous region found by the helper must span
	  // the entire type.
	  Optional<int64_t> StructSize =
	      structHasUniqueObjectRepresentations(*this, Record);
	  if (!StructSize)
	    return false;
	  return StructSize.getValue() == static_cast<int64_t>(getTypeSize(Ty));
	}

	unsigned ASTContext::CountNonClassIvars(const ObjCInterfaceDecl *OI) const {
	unsigned count = 0;
	// Count ivars declared in class extension.
	for (const auto *Ext : OI->known_extensions())
	count += Ext->ivar_size();

	// Count ivar defined in this class's implementation. This
	// includes synthesized ivars.
	if (ObjCImplementationDecl *ImplDecl = OI->getImplementation())
	count += ImplDecl->ivar_size();

	return count;
	}

	bool ASTContext::isSentinelNullExpr(const Expr *E) {
	if (!E)
	return false;

	// nullptr_t is always treated as null.
	if (E->getType()->isNullPtrType()) return true;

	if (E->getType()->isAnyPointerType() &&
	E->IgnoreParenCasts()->isNullPointerConstant(*this,
	Expr::NPC_ValueDependentIsNull))
	return true;

	// Unfortunately, __null has type 'int'.
	if (isa<GNUNullExpr>(E)) return true;

	return false;
	}

	/// Get the implementation of ObjCInterfaceDecl, or nullptr if none
	/// exists.
	///
	/// \param D the interface to look up in ObjCImpls.
	/// \returns the recorded implementation cast to ObjCImplementationDecl, or
	/// nullptr when ObjCImpls has no entry for \p D.
	ObjCImplementationDecl *ASTContext::getObjCImplementation(ObjCInterfaceDecl *D) {
	  auto I = ObjCImpls.find(D);
	  if (I != ObjCImpls.end())
	    return cast<ObjCImplementationDecl>(I->second);
	  return nullptr;
	}

	/// Get the implementation of ObjCCategoryDecl, or nullptr if none
	/// exists.
	///
	/// \param D the category to look up in ObjCImpls.
	/// \returns the recorded implementation cast to ObjCCategoryImplDecl, or
	/// nullptr when ObjCImpls has no entry for \p D.
	ObjCCategoryImplDecl *ASTContext::getObjCImplementation(ObjCCategoryDecl *D) {
	  auto I = ObjCImpls.find(D);
	  if (I != ObjCImpls.end())
	    return cast<ObjCCategoryImplDecl>(I->second);
	  return nullptr;
	}

	/// Record \p ImplD in ObjCImpls as the implementation of the interface
	/// \p IFaceD, replacing any earlier entry. Both pointers must be non-null.
	void ASTContext::setObjCImplementation(ObjCInterfaceDecl *IFaceD,
	                                       ObjCImplementationDecl *ImplD) {
	  assert(IFaceD && ImplD && "Passed null params");

	  ObjCImpls[IFaceD] = ImplD;
	}

	/// Record \p ImplD in ObjCImpls as the implementation of the category
	/// \p CatD, replacing any earlier entry. Both pointers must be non-null.
	void ASTContext::setObjCImplementation(ObjCCategoryDecl *CatD,
	                                       ObjCCategoryImplDecl *ImplD) {
	  assert(CatD && ImplD && "Passed null params");

	  ObjCImpls[CatD] = ImplD;
	}

	/// Return what ObjCMethodRedecls.lookup() yields for \p MD, i.e. the
	/// redeclaration previously registered through setObjCMethodRedeclaration.
	const ObjCMethodDecl *
	ASTContext::getObjCMethodRedeclaration(const ObjCMethodDecl *MD) const {
	  const ObjCMethodDecl *Recorded = ObjCMethodRedecls.lookup(MD);
	  return Recorded;
	}

	/// Register \p Redecl as the redeclaration of \p MD. \p MD must not already
	/// have a redeclaration recorded.
	void ASTContext::setObjCMethodRedeclaration(const ObjCMethodDecl *MD,
	                                            const ObjCMethodDecl *Redecl) {
	  assert(!getObjCMethodRedeclaration(MD) && "MD already has a redeclaration");

	  ObjCMethodRedecls[MD] = Redecl;
	}

	/// Return the Objective-C interface associated with the declaration context
	/// of \p ND: the context itself if it is an interface, or the class interface
	/// of the enclosing category or implementation. Returns nullptr for any other
	/// kind of context.
	const ObjCInterfaceDecl *ASTContext::getObjContainingInterface(
	                                                const NamedDecl *ND) const {
	  const auto *DC = ND->getDeclContext();

	  if (const auto *Iface = dyn_cast<ObjCInterfaceDecl>(DC))
	    return Iface;
	  if (const auto *Category = dyn_cast<ObjCCategoryDecl>(DC))
	    return Category->getClassInterface();
	  if (const auto *Impl = dyn_cast<ObjCImplDecl>(DC))
	    return Impl->getClassInterface();

	  return nullptr;
	}

	/// Look up the copy-initialization information recorded for the __block
	/// variable \p VD. When nothing has been recorded the result is
	/// {nullptr, false}.
	BlockVarCopyInit ASTContext::getBlockVarCopyInit(const VarDecl *VD) const {
	  assert(VD && "Passed null params");
	  assert(VD->hasAttr<BlocksAttr>() &&
	         "getBlockVarCopyInits - not __block var");

	  auto Found = BlockVarCopyInits.find(VD);
	  if (Found == BlockVarCopyInits.end())
	    return {nullptr, false};
	  return Found->second;
	}

	/// Set the copy initialization expression of a block var decl.
	void ASTContext::setBlockVarCopyInit(const VarDeclVD, Expr CopyExpr,
	bool CanThrow) {
	assert(VD && CopyExpr && "Passed null params");
	assert(VD->hasAttr<BlocksAttr>() &&
	"setBlockVarCopyInits - not __block var");
	BlockVarCopyInits[VD].setExprAndFlag(CopyExpr, CanThrow);
	}

	/// Allocate a TypeSourceInfo for \p T from BumpAlloc, followed by
	/// \p DataSize bytes of TypeLoc data. A \p DataSize of zero means "compute it
	/// from \p T"; a non-zero value must equal that computed size.
	TypeSourceInfo *ASTContext::CreateTypeSourceInfo(QualType T,
	                                                 unsigned DataSize) const {
	  if (DataSize) {
	    assert(DataSize == TypeLoc::getFullDataSizeForType(T) &&
	           "incorrect data size provided to CreateTypeSourceInfo!");
	  } else {
	    DataSize = TypeLoc::getFullDataSizeForType(T);
	  }

	  // One allocation holds the object and its trailing data, 8-byte aligned.
	  void *Storage = BumpAlloc.Allocate(sizeof(TypeSourceInfo) + DataSize, 8);
	  return new (Storage) TypeSourceInfo(T);
	}

	/// Create a TypeSourceInfo for \p T and initialize its TypeLoc with the
	/// single source location \p L.
	TypeSourceInfo *ASTContext::getTrivialTypeSourceInfo(QualType T,
	                                                     SourceLocation L) const {
	  TypeSourceInfo *TSI = CreateTypeSourceInfo(T);
	  // initialize() takes a mutable context, hence the const_cast.
	  auto &MutableThis = const_cast<ASTContext &>(*this);
	  TSI->getTypeLoc().initialize(MutableThis, L);
	  return TSI;
	}

	/// Return the layout getObjCLayout() computes for the interface \p D when no
	/// implementation is supplied.
	const ASTRecordLayout &
	ASTContext::getASTObjCInterfaceLayout(const ObjCInterfaceDecl *D) const {
	  const ASTRecordLayout &Layout = getObjCLayout(D, nullptr);
	  return Layout;
	}

	/// Return the layout getObjCLayout() computes for the class interface of
	/// \p D, with \p D itself passed along as the implementation.
	const ASTRecordLayout &
	ASTContext::getASTObjCImplementationLayout(
	                                        const ObjCImplementationDecl *D) const {
	  const auto *Iface = D->getClassInterface();
	  return getObjCLayout(Iface, D);
	}

	//===----------------------------------------------------------------------===//
	// Type creation/memoization methods
	//===----------------------------------------------------------------------===//

	/// Return the uniqued type formed by wrapping \p baseType in an ExtQuals node
	/// carrying \p quals. The fast qualifiers are split off and stored in the
	/// returned QualType; only the remaining qualifiers go into the ExtQuals
	/// node that is profiled and cached in ExtQualNodes.
	QualType
	ASTContext::getExtQualType(const Type *baseType, Qualifiers quals) const {
	  unsigned fastQuals = quals.getFastQualifiers();
	  quals.removeFastQualifiers();

	  // Check if we've already instantiated this type.
	  llvm::FoldingSetNodeID ID;
	  ExtQuals::Profile(ID, baseType, quals);
	  void *insertPos = nullptr;
	  if (ExtQuals *eq = ExtQualNodes.FindNodeOrInsertPos(ID, insertPos)) {
	    assert(eq->getQualifiers() == quals);
	    return QualType(eq, fastQuals);
	  }

	  // If the base type is not canonical, make the appropriate canonical type.
	  QualType canon;
	  if (!baseType->isCanonicalUnqualified()) {
	    SplitQualType canonSplit = baseType->getCanonicalTypeInternal().split();
	    canonSplit.Quals.addConsistentQualifiers(quals);
	    canon = getExtQualType(canonSplit.Ty, canonSplit.Quals);

	    // Re-find the insert position: the recursive call above may have
	    // inserted into ExtQualNodes.
	    (void) ExtQualNodes.FindNodeOrInsertPos(ID, insertPos);
	  }

	  auto *eq = new (*this, TypeAlignment) ExtQuals(baseType, canon, quals);
	  ExtQualNodes.InsertNode(eq, insertPos);
	  return QualType(eq, fastQuals);
	}

	/// Return \p T qualified with the address space \p AddressSpace. If the
	/// canonical form of \p T is already in that address space, \p T is returned
	/// unchanged. \p T must not already carry an address space qualifier.
	QualType ASTContext::getAddrSpaceQualType(QualType T,
	                                          LangAS AddressSpace) const {
	  if (getCanonicalType(T).getAddressSpace() == AddressSpace)
	    return T;

	  // Collect the existing qualifiers so that everything ends up merged into a
	  // single ExtQuals node.
	  QualifierCollector Collected;
	  const Type *Stripped = Collected.strip(T);

	  // A type that already names an address space cannot be given another.
	  assert(!Collected.hasAddressSpace() &&
	         "Type cannot be in multiple addr spaces!");
	  Collected.addAddressSpace(AddressSpace);

	  return getExtQualType(Stripped, Collected);
	}

	/// Return \p T with its address space qualifier removed. If \p T has no
	/// address space qualifier it is returned unchanged.
	QualType ASTContext::removeAddrSpaceQualType(QualType T) const {
	  // Separate the qualifiers from the underlying type node.
	  QualifierCollector Collected;
	  const Type *Stripped = Collected.strip(T);

	  // Nothing to remove.
	  if (!Collected.hasAddressSpace())
	    return T;

	  Collected.removeAddressSpace();

	  // Without the address space there may be no non-fast qualifiers left, in
	  // which case creating an ExtQualType isn't possible (asserts) or required.
	  if (!Collected.hasNonFastQualifiers())
	    return QualType(Stripped, Collected.getFastQualifiers());

	  return getExtQualType(Stripped, Collected);
	}

	/// Return \p T qualified with the Objective-C GC attribute \p GCAttr. If the
	/// canonical form of \p T already has that attribute, \p T is returned
	/// unchanged. For a pointer whose pointee is itself any kind of pointer, the
	/// attribute is applied to the pointee and a pointer to the result is
	/// returned.
	QualType ASTContext::getObjCGCQualType(QualType T,
	                                       Qualifiers::GC GCAttr) const {
	  if (getCanonicalType(T).getObjCGCAttr() == GCAttr)
	    return T;

	  if (const auto *Ptr = T->getAs<PointerType>()) {
	    QualType Pointee = Ptr->getPointeeType();
	    if (Pointee->isAnyPointerType())
	      return getPointerType(getObjCGCQualType(Pointee, GCAttr));
	  }

	  // Collect the existing qualifiers so that everything ends up merged into a
	  // single ExtQuals node.
	  QualifierCollector Collected;
	  const Type *Stripped = Collected.strip(T);

	  // A type that already has an ObjCGC attribute cannot be given another.
	  assert(!Collected.hasObjCGCAttr() &&
	         "Type cannot have multiple ObjCGCs!");
	  Collected.addObjCGCAttr(GCAttr);

	  return getExtQualType(Stripped, Collected);
	}

	/// If \p T is a pointer whose pointee lives in a pointer-size address space,
	/// return a pointer to the pointee with that address space removed.
	/// Otherwise return \p T unchanged.
	QualType ASTContext::removePtrSizeAddrSpace(QualType T) const {
	  const PointerType *Ptr = T->getAs<PointerType>();
	  if (!Ptr)
	    return T;

	  QualType Pointee = Ptr->getPointeeType();
	  if (!isPtrSizeAddressSpace(Pointee.getAddressSpace()))
	    return T;

	  return getPointerType(removeAddrSpaceQualType(Pointee));
	}

	/// Return a function type identical to \p T except that its ExtInfo is
	/// \p Info. \p T itself is returned when it already has that ExtInfo.
	///
	/// \param T    a FunctionNoProtoType or FunctionProtoType.
	/// \param Info the ExtInfo the result should carry.
	const FunctionType *ASTContext::adjustFunctionType(const FunctionType *T,
	                                                   FunctionType::ExtInfo Info) {
	  if (T->getExtInfo() == Info)
	    return T;

	  QualType Result;
	  if (const auto *FNPT = dyn_cast<FunctionNoProtoType>(T)) {
	    Result = getFunctionNoProtoType(FNPT->getReturnType(), Info);
	  } else {
	    // Anything that is not a no-prototype function must be a prototype one.
	    const auto *FPT = cast<FunctionProtoType>(T);
	    FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo();
	    EPI.ExtInfo = Info;
	    Result = getFunctionType(FPT->getReturnType(), FPT->getParamTypes(), EPI);
	  }

	  return cast<FunctionType>(Result.getTypePtr());
	}

	/// Rewrite the type of every redeclaration of \p FD, from the most recent one
	/// back through getPreviousDecl(), so that its return type is \p ResultType,
	/// then notify the AST mutation listener, if any.
	void ASTContext::adjustDeducedFunctionResultType(FunctionDecl *FD,
	                                                 QualType ResultType) {
	  FD = FD->getMostRecentDecl();
	  for (;;) {
	    const auto *Proto = FD->getType()->castAs<FunctionProtoType>();
	    FunctionProtoType::ExtProtoInfo ProtoInfo = Proto->getExtProtoInfo();
	    FD->setType(getFunctionType(ResultType, Proto->getParamTypes(), ProtoInfo));

	    FunctionDecl *Earlier = FD->getPreviousDecl();
	    if (!Earlier)
	      break;
	    FD = Earlier;
	  }

	  // FD now names the last declaration visited by the walk above.
	  if (ASTMutationListener *Listener = getASTMutationListener())
	    Listener->DeducedReturnType(FD, ResultType);
	}

	/// Produce the function type equivalent to \p Orig but with the exception
	/// specification \p ESI. Parens, macro-qualified wrappers and attributed
	/// types around the function type are looked through and rebuilt around the
	/// result; any other type must be a function prototype type.
	QualType ASTContext::getFunctionTypeWithExceptionSpec(
	    QualType Orig, const FunctionProtoType::ExceptionSpecInfo &ESI) {
	  // Parenthesized: rebuild the inner type and re-wrap it.
	  if (const auto *Paren = dyn_cast<ParenType>(Orig)) {
	    QualType Inner = getFunctionTypeWithExceptionSpec(Paren->getInnerType(), ESI);
	    return getParenType(Inner);
	  }

	  // Macro qualified: rebuild the underlying type, keep the macro identifier.
	  if (const auto *Macro = dyn_cast<MacroQualifiedType>(Orig)) {
	    QualType Underlying =
	        getFunctionTypeWithExceptionSpec(Macro->getUnderlyingType(), ESI);
	    return getMacroQualifiedType(Underlying, Macro->getMacroIdentifier());
	  }

	  // Attributed (e.g. a calling-convention attribute): rebuild both the
	  // modified and the equivalent type.
	  if (const auto *Attributed = dyn_cast<AttributedType>(Orig)) {
	    QualType Modified =
	        getFunctionTypeWithExceptionSpec(Attributed->getModifiedType(), ESI);
	    QualType Equivalent =
	        getFunctionTypeWithExceptionSpec(Attributed->getEquivalentType(), ESI);
	    return getAttributedType(Attributed->getAttrKind(), Modified, Equivalent);
	  }

	  // Anything else must be a function type. Rebuild it with the new exception
	  // specification.
	  const auto *Proto = Orig->castAs<FunctionProtoType>();
	  auto ProtoInfo = Proto->getExtProtoInfo().withExceptionSpec(ESI);
	  return getFunctionType(Proto->getReturnType(), Proto->getParamTypes(),
	                         ProtoInfo);
	}

	/// Return true if \p T and \p U are the same type, or, in C++17 mode, if
	/// they become the same type once both have their exception specification
	/// replaced by EST_None.
	bool ASTContext::hasSameFunctionTypeIgnoringExceptionSpec(QualType T,
	                                                          QualType U) {
	  return hasSameType(T, U) ||
	         (getLangOpts().CPlusPlus17 &&
	          hasSameType(getFunctionTypeWithExceptionSpec(T, EST_None),
	                      getFunctionTypeWithExceptionSpec(U, EST_None)));
	}

	/// Return the function type \p T with removePtrSizeAddrSpace() applied to its
	/// return type and, for prototype functions, to every parameter type. A type
	/// that is not a function type is returned unchanged.
	QualType ASTContext::getFunctionTypeWithoutPtrSizes(QualType T) {
	  if (const auto *WithProto = T->getAs<FunctionProtoType>()) {
	    QualType Ret = removePtrSizeAddrSpace(WithProto->getReturnType());
	    SmallVector<QualType, 16> Params(WithProto->param_types());
	    for (QualType &Param : Params)
	      Param = removePtrSizeAddrSpace(Param);
	    return getFunctionType(Ret, Params, WithProto->getExtProtoInfo());
	  }

	  if (const auto *NoProto = T->getAs<FunctionNoProtoType>()) {
	    QualType Ret = removePtrSizeAddrSpace(NoProto->getReturnType());
	    return getFunctionNoProtoType(Ret, NoProto->getExtInfo());
	  }

	  return T;
	}

	/// Return true if \p T and \p U are the same type, or become the same type
	/// after getFunctionTypeWithoutPtrSizes() has been applied to both.
	bool ASTContext::hasSameFunctionTypeIgnoringPtrSizes(QualType T, QualType U) {
	  return hasSameType(T, U) ||
	         hasSameType(getFunctionTypeWithoutPtrSizes(T),
	                     getFunctionTypeWithoutPtrSizes(U));
	}

	/// Give \p FD the exception specification \p ESI. The declaration's type is
	/// always updated; when \p AsWritten is true the type stored in its
	/// TypeSourceInfo, if there is one, is overridden as well.
	void ASTContext::adjustExceptionSpec(
	    FunctionDecl *FD, const FunctionProtoType::ExceptionSpecInfo &ESI,
	    bool AsWritten) {
	  // Update the declared type first.
	  QualType NewType = getFunctionTypeWithExceptionSpec(FD->getType(), ESI);
	  FD->setType(NewType);

	  if (!AsWritten)
	    return;

	  // The type source information, when present, is patched up too.
	  TypeSourceInfo *Written = FD->getTypeSourceInfo();
	  if (!Written)
	    return;

	  // If the type and the type-as-written differ, the written type gets its
	  // own rebuilt version.
	  if (Written->getType() != FD->getType())
	    NewType = getFunctionTypeWithExceptionSpec(Written->getType(), ESI);

	  // FIXME: When we get proper type location information for exceptions,
	  // we'll also have to rebuild the TypeSourceInfo. For now, we just patch
	  // up the TypeSourceInfo;
	  assert(TypeLoc::getFullDataSizeForType(NewType) ==
	             TypeLoc::getFullDataSizeForType(Written->getType()) &&
	         "TypeLoc size mismatch from updating exception specification");
	  Written->overrideType(NewType);
	}

	/// getComplexType - Return the uniqued reference to the type for a complex
	/// number with the specified element type.
	QualType ASTContext::getComplexType(QualType T) const {
	  // Unique complex types, to guarantee there is only one node of a
	  // particular structure.
	  llvm::FoldingSetNodeID ID;
	  ComplexType::Profile(ID, T);

	  void *InsertPos = nullptr;
	  if (ComplexType *CT = ComplexTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(CT, 0);

	  // If the element type isn't canonical, this won't be a canonical type
	  // either, so fill in the canonical type field.
	  QualType Canonical;
	  if (!T.isCanonical()) {
	    Canonical = getComplexType(getCanonicalType(T));

	    // Get the new insert position for the node we care about; the recursive
	    // call above may have inserted into ComplexTypes.
	    ComplexType *NewIP = ComplexTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }
	  auto *New = new (*this, TypeAlignment) ComplexType(T, Canonical);
	  Types.push_back(New);
	  ComplexTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// getPointerType - Return the uniqued reference to the type for a pointer to
	/// the specified type.
	QualType ASTContext::getPointerType(QualType T) const {
	  // Unique pointers, to guarantee there is only one pointer of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  PointerType::Profile(ID, T);

	  void *InsertPos = nullptr;
	  if (PointerType *PT = PointerTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(PT, 0);

	  // If the pointee type isn't canonical, this won't be a canonical type either,
	  // so fill in the canonical type field.
	  QualType Canonical;
	  if (!T.isCanonical()) {
	    Canonical = getPointerType(getCanonicalType(T));

	    // Get the new insert position for the node we care about; the recursive
	    // call above may have inserted into PointerTypes.
	    PointerType *NewIP = PointerTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }
	  auto *New = new (*this, TypeAlignment) PointerType(T, Canonical);
	  Types.push_back(New);
	  PointerTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// Return the uniqued AdjustedType recording that \p Orig was adjusted to
	/// \p New. Its canonical type is the canonical form of \p New.
	QualType ASTContext::getAdjustedType(QualType Orig, QualType New) const {
	  llvm::FoldingSetNodeID ID;
	  AdjustedType::Profile(ID, Orig, New);

	  void *InsertPos = nullptr;
	  if (AdjustedType *Existing =
	          AdjustedTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(Existing, 0);

	  QualType Canonical = getCanonicalType(New);

	  // Get the new insert position for the node we care about.
	  AdjustedType *Node = AdjustedTypes.FindNodeOrInsertPos(ID, InsertPos);
	  assert(!Node && "Shouldn't be in the map!");

	  Node = new (*this, TypeAlignment)
	      AdjustedType(Type::Adjusted, Orig, New, Canonical);
	  Types.push_back(Node);
	  AdjustedTypes.InsertNode(Node, InsertPos);
	  return QualType(Node, 0);
	}

	/// Return the uniqued DecayedType describing the decay of \p T, which must
	/// be an array or function type. Arrays decay via getArrayDecayedType() and
	/// functions decay to a pointer to the function type; the node is cached in
	/// AdjustedTypes.
	QualType ASTContext::getDecayedType(QualType T) const {
	  assert((T->isArrayType() || T->isFunctionType()) && "T does not decay");

	  QualType Decayed;

	  // C99 6.7.5.3p7:
	  //   A declaration of a parameter as "array of type" shall be
	  //   adjusted to "qualified pointer to type", where the type
	  //   qualifiers (if any) are those specified within the [ and ] of
	  //   the array type derivation.
	  if (T->isArrayType())
	    Decayed = getArrayDecayedType(T);

	  // C99 6.7.5.3p8:
	  //   A declaration of a parameter as "function returning type"
	  //   shall be adjusted to "pointer to function returning type", as
	  //   in 6.3.2.1.
	  if (T->isFunctionType())
	    Decayed = getPointerType(T);

	  llvm::FoldingSetNodeID ID;
	  AdjustedType::Profile(ID, T, Decayed);
	  void *InsertPos = nullptr;
	  AdjustedType *AT = AdjustedTypes.FindNodeOrInsertPos(ID, InsertPos);
	  if (AT)
	    return QualType(AT, 0);

	  QualType Canonical = getCanonicalType(Decayed);

	  // Get the new insert position for the node we care about.
	  AT = AdjustedTypes.FindNodeOrInsertPos(ID, InsertPos);
	  assert(!AT && "Shouldn't be in the map!");

	  AT = new (*this, TypeAlignment) DecayedType(T, Decayed, Canonical);
	  Types.push_back(AT);
	  AdjustedTypes.InsertNode(AT, InsertPos);
	  return QualType(AT, 0);
	}

	/// getBlockPointerType - Return the uniqued reference to the type for
	/// a pointer to the specified block. \p T must be a function type.
	QualType ASTContext::getBlockPointerType(QualType T) const {
	  assert(T->isFunctionType() && "block of function types only");
	  // Unique pointers, to guarantee there is only one block of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  BlockPointerType::Profile(ID, T);

	  void *InsertPos = nullptr;
	  if (BlockPointerType *PT =
	        BlockPointerTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(PT, 0);

	  // If the block pointee type isn't canonical, this won't be a canonical
	  // type either so fill in the canonical type field.
	  QualType Canonical;
	  if (!T.isCanonical()) {
	    Canonical = getBlockPointerType(getCanonicalType(T));

	    // Get the new insert position for the node we care about; the recursive
	    // call above may have inserted into BlockPointerTypes.
	    BlockPointerType *NewIP =
	      BlockPointerTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }
	  auto *New = new (*this, TypeAlignment) BlockPointerType(T, Canonical);
	  Types.push_back(New);
	  BlockPointerTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// getLValueReferenceType - Return the uniqued reference to the type for an
	/// lvalue reference to the specified type.
	///
	/// \param T               the referenced type; must not be the unresolved
	///                        overload placeholder type.
	/// \param SpelledAsLValue part of the uniquing profile and stored in the
	///                        node.
	QualType
	ASTContext::getLValueReferenceType(QualType T, bool SpelledAsLValue) const {
	  assert(getCanonicalType(T) != OverloadTy &&
	         "Unresolved overloaded function type");

	  // Unique pointers, to guarantee there is only one pointer of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  ReferenceType::Profile(ID, T, SpelledAsLValue);

	  void *InsertPos = nullptr;
	  if (LValueReferenceType *RT =
	        LValueReferenceTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(RT, 0);

	  const auto *InnerRef = T->getAs<ReferenceType>();

	  // If the referencee type isn't canonical, this won't be a canonical type
	  // either, so fill in the canonical type field.
	  QualType Canonical;
	  if (!SpelledAsLValue || InnerRef || !T.isCanonical()) {
	    // A reference to a reference is canonicalized as a reference to the
	    // inner reference's pointee.
	    QualType PointeeType = (InnerRef ? InnerRef->getPointeeType() : T);
	    Canonical = getLValueReferenceType(getCanonicalType(PointeeType));

	    // Get the new insert position for the node we care about.
	    LValueReferenceType *NewIP =
	      LValueReferenceTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }

	  auto *New = new (*this, TypeAlignment) LValueReferenceType(T, Canonical,
	                                                             SpelledAsLValue);
	  Types.push_back(New);
	  LValueReferenceTypes.InsertNode(New, InsertPos);

	  return QualType(New, 0);
	}

	/// getRValueReferenceType - Return the uniqued reference to the type for an
	/// rvalue reference to the specified type.
	QualType ASTContext::getRValueReferenceType(QualType T) const {
	  // Unique pointers, to guarantee there is only one pointer of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  ReferenceType::Profile(ID, T, false);

	  void *InsertPos = nullptr;
	  if (RValueReferenceType *RT =
	        RValueReferenceTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(RT, 0);

	  const auto *InnerRef = T->getAs<ReferenceType>();

	  // If the referencee type isn't canonical, this won't be a canonical type
	  // either, so fill in the canonical type field.
	  QualType Canonical;
	  if (InnerRef || !T.isCanonical()) {
	    // A reference to a reference is canonicalized as a reference to the
	    // inner reference's pointee.
	    QualType PointeeType = (InnerRef ? InnerRef->getPointeeType() : T);
	    Canonical = getRValueReferenceType(getCanonicalType(PointeeType));

	    // Get the new insert position for the node we care about.
	    RValueReferenceType *NewIP =
	      RValueReferenceTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }

	  auto *New = new (*this, TypeAlignment) RValueReferenceType(T, Canonical);
	  Types.push_back(New);
	  RValueReferenceTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// getMemberPointerType - Return the uniqued reference to the type for a
	/// member pointer to the specified type, in the specified class.
	///
	/// \param T   the pointee type.
	/// \param Cls the class type the member belongs to.
	QualType ASTContext::getMemberPointerType(QualType T, const Type *Cls) const {
	  // Unique pointers, to guarantee there is only one pointer of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  MemberPointerType::Profile(ID, T, Cls);

	  void *InsertPos = nullptr;
	  if (MemberPointerType *PT =
	      MemberPointerTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(PT, 0);

	  // If the pointee or class type isn't canonical, this won't be a canonical
	  // type either, so fill in the canonical type field.
	  QualType Canonical;
	  if (!T.isCanonical() || !Cls->isCanonicalUnqualified()) {
	    Canonical = getMemberPointerType(getCanonicalType(T),getCanonicalType(Cls));

	    // Get the new insert position for the node we care about.
	    MemberPointerType *NewIP =
	      MemberPointerTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }
	  auto *New = new (*this, TypeAlignment) MemberPointerType(T, Cls, Canonical);
	  Types.push_back(New);
	  MemberPointerTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// getConstantArrayType - Return the unique reference to the type for an
	/// array of the specified element type.
	///
	/// \param EltTy          the element type; must be dependent, incomplete or
	///                       of constant size.
	/// \param ArySizeIn      the number of elements; zero-extended or truncated
	///                       to the target's maximum pointer width.
	/// \param SizeExpr       the size expression; dropped unless it is
	///                       instantiation-dependent.
	/// \param ASM            the array size modifier.
	/// \param IndexTypeQuals the qualifiers of the index type.
	QualType ASTContext::getConstantArrayType(QualType EltTy,
	                                          const llvm::APInt &ArySizeIn,
	                                          const Expr *SizeExpr,
	                                          ArrayType::ArraySizeModifier ASM,
	                                          unsigned IndexTypeQuals) const {
	  assert((EltTy->isDependentType() ||
	          EltTy->isIncompleteType() || EltTy->isConstantSizeType()) &&
	         "Constant array of VLAs is illegal!");

	  // We only need the size as part of the type if it's instantiation-dependent.
	  if (SizeExpr && !SizeExpr->isInstantiationDependent())
	    SizeExpr = nullptr;

	  // Convert the array size into a canonical width matching the pointer size for
	  // the target.
	  llvm::APInt ArySize(ArySizeIn);
	  ArySize = ArySize.zextOrTrunc(Target->getMaxPointerWidth());

	  llvm::FoldingSetNodeID ID;
	  ConstantArrayType::Profile(ID, *this, EltTy, ArySize, SizeExpr, ASM,
	                             IndexTypeQuals);

	  void *InsertPos = nullptr;
	  if (ConstantArrayType *ATP =
	      ConstantArrayTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(ATP, 0);

	  // If the element type isn't canonical or has qualifiers, or the array bound
	  // is instantiation-dependent, this won't be a canonical type either, so fill
	  // in the canonical type field.
	  QualType Canon;
	  if (!EltTy.isCanonical() || EltTy.hasLocalQualifiers() || SizeExpr) {
	    // Build the canonical array from the unqualified canonical element type
	    // and re-apply the element qualifiers to the resulting array type.
	    SplitQualType canonSplit = getCanonicalType(EltTy).split();
	    Canon = getConstantArrayType(QualType(canonSplit.Ty, 0), ArySize, nullptr,
	                                 ASM, IndexTypeQuals);
	    Canon = getQualifiedType(Canon, canonSplit.Quals);

	    // Get the new insert position for the node we care about.
	    ConstantArrayType *NewIP =
	      ConstantArrayTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }

	  // Room for one trailing Expr pointer is reserved only when a size
	  // expression is kept.
	  void *Mem = Allocate(
	      ConstantArrayType::totalSizeToAlloc<const Expr *>(SizeExpr ? 1 : 0),
	      TypeAlignment);
	  auto *New = new (Mem)
	    ConstantArrayType(EltTy, Canon, ArySize, SizeExpr, ASM, IndexTypeQuals);
	  ConstantArrayTypes.InsertNode(New, InsertPos);
	  Types.push_back(New);
	  return QualType(New, 0);
	}

	/// getVariableArrayDecayedType - Turns the given type, which may be
	/// variably-modified, into the corresponding type with all the known
	/// sizes replaced with [*].
	///
	/// A type that is not variably-modified is returned unchanged. Otherwise the
	/// desugared type is rebuilt around the recursively decayed component type and
	/// the top-level qualifiers of the desugared type are re-applied to the result.
	QualType ASTContext::getVariableArrayDecayedType(QualType type) const {
	  // Vastly most common case.
	  if (!type->isVariablyModifiedType()) return type;

	  QualType result;

	  SplitQualType split = type.getSplitDesugaredType();
	  const Type *ty = split.Ty;
	  switch (ty->getTypeClass()) {
	  // Expand a case label for every non-canonical (sugar) type class; none of
	  // them should survive the desugaring above.
	#define TYPE(Class, Base)
	#define ABSTRACT_TYPE(Class, Base)
	#define NON_CANONICAL_TYPE(Class, Base) case Type::Class:
	#include "clang/AST/TypeNodes.inc"
	    llvm_unreachable("didn't desugar past all non-canonical types?");

	  // These types should never be variably-modified.
	  case Type::Builtin:
	  case Type::Complex:
	  case Type::Vector:
	  case Type::DependentVector:
	  case Type::ExtVector:
	  case Type::DependentSizedExtVector:
	  case Type::ConstantMatrix:
	  case Type::DependentSizedMatrix:
	  case Type::DependentAddressSpace:
	  case Type::ObjCObject:
	  case Type::ObjCInterface:
	  case Type::ObjCObjectPointer:
	  case Type::Record:
	  case Type::Enum:
	  case Type::UnresolvedUsing:
	  case Type::TypeOfExpr:
	  case Type::TypeOf:
	  case Type::Decltype:
	  case Type::UnaryTransform:
	  case Type::DependentName:
	  case Type::InjectedClassName:
	  case Type::TemplateSpecialization:
	  case Type::DependentTemplateSpecialization:
	  case Type::TemplateTypeParm:
	  case Type::SubstTemplateTypeParmPack:
	  case Type::Auto:
	  case Type::DeducedTemplateSpecialization:
	  case Type::PackExpansion:
	  case Type::ExtInt:
	  case Type::DependentExtInt:
	    llvm_unreachable("type should never be variably-modified");

	  // These types can be variably-modified but should never need to
	  // further decay.
	  case Type::FunctionNoProto:
	  case Type::FunctionProto:
	  case Type::BlockPointer:
	  case Type::MemberPointer:
	  case Type::Pipe:
	    return type;

	  // These types can be variably-modified.  All these modifications
	  // preserve structure except as noted by comments.
	  // TODO: if we ever care about optimizing VLAs, there are no-op
	  // optimizations available here.
	  case Type::Pointer:
	    result = getPointerType(getVariableArrayDecayedType(
	        cast<PointerType>(ty)->getPointeeType()));
	    break;

	  case Type::LValueReference: {
	    const auto *lv = cast<LValueReferenceType>(ty);
	    result = getLValueReferenceType(
	        getVariableArrayDecayedType(lv->getPointeeType()),
	        lv->isSpelledAsLValue());
	    break;
	  }

	  case Type::RValueReference: {
	    const auto *lv = cast<RValueReferenceType>(ty);
	    result = getRValueReferenceType(
	        getVariableArrayDecayedType(lv->getPointeeType()));
	    break;
	  }

	  case Type::Atomic: {
	    const auto *at = cast<AtomicType>(ty);
	    result = getAtomicType(getVariableArrayDecayedType(at->getValueType()));
	    break;
	  }

	  case Type::ConstantArray: {
	    const auto *cat = cast<ConstantArrayType>(ty);
	    result = getConstantArrayType(
	        getVariableArrayDecayedType(cat->getElementType()),
	        cat->getSize(),
	        cat->getSizeExpr(),
	        cat->getSizeModifier(),
	        cat->getIndexTypeCVRQualifiers());
	    break;
	  }

	  case Type::DependentSizedArray: {
	    const auto *dat = cast<DependentSizedArrayType>(ty);
	    result = getDependentSizedArrayType(
	        getVariableArrayDecayedType(dat->getElementType()),
	        dat->getSizeExpr(),
	        dat->getSizeModifier(),
	        dat->getIndexTypeCVRQualifiers(),
	        dat->getBracketsRange());
	    break;
	  }

	  // Turn incomplete types into [*] types.
	  case Type::IncompleteArray: {
	    const auto *iat = cast<IncompleteArrayType>(ty);
	    result = getVariableArrayType(
	        getVariableArrayDecayedType(iat->getElementType()),
	        /*size*/ nullptr,
	        ArrayType::Normal,
	        iat->getIndexTypeCVRQualifiers(),
	        SourceRange());
	    break;
	  }

	  // Turn VLA types into [*] types.
	  case Type::VariableArray: {
	    const auto *vat = cast<VariableArrayType>(ty);
	    result = getVariableArrayType(
	        getVariableArrayDecayedType(vat->getElementType()),
	        /*size*/ nullptr,
	        ArrayType::Star,
	        vat->getIndexTypeCVRQualifiers(),
	        vat->getBracketsRange());
	    break;
	  }
	  }

	  // Apply the top-level qualifiers from the original.
	  return getQualifiedType(result, split.Quals);
	}

	/// getVariableArrayType - Returns a non-unique reference to the type for a
	/// variable array of the specified element type.
	///
	/// A new VariableArrayType node is allocated on every call. When \p EltTy is
	/// non-canonical or carries local qualifiers, a canonical array type is built
	/// first from the unqualified canonical element type and the element
	/// qualifiers are re-applied to it.
	QualType ASTContext::getVariableArrayType(QualType EltTy,
	                                          Expr *NumElts,
	                                          ArrayType::ArraySizeModifier ASM,
	                                          unsigned IndexTypeQuals,
	                                          SourceRange Brackets) const {
	  // Since we don't unique expressions, it isn't possible to unique VLA's
	  // that have an expression provided for their size.
	  QualType Canon;

	  // Be sure to pull qualifiers off the element type.
	  if (!EltTy.isCanonical() || EltTy.hasLocalQualifiers()) {
	    SplitQualType canonSplit = getCanonicalType(EltTy).split();
	    Canon = getVariableArrayType(QualType(canonSplit.Ty, 0), NumElts, ASM,
	                                 IndexTypeQuals, Brackets);
	    Canon = getQualifiedType(Canon, canonSplit.Quals);
	  }

	  auto *New = new (*this, TypeAlignment)
	      VariableArrayType(EltTy, Canon, NumElts, ASM, IndexTypeQuals, Brackets);

	  // Record the node both in the VLA list and in the list of all types.
	  VariableArrayTypes.push_back(New);
	  Types.push_back(New);
	  return QualType(New, 0);
	}

	/// getDependentSizedArrayType - Returns a non-unique reference to
	/// the type for a dependently-sized array of the specified element
	/// type.
	///
	/// \p numElements may be null; otherwise it must be type- or value-dependent
	/// (asserted). With a size expression, a canonical node is looked up (or
	/// created) in DependentSizedArrayTypes and either returned directly or used
	/// as the canonical type of a freshly built sugared node.
	QualType ASTContext::getDependentSizedArrayType(QualType elementType,
	                                                Expr *numElements,
	                                                ArrayType::ArraySizeModifier ASM,
	                                                unsigned elementTypeQuals,
	                                                SourceRange brackets) const {
	  assert((!numElements || numElements->isTypeDependent() ||
	          numElements->isValueDependent()) &&
	         "Size must be type- or value-dependent!");

	  // Dependently-sized array types that do not have a specified number
	  // of elements will have their sizes deduced from a dependent
	  // initializer.  We do no canonicalization here at all, which is okay
	  // because they can't be used in most locations.
	  if (!numElements) {
	    auto *newType
	      = new (*this, TypeAlignment)
	          DependentSizedArrayType(*this, elementType, QualType(),
	                                  numElements, ASM, elementTypeQuals,
	                                  brackets);
	    Types.push_back(newType);
	    return QualType(newType, 0);
	  }

	  // Otherwise, we actually build a new type every time, but we
	  // also build a canonical type.

	  SplitQualType canonElementType = getCanonicalType(elementType).split();

	  void *insertPos = nullptr;
	  llvm::FoldingSetNodeID ID;
	  DependentSizedArrayType::Profile(ID, *this,
	                                   QualType(canonElementType.Ty, 0),
	                                   ASM, elementTypeQuals, numElements);

	  // Look for an existing type with these properties.
	  DependentSizedArrayType *canonTy =
	    DependentSizedArrayTypes.FindNodeOrInsertPos(ID, insertPos);

	  // If we don't have one, build one.
	  if (!canonTy) {
	    canonTy = new (*this, TypeAlignment)
	      DependentSizedArrayType(*this, QualType(canonElementType.Ty, 0),
	                              QualType(), numElements, ASM, elementTypeQuals,
	                              brackets);
	    DependentSizedArrayTypes.InsertNode(canonTy, insertPos);
	    Types.push_back(canonTy);
	  }

	  // Apply qualifiers from the element type to the array.
	  QualType canon = getQualifiedType(QualType(canonTy,0),
	                                    canonElementType.Quals);

	  // If we didn't need extra canonicalization for the element type or the size
	  // expression, then just use that as our result.
	  if (QualType(canonElementType.Ty, 0) == elementType &&
	      canonTy->getSizeExpr() == numElements)
	    return canon;

	  // Otherwise, we need to build a type which follows the spelling
	  // of the element type.
	  auto *sugaredType
	    = new (*this, TypeAlignment)
	        DependentSizedArrayType(*this, elementType, canon, numElements,
	                                ASM, elementTypeQuals, brackets);
	  Types.push_back(sugaredType);
	  return QualType(sugaredType, 0);
	}

	/// Return the uniqued incomplete array type for the given element type, size
	/// modifier and index qualifiers, creating and registering it in
	/// IncompleteArrayTypes if no matching node exists yet.
	QualType ASTContext::getIncompleteArrayType(QualType elementType,
	                                            ArrayType::ArraySizeModifier ASM,
	                                            unsigned elementTypeQuals) const {
	  llvm::FoldingSetNodeID ID;
	  IncompleteArrayType::Profile(ID, elementType, ASM, elementTypeQuals);

	  void *insertPos = nullptr;
	  if (IncompleteArrayType *iat =
	       IncompleteArrayTypes.FindNodeOrInsertPos(ID, insertPos))
	    return QualType(iat, 0);

	  // If the element type isn't canonical, this won't be a canonical type
	  // either, so fill in the canonical type field.  We also have to pull
	  // qualifiers off the element type.
	  QualType canon;

	  if (!elementType.isCanonical() || elementType.hasLocalQualifiers()) {
	    SplitQualType canonSplit = getCanonicalType(elementType).split();
	    canon = getIncompleteArrayType(QualType(canonSplit.Ty, 0),
	                                   ASM, elementTypeQuals);
	    canon = getQualifiedType(canon, canonSplit.Quals);

	    // Get the new insert position for the node we care about.
	    IncompleteArrayType *existing =
	      IncompleteArrayTypes.FindNodeOrInsertPos(ID, insertPos);
	    assert(!existing && "Shouldn't be in the map!"); (void) existing;
	  }

	  auto *newType = new (*this, TypeAlignment)
	    IncompleteArrayType(elementType, canon, ASM, elementTypeQuals);

	  IncompleteArrayTypes.InsertNode(newType, insertPos);
	  Types.push_back(newType);
	  return QualType(newType, 0);
	}

	/// getScalableVectorType - Return the unique reference to a scalable vector
	/// type of the specified element type and size. VectorType must be a built-in
	/// type.
	///
	/// Returns a null QualType when the target has no AArch64 SVE types or when no
	/// entry of AArch64SVEACLETypes.def matches \p EltTy and \p NumElts.
	QualType ASTContext::getScalableVectorType(QualType EltTy,
	                                           unsigned NumElts) const {
	  if (Target->hasAArch64SVETypes()) {
	    uint64_t EltTySize = getTypeSize(EltTy);
	// Each SVE vector entry expands to a check of the element kind (signedness
	// for integers, FP vs. bfloat16 for floating point), element bit width and
	// element count; the first matching entry's singleton type is returned.
	#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId, NumEls, ElBits,    \
	                        IsSigned, IsFP, IsBF)                                  \
	  if (!EltTy->isBooleanType() &&                                               \
	      ((EltTy->hasIntegerRepresentation() &&                                   \
	        EltTy->hasSignedIntegerRepresentation() == IsSigned) ||                \
	       (EltTy->hasFloatingRepresentation() && !EltTy->isBFloat16Type() &&      \
	        IsFP && !IsBF) ||                                                      \
	       (EltTy->hasFloatingRepresentation() && EltTy->isBFloat16Type() &&       \
	        IsBF && !IsFP)) &&                                                     \
	      EltTySize == ElBits && NumElts == NumEls) {                              \
	    return SingletonId;                                                        \
	  }
	// Predicate entries match boolean element types by element count only.
	#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId, NumEls)         \
	  if (EltTy->isBooleanType() && NumElts == NumEls)                             \
	    return SingletonId;
	#include "clang/Basic/AArch64SVEACLETypes.def"
	  }
	  return QualType();
	}

	/// getVectorType - Return the unique reference to a vector type of
	/// the specified element type and size. \p vecType must be a built-in type
	/// (asserted).
	///
	/// The node is uniqued in VectorTypes on (element type, element count,
	/// Type::Vector, vector kind).
	QualType ASTContext::getVectorType(QualType vecType, unsigned NumElts,
	                                   VectorType::VectorKind VecKind) const {
	  assert(vecType->isBuiltinType());

	  // Check if we've already instantiated a vector of this type.
	  llvm::FoldingSetNodeID ID;
	  VectorType::Profile(ID, vecType, NumElts, Type::Vector, VecKind);

	  void *InsertPos = nullptr;
	  if (VectorType *VTP = VectorTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(VTP, 0);

	  // If the element type isn't canonical, this won't be a canonical type either,
	  // so fill in the canonical type field.
	  QualType Canonical;
	  if (!vecType.isCanonical()) {
	    Canonical = getVectorType(getCanonicalType(vecType), NumElts, VecKind);

	    // Get the new insert position for the node we care about.
	    VectorType *NewIP = VectorTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }
	  auto *New = new (*this, TypeAlignment)
	    VectorType(vecType, NumElts, Canonical, VecKind);
	  VectorTypes.InsertNode(New, InsertPos);
	  Types.push_back(New);
	  return QualType(New, 0);
	}

	/// Build a DependentVectorType for \p VecType with the dependent size
	/// expression \p SizeExpr.
	///
	/// A new node is allocated on every call. Only a node whose element type is
	/// already canonical, and for which no profile-equal node exists yet, is
	/// inserted into DependentVectorTypes; every other node merely points at a
	/// canonical node as its canonical type.
	QualType
	ASTContext::getDependentVectorType(QualType VecType, Expr *SizeExpr,
	                                   SourceLocation AttrLoc,
	                                   VectorType::VectorKind VecKind) const {
	  const QualType CanonElt = getCanonicalType(VecType);

	  llvm::FoldingSetNodeID Key;
	  DependentVectorType::Profile(Key, *this, CanonElt, SizeExpr, VecKind);

	  void *Slot = nullptr;
	  DependentVectorType *Result = nullptr;

	  if (DependentVectorType *Existing =
	          DependentVectorTypes.FindNodeOrInsertPos(Key, Slot)) {
	    // A canonical node with this profile already exists: hang the new node
	    // off it.
	    Result = new (*this, TypeAlignment) DependentVectorType(
	        *this, VecType, QualType(Existing, 0), SizeExpr, AttrLoc, VecKind);
	  } else if (CanonElt == VecType) {
	    // The element type is canonical, so this node becomes the canonical one.
	    Result = new (*this, TypeAlignment) DependentVectorType(
	        *this, VecType, QualType(), SizeExpr, AttrLoc, VecKind);

	    DependentVectorType *CanonCheck =
	        DependentVectorTypes.FindNodeOrInsertPos(Key, Slot);
	    assert(!CanonCheck &&
	           "Dependent-sized vector_size canonical type broken");
	    (void)CanonCheck;
	    DependentVectorTypes.InsertNode(Result, Slot);
	  } else {
	    // Build the canonical node from the canonical element type first, then
	    // the sugared node on top of it.
	    const QualType CanonVec = getDependentVectorType(
	        CanonElt, SizeExpr, SourceLocation(), VecKind);
	    Result = new (*this, TypeAlignment) DependentVectorType(
	        *this, VecType, CanonVec, SizeExpr, AttrLoc, VecKind);
	  }

	  Types.push_back(Result);
	  return QualType(Result, 0);
	}

	/// getExtVectorType - Return the unique reference to an extended vector type of
	/// the specified element type and size. \p vecType must be a built-in type or
	/// a dependent type (asserted).
	///
	/// The node is uniqued in VectorTypes under the Type::ExtVector type class
	/// with the GenericVector kind.
	QualType
	ASTContext::getExtVectorType(QualType vecType, unsigned NumElts) const {
	  assert(vecType->isBuiltinType() || vecType->isDependentType());

	  // Check if we've already instantiated a vector of this type.
	  llvm::FoldingSetNodeID ID;
	  VectorType::Profile(ID, vecType, NumElts, Type::ExtVector,
	                      VectorType::GenericVector);
	  void *InsertPos = nullptr;
	  if (VectorType *VTP = VectorTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(VTP, 0);

	  // If the element type isn't canonical, this won't be a canonical type either,
	  // so fill in the canonical type field.
	  QualType Canonical;
	  if (!vecType.isCanonical()) {
	    Canonical = getExtVectorType(getCanonicalType(vecType), NumElts);

	    // Get the new insert position for the node we care about.
	    VectorType *NewIP = VectorTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }
	  auto *New = new (*this, TypeAlignment)
	    ExtVectorType(vecType, NumElts, Canonical);
	  VectorTypes.InsertNode(New, InsertPos);
	  Types.push_back(New);
	  return QualType(New, 0);
	}

	/// Build a DependentSizedExtVectorType for \p vecType with the dependent size
	/// expression \p SizeExpr.
	///
	/// A new node is allocated on every call. Only a node whose element type is
	/// already canonical, and for which no profile-equal node exists yet, is
	/// inserted into DependentSizedExtVectorTypes.
	QualType
	ASTContext::getDependentSizedExtVectorType(QualType vecType,
	                                           Expr *SizeExpr,
	                                           SourceLocation AttrLoc) const {
	  const QualType CanonElt = getCanonicalType(vecType);

	  llvm::FoldingSetNodeID Key;
	  DependentSizedExtVectorType::Profile(Key, *this, CanonElt, SizeExpr);

	  void *Slot = nullptr;
	  DependentSizedExtVectorType *Result = nullptr;

	  if (DependentSizedExtVectorType *Existing =
	          DependentSizedExtVectorTypes.FindNodeOrInsertPos(Key, Slot)) {
	    // We already have a canonical version of this array type; use it as
	    // the canonical type for a newly-built type.
	    Result = new (*this, TypeAlignment) DependentSizedExtVectorType(
	        *this, vecType, QualType(Existing, 0), SizeExpr, AttrLoc);
	  } else if (CanonElt == vecType) {
	    // The element type is canonical, so this node becomes the canonical one.
	    Result = new (*this, TypeAlignment) DependentSizedExtVectorType(
	        *this, vecType, QualType(), SizeExpr, AttrLoc);

	    DependentSizedExtVectorType *CanonCheck =
	        DependentSizedExtVectorTypes.FindNodeOrInsertPos(Key, Slot);
	    assert(!CanonCheck && "Dependent-sized ext_vector canonical type broken");
	    (void)CanonCheck;
	    DependentSizedExtVectorTypes.InsertNode(Result, Slot);
	  } else {
	    // Build the canonical node from the canonical element type first.
	    const QualType CanonVec =
	        getDependentSizedExtVectorType(CanonElt, SizeExpr, SourceLocation());
	    Result = new (*this, TypeAlignment) DependentSizedExtVectorType(
	        *this, vecType, CanonVec, SizeExpr, AttrLoc);
	  }

	  Types.push_back(Result);
	  return QualType(Result, 0);
	}

	/// Return the uniqued constant matrix type with the given element type and
	/// dimensions, creating and registering it in MatrixTypes if needed.
	///
	/// \p ElementTy must be a valid matrix element type and both dimensions must
	/// be valid (asserted).
	QualType ASTContext::getConstantMatrixType(QualType ElementTy, unsigned NumRows,
	                                           unsigned NumColumns) const {
	  llvm::FoldingSetNodeID ID;
	  ConstantMatrixType::Profile(ID, ElementTy, NumRows, NumColumns,
	                              Type::ConstantMatrix);

	  assert(MatrixType::isValidElementType(ElementTy) &&
	         "need a valid element type");
	  assert(ConstantMatrixType::isDimensionValid(NumRows) &&
	         ConstantMatrixType::isDimensionValid(NumColumns) &&
	         "need valid matrix dimensions");
	  void *InsertPos = nullptr;
	  if (ConstantMatrixType *MTP = MatrixTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(MTP, 0);

	  // A non-canonical element type needs a canonical matrix type built from the
	  // canonical element type.
	  QualType Canonical;
	  if (!ElementTy.isCanonical()) {
	    Canonical =
	        getConstantMatrixType(getCanonicalType(ElementTy), NumRows, NumColumns);

	    // The recursive call may have modified the set; refresh the insert
	    // position for the node we are about to add.
	    ConstantMatrixType *NewIP = MatrixTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Matrix type shouldn't already exist in the map");
	    (void)NewIP;
	  }

	  auto *New = new (*this, TypeAlignment)
	      ConstantMatrixType(ElementTy, NumRows, NumColumns, Canonical);
	  MatrixTypes.InsertNode(New, InsertPos);
	  Types.push_back(New);
	  return QualType(New, 0);
	}

	/// Build a matrix type whose row and column counts are given by the dependent
	/// expressions \p RowExpr and \p ColumnExpr.
	///
	/// A canonical node is looked up (or created) in DependentSizedMatrixTypes
	/// using the canonical element type. It is returned directly when it matches
	/// the request exactly; otherwise a new node with the requested spelling is
	/// built on top of it.
	QualType ASTContext::getDependentSizedMatrixType(QualType ElementTy,
	                                                 Expr *RowExpr,
	                                                 Expr *ColumnExpr,
	                                                 SourceLocation AttrLoc) const {
	  QualType CanonElementTy = getCanonicalType(ElementTy);
	  llvm::FoldingSetNodeID ID;
	  DependentSizedMatrixType::Profile(ID, *this, CanonElementTy, RowExpr,
	                                    ColumnExpr);

	  void *InsertPos = nullptr;
	  DependentSizedMatrixType *Canon =
	      DependentSizedMatrixTypes.FindNodeOrInsertPos(ID, InsertPos);

	  if (!Canon) {
	    Canon = new (*this, TypeAlignment) DependentSizedMatrixType(
	        *this, CanonElementTy, QualType(), RowExpr, ColumnExpr, AttrLoc);
	#ifndef NDEBUG
	    DependentSizedMatrixType *CanonCheck =
	        DependentSizedMatrixTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!CanonCheck && "Dependent-sized matrix canonical type broken");
	#endif
	    DependentSizedMatrixTypes.InsertNode(Canon, InsertPos);
	    Types.push_back(Canon);
	  }

	  // Already have a canonical version of the matrix type
	  //
	  // If it exactly matches the requested type, use it directly. The column
	  // expression must be compared against the canonical node's column
	  // expression, not its row expression.
	  if (Canon->getElementType() == ElementTy && Canon->getRowExpr() == RowExpr &&
	      Canon->getColumnExpr() == ColumnExpr)
	    return QualType(Canon, 0);

	  // Use Canon as the canonical type for newly-built type.
	  DependentSizedMatrixType *New = new (*this, TypeAlignment)
	      DependentSizedMatrixType(*this, ElementTy, QualType(Canon, 0), RowExpr,
	                               ColumnExpr, AttrLoc);
	  Types.push_back(New);
	  return QualType(New, 0);
	}

	/// Build a type that places \p PointeeType in the address space named by the
	/// instantiation-dependent expression \p AddrSpaceExpr (asserted).
	///
	/// The canonical node is uniqued in DependentAddressSpaceTypes on the
	/// canonical pointee type and the expression; a sugared node is layered on
	/// top whenever the request differs from that canonical node.
	QualType ASTContext::getDependentAddressSpaceType(QualType PointeeType,
	                                                  Expr *AddrSpaceExpr,
	                                                  SourceLocation AttrLoc) const {
	  assert(AddrSpaceExpr->isInstantiationDependent());

	  const QualType CanonPointee = getCanonicalType(PointeeType);

	  llvm::FoldingSetNodeID Key;
	  DependentAddressSpaceType::Profile(Key, *this, CanonPointee, AddrSpaceExpr);

	  void *Slot = nullptr;
	  DependentAddressSpaceType *CanonNode =
	      DependentAddressSpaceTypes.FindNodeOrInsertPos(Key, Slot);

	  // Create and register the canonical node on first use.
	  if (CanonNode == nullptr) {
	    CanonNode = new (*this, TypeAlignment) DependentAddressSpaceType(
	        *this, CanonPointee, QualType(), AddrSpaceExpr, AttrLoc);
	    DependentAddressSpaceTypes.InsertNode(CanonNode, Slot);
	    Types.push_back(CanonNode);
	  }

	  // The canonical node is the answer when the pointee was already canonical
	  // and the node holds this very expression.
	  const bool IsExactMatch = CanonPointee == PointeeType &&
	                            CanonNode->getAddrSpaceExpr() == AddrSpaceExpr;
	  if (IsExactMatch)
	    return QualType(CanonNode, 0);

	  // Otherwise keep the spelling in a separate, non-uniqued node.
	  auto *Sugar = new (*this, TypeAlignment) DependentAddressSpaceType(
	      *this, PointeeType, QualType(CanonNode, 0), AddrSpaceExpr, AttrLoc);
	  Types.push_back(Sugar);
	  return QualType(Sugar, 0);
	}

	/// Determine whether \p T is canonical as the result type of a function.
	///
	/// That requires \p T to be a canonical type whose ObjC lifetime qualifier is
	/// either OCL_None or OCL_ExplicitNone.
	static bool isCanonicalResultType(QualType T) {
	  return T.isCanonical() &&
	         (T.getObjCLifetime() == Qualifiers::OCL_None ||
	          T.getObjCLifetime() == Qualifiers::OCL_ExplicitNone);
	}

	/// getFunctionNoProtoType - Return a K&R style C function type like 'int()'.
	///
	/// The node is uniqued in FunctionNoProtoTypes on the result type and
	/// \p Info. If \p ResultTy is not canonical as a function result type, the
	/// canonical function type is built first from the canonical result type.
	QualType
	ASTContext::getFunctionNoProtoType(QualType ResultTy,
	                                   const FunctionType::ExtInfo &Info) const {
	  // Unique functions, to guarantee there is only one function of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  FunctionNoProtoType::Profile(ID, ResultTy, Info);

	  void *InsertPos = nullptr;
	  if (FunctionNoProtoType *FT =
	        FunctionNoProtoTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(FT, 0);

	  QualType Canonical;
	  if (!isCanonicalResultType(ResultTy)) {
	    Canonical =
	      getFunctionNoProtoType(getCanonicalFunctionResultType(ResultTy), Info);

	    // Get the new insert position for the node we care about.
	    FunctionNoProtoType *NewIP =
	      FunctionNoProtoTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }

	  auto *New = new (*this, TypeAlignment)
	    FunctionNoProtoType(ResultTy, Canonical, Info);
	  Types.push_back(New);
	  FunctionNoProtoTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// Return the canonical form of \p ResultType for use as a function result
	/// type: the canonical type with any ObjC lifetime qualifier removed.
	CanQualType
	ASTContext::getCanonicalFunctionResultType(QualType ResultType) const {
	  CanQualType CanTy = getCanonicalType(ResultType);
	  Qualifiers Quals = CanTy.getQualifiers();

	  // Nothing to strip: the canonical type is already usable as-is.
	  if (!Quals.hasObjCLifetime())
	    return CanTy;

	  // Canonical result types do not have ARC lifetime qualifiers.
	  Quals.removeObjCLifetime();
	  return CanQualType::CreateUnsafe(
	      getQualifiedType(CanTy.getUnqualifiedType(), Quals));
	}

	/// Determine whether the exception specification \p ESI is already in the
	/// form used by canonical function types.
	///
	/// \p NoexceptInType says whether the exception specification is part of the
	/// function type at all; when it is not, only the absence of a specification
	/// (EST_None) counts as canonical.
	static bool isCanonicalExceptionSpecification(
	    const FunctionProtoType::ExceptionSpecInfo &ESI, bool NoexceptInType) {
	  if (ESI.Type == EST_None)
	    return true;
	  if (!NoexceptInType)
	    return false;

	  switch (ESI.Type) {
	  // C++17 onwards: exception specification is part of the type, as a simple
	  // boolean "can this function type throw".
	  case EST_BasicNoexcept:
	  // A noexcept(expr) specification is (possibly) canonical if expr is
	  // value-dependent.
	  case EST_DependentNoexcept:
	    return true;

	  // A dynamic exception specification is canonical if it only contains pack
	  // expansions (so we can't tell whether it's non-throwing) and all its
	  // contained types are canonical.
	  case EST_Dynamic: {
	    bool SawPackExpansion = false;
	    for (QualType ExTy : ESI.Exceptions) {
	      if (!ExTy.isCanonical())
	        return false;
	      if (ExTy->getAs<PackExpansionType>())
	        SawPackExpansion = true;
	    }
	    return SawPackExpansion;
	  }

	  default:
	    return false;
	  }
	}

	/// Build (or look up) the prototyped function type with result type
	/// \p ResultTy, parameter types \p ArgArray and extra information \p EPI.
	///
	/// When \p OnlyWantCanonical is true, an existing profile-equal node is always
	/// returned as-is, and the inputs are asserted to describe a canonical type.
	/// A node created as sugar for an existing profile-equal node (to hold a
	/// different noexcept expression) is not inserted into FunctionProtoTypes.
	QualType ASTContext::getFunctionTypeInternal(
	    QualType ResultTy, ArrayRef<QualType> ArgArray,
	    const FunctionProtoType::ExtProtoInfo &EPI, bool OnlyWantCanonical) const {
	  size_t NumArgs = ArgArray.size();

	  // Unique functions, to guarantee there is only one function of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  FunctionProtoType::Profile(ID, ResultTy, ArgArray.begin(), NumArgs, EPI,
	                             *this, true);

	  QualType Canonical;
	  bool Unique = false;

	  void *InsertPos = nullptr;
	  if (FunctionProtoType *FPT =
	        FunctionProtoTypes.FindNodeOrInsertPos(ID, InsertPos)) {
	    QualType Existing = QualType(FPT, 0);

	    // If we find a pre-existing equivalent FunctionProtoType, we can just reuse
	    // it so long as our exception specification doesn't contain a dependent
	    // noexcept expression, or we're just looking for a canonical type.
	    // Otherwise, we're going to need to create a type
	    // sugar node to hold the concrete expression.
	    if (OnlyWantCanonical || !isComputedNoexcept(EPI.ExceptionSpec.Type) ||
	        EPI.ExceptionSpec.NoexceptExpr == FPT->getNoexceptExpr())
	      return Existing;

	    // We need a new type sugar node for this one, to hold the new noexcept
	    // expression. We do no canonicalization here, but that's OK since we don't
	    // expect to see the same noexcept expression much more than once.
	    Canonical = getCanonicalType(Existing);
	    Unique = true;
	  }

	  bool NoexceptInType = getLangOpts().CPlusPlus17;
	  bool IsCanonicalExceptionSpec =
	      isCanonicalExceptionSpecification(EPI.ExceptionSpec, NoexceptInType);

	  // Determine whether the type being created is already canonical or not.
	  bool isCanonical = !Unique && IsCanonicalExceptionSpec &&
	                     isCanonicalResultType(ResultTy) && !EPI.HasTrailingReturn;
	  for (unsigned i = 0; i != NumArgs && isCanonical; ++i)
	    if (!ArgArray[i].isCanonicalAsParam())
	      isCanonical = false;

	  if (OnlyWantCanonical)
	    assert(isCanonical &&
	           "given non-canonical parameters constructing canonical type");

	  // If this type isn't canonical, get the canonical version of it if we don't
	  // already have it. The exception spec is only partially part of the
	  // canonical type, and only in C++17 onwards.
	  if (!isCanonical && Canonical.isNull()) {
	    SmallVector<QualType, 16> CanonicalArgs;
	    CanonicalArgs.reserve(NumArgs);
	    for (unsigned i = 0; i != NumArgs; ++i)
	      CanonicalArgs.push_back(getCanonicalParamType(ArgArray[i]));

	    // Backing storage for the canonical exception types; it must stay alive
	    // until the recursive call below has consumed CanonicalEPI.
	    llvm::SmallVector<QualType, 8> ExceptionTypeStorage;
	    FunctionProtoType::ExtProtoInfo CanonicalEPI = EPI;
	    CanonicalEPI.HasTrailingReturn = false;

	    if (IsCanonicalExceptionSpec) {
	      // Exception spec is already OK.
	    } else if (NoexceptInType) {
	      switch (EPI.ExceptionSpec.Type) {
	      case EST_Unparsed: case EST_Unevaluated: case EST_Uninstantiated:
	        // We don't know yet. It shouldn't matter what we pick here; no-one
	        // should ever look at this.
	        LLVM_FALLTHROUGH;
	      case EST_None: case EST_MSAny: case EST_NoexceptFalse:
	        CanonicalEPI.ExceptionSpec.Type = EST_None;
	        break;

	        // A dynamic exception specification is almost always "not noexcept",
	        // with the exception that a pack expansion might expand to no types.
	      case EST_Dynamic: {
	        bool AnyPacks = false;
	        for (QualType ET : EPI.ExceptionSpec.Exceptions) {
	          if (ET->getAs<PackExpansionType>())
	            AnyPacks = true;
	          ExceptionTypeStorage.push_back(getCanonicalType(ET));
	        }
	        if (!AnyPacks)
	          CanonicalEPI.ExceptionSpec.Type = EST_None;
	        else {
	          CanonicalEPI.ExceptionSpec.Type = EST_Dynamic;
	          CanonicalEPI.ExceptionSpec.Exceptions = ExceptionTypeStorage;
	        }
	        break;
	      }

	      case EST_DynamicNone:
	      case EST_BasicNoexcept:
	      case EST_NoexceptTrue:
	      case EST_NoThrow:
	        CanonicalEPI.ExceptionSpec.Type = EST_BasicNoexcept;
	        break;

	      case EST_DependentNoexcept:
	        llvm_unreachable("dependent noexcept is already canonical");
	      }
	    } else {
	      CanonicalEPI.ExceptionSpec = FunctionProtoType::ExceptionSpecInfo();
	    }

	    // Adjust the canonical function result type.
	    CanQualType CanResultTy = getCanonicalFunctionResultType(ResultTy);
	    Canonical =
	        getFunctionTypeInternal(CanResultTy, CanonicalArgs, CanonicalEPI, true);

	    // Get the new insert position for the node we care about.
	    FunctionProtoType *NewIP =
	      FunctionProtoTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }

	  // Compute the needed size to hold this FunctionProtoType and the
	  // various trailing objects.
	  auto ESH = FunctionProtoType::getExceptionSpecSize(
	      EPI.ExceptionSpec.Type, EPI.ExceptionSpec.Exceptions.size());
	  size_t Size = FunctionProtoType::totalSizeToAlloc<
	      QualType, SourceLocation, FunctionType::FunctionTypeExtraBitfields,
	      FunctionType::ExceptionType, Expr *, FunctionDecl *,
	      FunctionProtoType::ExtParameterInfo, Qualifiers>(
	      NumArgs, EPI.Variadic,
	      FunctionProtoType::hasExtraBitfields(EPI.ExceptionSpec.Type),
	      ESH.NumExceptionType, ESH.NumExprPtr, ESH.NumFunctionDeclPtr,
	      EPI.ExtParameterInfos ? NumArgs : 0,
	      EPI.TypeQuals.hasNonFastQualifiers() ? 1 : 0);

	  // Allocate raw storage of the computed size, then construct the node in it.
	  auto *FTP = static_cast<FunctionProtoType *>(Allocate(Size, TypeAlignment));
	  FunctionProtoType::ExtProtoInfo newEPI = EPI;
	  new (FTP) FunctionProtoType(ResultTy, ArgArray, Canonical, newEPI);
	  Types.push_back(FTP);
	  // A sugar node for an existing profile-equal type must not be inserted:
	  // the folding set already holds a node with this profile.
	  if (!Unique)
	    FunctionProtoTypes.InsertNode(FTP, InsertPos);
	  return QualType(FTP, 0);
	}

	/// Return the uniqued pipe type with element type \p T; \p ReadOnly is
	/// forwarded to the PipeType node and is part of its uniquing profile.
	QualType ASTContext::getPipeType(QualType T, bool ReadOnly) const {
	  llvm::FoldingSetNodeID ID;
	  PipeType::Profile(ID, T, ReadOnly);

	  void *InsertPos = nullptr;
	  if (PipeType *PT = PipeTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(PT, 0);

	  // If the pipe element type isn't canonical, this won't be a canonical type
	  // either, so fill in the canonical type field.
	  QualType Canonical;
	  if (!T.isCanonical()) {
	    Canonical = getPipeType(getCanonicalType(T), ReadOnly);

	    // Get the new insert position for the node we care about.
	    PipeType *NewIP = PipeTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!");
	    (void)NewIP;
	  }
	  auto *New = new (*this, TypeAlignment) PipeType(T, Canonical, ReadOnly);
	  Types.push_back(New);
	  PipeTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// Adjust the base type \p Ty of a string literal for the current language:
	/// under OpenCL it is qualified with the constant address space, otherwise it
	/// is returned unchanged.
	QualType ASTContext::adjustStringLiteralBaseType(QualType Ty) const {
	  if (!LangOpts.OpenCL)
	    return Ty;

	  // OpenCL v1.1 s6.5.3: a string literal is in the constant address space.
	  return getAddrSpaceQualType(Ty, LangAS::opencl_constant);
	}

	/// Return the read-only pipe type with element type \p T.
	QualType ASTContext::getReadPipeType(QualType T) const {
	  return getPipeType(T, /*ReadOnly=*/true);
	}

	/// Return the pipe type with element type \p T that is not read-only.
	QualType ASTContext::getWritePipeType(QualType T) const {
	  return getPipeType(T, /*ReadOnly=*/false);
	}

	/// Return the uniqued ExtIntType with the given signedness and bit width,
	/// creating and registering it in ExtIntTypes if it does not exist yet.
	QualType ASTContext::getExtIntType(bool IsUnsigned, unsigned NumBits) const {
	  llvm::FoldingSetNodeID ID;
	  ExtIntType::Profile(ID, IsUnsigned, NumBits);

	  void *InsertPos = nullptr;
	  if (ExtIntType *EIT = ExtIntTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(EIT, 0);

	  auto *New = new (*this, TypeAlignment) ExtIntType(IsUnsigned, NumBits);
	  ExtIntTypes.InsertNode(New, InsertPos);
	  Types.push_back(New);
	  return QualType(New, 0);
	}

	/// Return the uniqued DependentExtIntType whose bit width is given by the
	/// instantiation-dependent expression \p NumBitsExpr (asserted), creating and
	/// registering it in DependentExtIntTypes if it does not exist yet.
	QualType ASTContext::getDependentExtIntType(bool IsUnsigned,
	                                            Expr *NumBitsExpr) const {
	  assert(NumBitsExpr->isInstantiationDependent() && "Only good for dependent");
	  llvm::FoldingSetNodeID ID;
	  DependentExtIntType::Profile(ID, *this, IsUnsigned, NumBitsExpr);

	  void *InsertPos = nullptr;
	  if (DependentExtIntType *Existing =
	          DependentExtIntTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(Existing, 0);

	  auto *New = new (*this, TypeAlignment)
	      DependentExtIntType(*this, IsUnsigned, NumBitsExpr);
	  DependentExtIntTypes.InsertNode(New, InsertPos);

	  Types.push_back(New);
	  return QualType(New, 0);
	}

	#ifndef NDEBUG
	static bool NeedsInjectedClassNameType(const RecordDecl *D) {
	if (!isa<CXXRecordDecl>(D)) return false;
	const auto *RD = cast<CXXRecordDecl>(D);
	if (isa<ClassTemplatePartialSpecializationDecl>(RD))
	return true;
	if (RD->getDescribedClassTemplate() &&
	!isa<ClassTemplateSpecializationDecl>(RD))
	return true;
	return false;
	}
	#endif

	/// getInjectedClassNameType - Return the unique reference to the
	/// injected class name type for the specified templated declaration.
	///
	/// The type is cached in Decl->TypeForDecl. If it is not set yet, it is
	/// copied from the previous declaration when there is one; otherwise a new
	/// InjectedClassNameType is created from \p TST.
	QualType ASTContext::getInjectedClassNameType(CXXRecordDecl *Decl,
	                                              QualType TST) const {
	  assert(NeedsInjectedClassNameType(Decl));

	  if (!Decl->TypeForDecl) {
	    if (CXXRecordDecl *Prev = Decl->getPreviousDecl()) {
	      // Share the type node with the earlier declaration.
	      assert(Prev->TypeForDecl && "previous declaration has no type");
	      Decl->TypeForDecl = Prev->TypeForDecl;
	    } else {
	      // First declaration seen: create the type node.
	      Type *Fresh =
	          new (*this, TypeAlignment) InjectedClassNameType(Decl, TST);
	      Decl->TypeForDecl = Fresh;
	      Types.push_back(Fresh);
	    }
	  }

	  assert(isa<InjectedClassNameType>(Decl->TypeForDecl));
	  return QualType(Decl->TypeForDecl, 0);
	}

	/// getTypeDeclType - Return the unique reference to the type for the
	/// specified type declaration.
	///
	/// This is the slow path: \p Decl must be non-null and must not have a cached
	/// TypeForDecl yet (both asserted). Typedef names, records and enums are
	/// forwarded to their dedicated getters; an unresolved using-typename
	/// declaration gets a new UnresolvedUsingType that is cached on the decl.
	QualType ASTContext::getTypeDeclTypeSlow(const TypeDecl *Decl) const {
	  assert(Decl && "Passed null for Decl param");
	  assert(!Decl->TypeForDecl && "TypeForDecl present in slow case");

	  if (const auto *Typedef = dyn_cast<TypedefNameDecl>(Decl))
	    return getTypedefType(Typedef);

	  assert(!isa<TemplateTypeParmDecl>(Decl) &&
	         "Template type parameter types are always available.");

	  if (const auto *Record = dyn_cast<RecordDecl>(Decl)) {
	    assert(Record->isFirstDecl() && "struct/union has previous declaration");
	    assert(!NeedsInjectedClassNameType(Record));
	    return getRecordType(Record);
	  } else if (const auto *Enum = dyn_cast<EnumDecl>(Decl)) {
	    assert(Enum->isFirstDecl() && "enum has previous declaration");
	    return getEnumType(Enum);
	  } else if (const auto *Using = dyn_cast<UnresolvedUsingTypenameDecl>(Decl)) {
	    Type *newType = new (*this, TypeAlignment) UnresolvedUsingType(Using);
	    Decl->TypeForDecl = newType;
	    Types.push_back(newType);
	  } else
	    llvm_unreachable("TypeDecl without a type?");

	  return QualType(Decl->TypeForDecl, 0);
	}

	/// getTypedefType - Return the unique reference to the type for the
	/// specified typedef name decl.
	///
	/// The type is cached in Decl->TypeForDecl. If \p Canonical is null, the
	/// canonical type of the declaration's underlying type is used.
	QualType
	ASTContext::getTypedefType(const TypedefNameDecl *Decl,
	                           QualType Canonical) const {
	  if (Decl->TypeForDecl) return QualType(Decl->TypeForDecl, 0);

	  if (Canonical.isNull())
	    Canonical = getCanonicalType(Decl->getUnderlyingType());
	  auto *newType = new (*this, TypeAlignment)
	    TypedefType(Type::Typedef, Decl, Canonical);
	  Decl->TypeForDecl = newType;
	  Types.push_back(newType);
	  return QualType(newType, 0);
	}

	/// Return the type for the record declaration \p Decl.
	///
	/// The type is cached in Decl->TypeForDecl. If it is not set yet, it is
	/// taken from the previous declaration when that one already has a type;
	/// otherwise a new RecordType is created.
	QualType ASTContext::getRecordType(const RecordDecl *Decl) const {
	  if (Decl->TypeForDecl) return QualType(Decl->TypeForDecl, 0);

	  if (const RecordDecl *PrevDecl = Decl->getPreviousDecl())
	    if (PrevDecl->TypeForDecl)
	      return QualType(Decl->TypeForDecl = PrevDecl->TypeForDecl, 0);

	  auto *newType = new (*this, TypeAlignment) RecordType(Decl);
	  Decl->TypeForDecl = newType;
	  Types.push_back(newType);
	  return QualType(newType, 0);
	}

	/// Return the type for the enum declaration \p Decl.
	///
	/// The type is cached in Decl->TypeForDecl. If it is not set yet, it is
	/// taken from the previous declaration when that one already has a type;
	/// otherwise a new EnumType is created.
	QualType ASTContext::getEnumType(const EnumDecl *Decl) const {
	  if (Decl->TypeForDecl) return QualType(Decl->TypeForDecl, 0);

	  if (const EnumDecl *PrevDecl = Decl->getPreviousDecl())
	    if (PrevDecl->TypeForDecl)
	      return QualType(Decl->TypeForDecl = PrevDecl->TypeForDecl, 0);

	  auto *newType = new (*this, TypeAlignment) EnumType(Decl);
	  Decl->TypeForDecl = newType;
	  Types.push_back(newType);
	  return QualType(newType, 0);
	}

	/// Return the uniqued AttributedType for the given attribute kind, modified
	/// type and equivalent type. A newly created node uses the canonical type of
	/// \p equivalentType as its canonical type.
	QualType ASTContext::getAttributedType(attr::Kind attrKind,
	                                       QualType modifiedType,
	                                       QualType equivalentType) {
	  llvm::FoldingSetNodeID Key;
	  AttributedType::Profile(Key, attrKind, modifiedType, equivalentType);

	  void *Slot = nullptr;
	  if (AttributedType *Cached =
	          AttributedTypes.FindNodeOrInsertPos(Key, Slot))
	    return QualType(Cached, 0);

	  const QualType CanonEquivalent = getCanonicalType(equivalentType);
	  auto *Fresh = new (*this, TypeAlignment)
	      AttributedType(CanonEquivalent, attrKind, modifiedType, equivalentType);

	  Types.push_back(Fresh);
	  AttributedTypes.InsertNode(Fresh, Slot);

	  return QualType(Fresh, 0);
	}

	/// Retrieve a substitution-result type.
	///
	/// The node is uniqued in SubstTemplateTypeParmTypes on \p Parm and
	/// \p Replacement; \p Replacement must be canonical (asserted).
	QualType
	ASTContext::getSubstTemplateTypeParmType(const TemplateTypeParmType *Parm,
	                                         QualType Replacement) const {
	  assert(Replacement.isCanonical()
	         && "replacement types must always be canonical");

	  llvm::FoldingSetNodeID Key;
	  SubstTemplateTypeParmType::Profile(Key, Parm, Replacement);

	  void *Slot = nullptr;
	  if (SubstTemplateTypeParmType *Cached =
	          SubstTemplateTypeParmTypes.FindNodeOrInsertPos(Key, Slot))
	    return QualType(Cached, 0);

	  // Not seen before: create the node and register it.
	  auto *Fresh = new (*this, TypeAlignment)
	      SubstTemplateTypeParmType(Parm, Replacement);
	  Types.push_back(Fresh);
	  SubstTemplateTypeParmTypes.InsertNode(Fresh, Slot);
	  return QualType(Fresh, 0);
	}

	/// Retrieve the uniqued type recording that the template type parameter pack
	/// \p Parm was replaced by the argument pack \p ArgPack.
	///
	/// In asserts-enabled builds every pack element is checked to be a canonical
	/// type argument.
	QualType ASTContext::getSubstTemplateTypeParmPackType(
	    const TemplateTypeParmType *Parm,
	    const TemplateArgument &ArgPack) {
	#ifndef NDEBUG
	  for (const TemplateArgument &Elem : ArgPack.pack_elements()) {
	    assert(Elem.getKind() == TemplateArgument::Type &&"Pack contains a non-type");
	    assert(Elem.getAsType().isCanonical() && "Pack contains non-canonical type");
	  }
	#endif

	  llvm::FoldingSetNodeID NodeID;
	  SubstTemplateTypeParmPackType::Profile(NodeID, Parm, ArgPack);

	  void *Where = nullptr;
	  SubstTemplateTypeParmPackType *Node =
	      SubstTemplateTypeParmPackTypes.FindNodeOrInsertPos(NodeID, Where);
	  if (Node)
	    return QualType(Node, 0);

	  // When the parameter itself is not canonical, first build the node for the
	  // canonical parameter; it becomes this node's canonical type.
	  QualType CanonTy;
	  if (!Parm->isCanonicalUnqualified()) {
	    QualType CanonParm = getCanonicalType(QualType(Parm, 0));
	    CanonTy = getSubstTemplateTypeParmPackType(
	        cast<TemplateTypeParmType>(CanonParm), ArgPack);
	    // The recursive call used the same set, so look the position up again.
	    SubstTemplateTypeParmPackTypes.FindNodeOrInsertPos(NodeID, Where);
	  }

	  Node = new (*this, TypeAlignment)
	      SubstTemplateTypeParmPackType(Parm, CanonTy, ArgPack);
	  Types.push_back(Node);
	  SubstTemplateTypeParmPackTypes.InsertNode(Node, Where);
	  return QualType(Node, 0);
	}

	/// Retrieve the uniqued template type parameter type for a template
	/// parameter or parameter pack with the given depth, index and, optionally,
	/// declaration.
	///
	/// When \p TTPDecl is given, the node built without a declaration (same
	/// depth, index and pack-ness) is used as the canonical type.
	QualType ASTContext::getTemplateTypeParmType(unsigned Depth, unsigned Index,
	                                             bool ParameterPack,
	                                             TemplateTypeParmDecl *TTPDecl) const {
	  llvm::FoldingSetNodeID NodeID;
	  TemplateTypeParmType::Profile(NodeID, Depth, Index, ParameterPack, TTPDecl);

	  void *Where = nullptr;
	  if (TemplateTypeParmType *Existing =
	          TemplateTypeParmTypes.FindNodeOrInsertPos(NodeID, Where))
	    return QualType(Existing, 0);

	  TemplateTypeParmType *Created = nullptr;
	  if (!TTPDecl) {
	    Created = new (*this, TypeAlignment)
	        TemplateTypeParmType(Depth, Index, ParameterPack);
	  } else {
	    QualType CanonTy = getTemplateTypeParmType(Depth, Index, ParameterPack);
	    Created = new (*this, TypeAlignment) TemplateTypeParmType(TTPDecl, CanonTy);

	    // Building the canonical node touched the same set; refresh the insert
	    // position and verify that our node still is not present.
	    TemplateTypeParmType *Stale =
	        TemplateTypeParmTypes.FindNodeOrInsertPos(NodeID, Where);
	    assert(!Stale && "Template type parameter canonical type broken");
	    (void)Stale;
	  }

	  Types.push_back(Created);
	  TemplateTypeParmTypes.InsertNode(Created, Where);
	  return QualType(Created, 0);
	}

	/// Build a template specialization type for \p Name with arguments \p Args
	/// and wrap it in a TypeSourceInfo whose location data is filled in from
	/// \p NameLoc and \p Args. The template keyword location is left empty.
	TypeSourceInfo *
	ASTContext::getTemplateSpecializationTypeInfo(TemplateName Name,
	                                              SourceLocation NameLoc,
	                                        const TemplateArgumentListInfo &Args,
	                                              QualType Underlying) const {
	  assert(!Name.getAsDependentTemplateName() &&
	         "No dependent template names here!");

	  TypeSourceInfo *TSI = CreateTypeSourceInfo(
	      getTemplateSpecializationType(Name, Args, Underlying));

	  auto SpecLoc = TSI->getTypeLoc().castAs<TemplateSpecializationTypeLoc>();
	  SpecLoc.setTemplateKeywordLoc(SourceLocation());
	  SpecLoc.setTemplateNameLoc(NameLoc);
	  SpecLoc.setLAngleLoc(Args.getLAngleLoc());
	  SpecLoc.setRAngleLoc(Args.getRAngleLoc());

	  // Copy the per-argument location info across.
	  const unsigned ArgCount = SpecLoc.getNumArgs();
	  for (unsigned Idx = 0; Idx != ArgCount; ++Idx)
	    SpecLoc.setArgLocInfo(Idx, Args[Idx].getLocInfo());

	  return TSI;
	}

	/// Convenience overload: strips the location information from \p Args and
	/// forwards the bare template arguments to the ArrayRef-based overload.
	QualType
	ASTContext::getTemplateSpecializationType(TemplateName Template,
	                                          const TemplateArgumentListInfo &Args,
	                                          QualType Underlying) const {
	  assert(!Template.getAsDependentTemplateName() &&
	         "No dependent template names here!");

	  const unsigned ArgCount = Args.size();
	  SmallVector<TemplateArgument, 4> PlainArgs;
	  PlainArgs.reserve(ArgCount);
	  for (unsigned Idx = 0; Idx != ArgCount; ++Idx)
	    PlainArgs.push_back(Args[Idx].getArgument());

	  return getTemplateSpecializationType(Template, PlainArgs, Underlying);
	}

	#ifndef NDEBUG
	/// Return true if at least one argument in \p Args is a pack expansion.
	///
	/// Only compiled in asserts-enabled builds; used to validate callers.
	static bool hasAnyPackExpansions(ArrayRef<TemplateArgument> Args) {
	  for (const TemplateArgument &Arg : Args)
	    if (Arg.isPackExpansion())
	      return true;

	  // No pack expansion was found. The previous code returned true here as
	  // well, which made the function constant and the assertion using it
	  // vacuous.
	  return false;
	}
	#endif

	/// Build a (non-canonical, non-uniqued) template specialization type for
	/// \p Template applied to \p Args.
	///
	/// If \p Underlying is non-null, its canonical type becomes the canonical
	/// type of the result. Otherwise the canonical type is obtained from
	/// getCanonicalTemplateSpecializationType() and the result is not treated as
	/// a type alias.
	QualType
	ASTContext::getTemplateSpecializationType(TemplateName Template,
	                                          ArrayRef<TemplateArgument> Args,
	                                          QualType Underlying) const {
	  assert(!Template.getAsDependentTemplateName() &&
	         "No dependent template names here!");
	  // Look through qualified template names.
	  if (QualifiedTemplateName *QTN = Template.getAsQualifiedTemplateName())
	    Template = TemplateName(QTN->getTemplateDecl());

	  bool IsTypeAlias =
	    Template.getAsTemplateDecl() &&
	    isa<TypeAliasTemplateDecl>(Template.getAsTemplateDecl());
	  QualType CanonType;
	  if (!Underlying.isNull())
	    CanonType = getCanonicalType(Underlying);
	  else {
	    // We can get here with an alias template when the specialization contains
	    // a pack expansion that does not match up with a parameter pack.
	    assert((!IsTypeAlias || hasAnyPackExpansions(Args)) &&
	           "Caller must compute aliased type");
	    IsTypeAlias = false;
	    CanonType = getCanonicalTemplateSpecializationType(Template, Args);
	  }

	  // Allocate the (non-canonical) template specialization type, but don't
	  // try to unique it: these types typically have location information that
	  // we don't unique and don't want to lose.
	  // The allocation reserves room for the arguments and, for a type alias,
	  // one extra QualType.
	  void *Mem = Allocate(sizeof(TemplateSpecializationType) +
	                       sizeof(TemplateArgument) * Args.size() +
	                       (IsTypeAlias? sizeof(QualType) : 0),
	                       TypeAlignment);
	  auto *Spec
	    = new (Mem) TemplateSpecializationType(Template, Args, CanonType,
	                                         IsTypeAlias ? Underlying : QualType());

	  Types.push_back(Spec);
	  return QualType(Spec, 0);
	}

	/// Return the uniqued canonical template specialization type for
	/// \p Template applied to \p Args.
	///
	/// The template name and each argument are canonicalized before the lookup.
	/// The resulting node is asserted to be a dependent type.
	QualType ASTContext::getCanonicalTemplateSpecializationType(
	    TemplateName Template, ArrayRef<TemplateArgument> Args) const {
	  assert(!Template.getAsDependentTemplateName() &&
	         "No dependent template names here!");

	  // Look through qualified template names.
	  if (QualifiedTemplateName *Qualified = Template.getAsQualifiedTemplateName())
	    Template = TemplateName(Qualified->getTemplateDecl());

	  // Canonicalize the name and every argument.
	  TemplateName CanonName = getCanonicalTemplateName(Template);
	  const unsigned ArgCount = Args.size();
	  SmallVector<TemplateArgument, 4> CanonArgList;
	  CanonArgList.reserve(ArgCount);
	  for (unsigned Idx = 0; Idx != ArgCount; ++Idx)
	    CanonArgList.push_back(getCanonicalTemplateArgument(Args[Idx]));

	  // Check whether this canonical specialization has been built before.
	  llvm::FoldingSetNodeID NodeID;
	  TemplateSpecializationType::Profile(NodeID, CanonName, CanonArgList, *this);

	  void *Where = nullptr;
	  TemplateSpecializationType *Node =
	      TemplateSpecializationTypes.FindNodeOrInsertPos(NodeID, Where);

	  if (Node == nullptr) {
	    // Not found: allocate the node together with its trailing arguments.
	    void *Storage = Allocate(sizeof(TemplateSpecializationType) +
	                                 sizeof(TemplateArgument) * ArgCount,
	                             TypeAlignment);
	    Node = new (Storage) TemplateSpecializationType(CanonName, CanonArgList,
	                                                    QualType(), QualType());
	    Types.push_back(Node);
	    TemplateSpecializationTypes.InsertNode(Node, Where);
	  }

	  assert(Node->isDependentType() &&
	         "Non-dependent template-id type must have a canonical type");
	  return QualType(Node, 0);
	}

	/// Return the uniqued ElaboratedType for \p NamedType written with the given
	/// keyword and nested-name-specifier, optionally owning \p OwnedTagDecl.
	///
	/// The canonical type of the result is the canonical form of \p NamedType.
	QualType ASTContext::getElaboratedType(ElaboratedTypeKeyword Keyword,
	                                       NestedNameSpecifier *NNS,
	                                       QualType NamedType,
	                                       TagDecl *OwnedTagDecl) const {
	  llvm::FoldingSetNodeID ID;
	  ElaboratedType::Profile(ID, Keyword, NNS, NamedType, OwnedTagDecl);

	  void *InsertPos = nullptr;
	  ElaboratedType *T = ElaboratedTypes.FindNodeOrInsertPos(ID, InsertPos);
	  if (T)
	    return QualType(T, 0);

	  QualType Canon = NamedType;
	  if (!Canon.isCanonical()) {
	    Canon = getCanonicalType(NamedType);
	    // Refresh the insert position and verify the node is still absent.
	    ElaboratedType *CheckT = ElaboratedTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!CheckT && "Elaborated canonical type broken");
	    (void)CheckT;
	  }

	  // Reserve one trailing TagDecl* slot only when an owned tag decl is given.
	  void *Mem = Allocate(ElaboratedType::totalSizeToAlloc<TagDecl *>(!!OwnedTagDecl),
	                       TypeAlignment);
	  T = new (Mem) ElaboratedType(Keyword, NNS, NamedType, Canon, OwnedTagDecl);

	  Types.push_back(T);
	  ElaboratedTypes.InsertNode(T, InsertPos);
	  return QualType(T, 0);
	}

	/// Return the uniqued ParenType wrapping \p InnerType. Its canonical type is
	/// the canonical form of \p InnerType.
	QualType
	ASTContext::getParenType(QualType InnerType) const {
	  llvm::FoldingSetNodeID NodeID;
	  ParenType::Profile(NodeID, InnerType);

	  void *Where = nullptr;
	  if (ParenType *Existing = ParenTypes.FindNodeOrInsertPos(NodeID, Where))
	    return QualType(Existing, 0);

	  QualType CanonInner = InnerType;
	  if (!InnerType.isCanonical()) {
	    CanonInner = getCanonicalType(InnerType);
	    // Refresh the insert position and verify the node is still absent.
	    ParenType *Stale = ParenTypes.FindNodeOrInsertPos(NodeID, Where);
	    assert(!Stale && "Paren canonical type broken");
	    (void)Stale;
	  }

	  auto *Created = new (*this, TypeAlignment) ParenType(InnerType, CanonInner);
	  Types.push_back(Created);
	  ParenTypes.InsertNode(Created, Where);
	  return QualType(Created, 0);
	}

	/// Build a MacroQualifiedType wrapping \p UnderlyingTy for the macro
	/// identified by \p MacroII.
	///
	/// These nodes are not uniqued: every call allocates a new node and appends
	/// it to Types.
	QualType
	ASTContext::getMacroQualifiedType(QualType UnderlyingTy,
	                                  const IdentifierInfo *MacroII) const {
	  QualType Canon = UnderlyingTy;
	  if (!Canon.isCanonical())
	    Canon = getCanonicalType(UnderlyingTy);

	  // The placement-new allocator takes the context by reference.
	  auto *newType = new (*this, TypeAlignment)
	      MacroQualifiedType(UnderlyingTy, Canon, MacroII);
	  Types.push_back(newType);
	  return QualType(newType, 0);
	}

	/// Return the uniqued DependentNameType for \p Name qualified by \p NNS and
	/// written with \p Keyword.
	///
	/// If \p Canon is null and \p NNS is not already canonical, the canonical type
	/// is computed from the canonical nested-name-specifier.
	QualType ASTContext::getDependentNameType(ElaboratedTypeKeyword Keyword,
	                                          NestedNameSpecifier *NNS,
	                                          const IdentifierInfo *Name,
	                                          QualType Canon) const {
	  if (Canon.isNull()) {
	    NestedNameSpecifier *CanonicalNNS = getCanonicalNestedNameSpecifier(NNS);
	    if (CanonicalNNS != NNS)
	      Canon = getDependentNameType(Keyword, CanonicalNNS, Name);
	  }

	  llvm::FoldingSetNodeID NodeID;
	  DependentNameType::Profile(NodeID, Keyword, NNS, Name);

	  void *Where = nullptr;
	  if (DependentNameType *Existing =
	          DependentNameTypes.FindNodeOrInsertPos(NodeID, Where))
	    return QualType(Existing, 0);

	  auto *Created =
	      new (*this, TypeAlignment) DependentNameType(Keyword, NNS, Name, Canon);
	  Types.push_back(Created);
	  DependentNameTypes.InsertNode(Created, Where);
	  return QualType(Created, 0);
	}

	/// Convenience overload: strips the location information from \p Args and
	/// forwards the bare template arguments to the ArrayRef-based overload.
	QualType
	ASTContext::getDependentTemplateSpecializationType(
	                                 ElaboratedTypeKeyword Keyword,
	                                 NestedNameSpecifier *NNS,
	                                 const IdentifierInfo *Name,
	                                 const TemplateArgumentListInfo &Args) const {
	  // TODO: avoid this copy
	  SmallVector<TemplateArgument, 16> PlainArgs;
	  for (const TemplateArgumentLoc &ArgLoc : Args.arguments())
	    PlainArgs.push_back(ArgLoc.getArgument());

	  return getDependentTemplateSpecializationType(Keyword, NNS, Name, PlainArgs);
	}

	/// Return the uniqued DependentTemplateSpecializationType for the template
	/// named \p Name in \p NNS applied to \p Args.
	///
	/// The canonical form uses the canonical nested-name-specifier, canonical
	/// arguments, and ETK_Typename in place of ETK_None. When the requested type
	/// already is in that form, it is its own canonical type (null Canon).
	QualType
	ASTContext::getDependentTemplateSpecializationType(
	                                 ElaboratedTypeKeyword Keyword,
	                                 NestedNameSpecifier *NNS,
	                                 const IdentifierInfo *Name,
	                                 ArrayRef<TemplateArgument> Args) const {
	  assert((!NNS || NNS->isDependent()) &&
	         "nested-name-specifier must be dependent");

	  llvm::FoldingSetNodeID ID;
	  DependentTemplateSpecializationType::Profile(ID, *this, Keyword, NNS,
	                                               Name, Args);

	  void *InsertPos = nullptr;
	  DependentTemplateSpecializationType *T
	    = DependentTemplateSpecializationTypes.FindNodeOrInsertPos(ID, InsertPos);
	  if (T)
	    return QualType(T, 0);

	  NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS);

	  ElaboratedTypeKeyword CanonKeyword = Keyword;
	  if (Keyword == ETK_None) CanonKeyword = ETK_Typename;

	  // Canonicalize every argument and note whether any of them changed.
	  bool AnyNonCanonArgs = false;
	  unsigned NumArgs = Args.size();
	  SmallVector<TemplateArgument, 16> CanonArgs(NumArgs);
	  for (unsigned I = 0; I != NumArgs; ++I) {
	    CanonArgs[I] = getCanonicalTemplateArgument(Args[I]);
	    if (!CanonArgs[I].structurallyEquals(Args[I]))
	      AnyNonCanonArgs = true;
	  }

	  QualType Canon;
	  if (AnyNonCanonArgs || CanonNNS != NNS || CanonKeyword != Keyword) {
	    Canon = getDependentTemplateSpecializationType(CanonKeyword, CanonNNS,
	                                                   Name,
	                                                   CanonArgs);

	    // Find the insert position again.
	    DependentTemplateSpecializationTypes.FindNodeOrInsertPos(ID, InsertPos);
	  }

	  // The node is allocated together with room for its arguments.
	  void *Mem = Allocate((sizeof(DependentTemplateSpecializationType) +
	                        sizeof(TemplateArgument) * NumArgs),
	                       TypeAlignment);
	  T = new (Mem) DependentTemplateSpecializationType(Keyword, NNS,
	                                                    Name, Args, Canon);
	  Types.push_back(T);
	  DependentTemplateSpecializationTypes.InsertNode(T, InsertPos);
	  return QualType(T, 0);
	}

	/// Build the template argument that refers back to the template parameter
	/// \p Param itself.
	///
	/// Type parameters yield their declared type, non-type parameters yield a
	/// DeclRefExpr naming the parameter, and template template parameters yield
	/// a TemplateName. Parameter packs are wrapped as pack expansions and the
	/// result is then stored as a single-element argument pack.
	TemplateArgument ASTContext::getInjectedTemplateArg(NamedDecl *Param) {
	  TemplateArgument Arg;
	  if (const auto *TTP = dyn_cast<TemplateTypeParmDecl>(Param)) {
	    QualType ArgType = getTypeDeclType(TTP);
	    if (TTP->isParameterPack())
	      ArgType = getPackExpansionType(ArgType, None);

	    Arg = TemplateArgument(ArgType);
	  } else if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(Param)) {
	    // Both the allocator and the DeclRefExpr constructor take the context by
	    // reference.
	    Expr *E = new (*this) DeclRefExpr(
	        *this, NTTP, /*enclosing*/ false,
	        NTTP->getType().getNonPackExpansionType().getNonLValueExprType(*this),
	        Expr::getValueKindForType(NTTP->getType()), NTTP->getLocation());

	    if (NTTP->isParameterPack())
	      E = new (*this) PackExpansionExpr(DependentTy, E, NTTP->getLocation(),
	                                        None);
	    Arg = TemplateArgument(E);
	  } else {
	    auto *TTP = cast<TemplateTemplateParmDecl>(Param);
	    if (TTP->isParameterPack())
	      Arg = TemplateArgument(TemplateName(TTP), Optional<unsigned>());
	    else
	      Arg = TemplateArgument(TemplateName(TTP));
	  }

	  if (Param->isTemplateParameterPack())
	    Arg = TemplateArgument::CreatePackCopy(*this, Arg);

	  return Arg;
	}

	void
	ASTContext::getInjectedTemplateArgs(const TemplateParameterList *Params,
	SmallVectorImpl<TemplateArgument> &Args) {
	Args.reserve(Args.size() + Params->size());

	for (NamedDecl Param : Params)
	Args.push_back(getInjectedTemplateArg(Param));
	}

	/// Return the uniqued pack expansion type for \p Pattern with the given
	/// (optional) number of expansions.
	///
	/// When \p Pattern is not canonical, the canonical type is either the pack
	/// expansion of the canonical pattern or, if that canonical pattern no longer
	/// contains an unexpanded pack, the canonical pattern itself.
	QualType ASTContext::getPackExpansionType(QualType Pattern,
	                                          Optional<unsigned> NumExpansions) {
	  llvm::FoldingSetNodeID ID;
	  PackExpansionType::Profile(ID, Pattern, NumExpansions);

	  // A deduced type can deduce to a pack, eg
	  //   auto ...x = some_pack;
	  // That declaration isn't (yet) valid, but is created as part of building an
	  // init-capture pack:
	  //   [...x = some_pack] {}
	  assert((Pattern->containsUnexpandedParameterPack() ||
	          Pattern->getContainedDeducedType()) &&
	         "Pack expansions must expand one or more parameter packs");
	  void *InsertPos = nullptr;
	  PackExpansionType *T
	    = PackExpansionTypes.FindNodeOrInsertPos(ID, InsertPos);
	  if (T)
	    return QualType(T, 0);

	  QualType Canon;
	  if (!Pattern.isCanonical()) {
	    Canon = getCanonicalType(Pattern);
	    // The canonical type might not contain an unexpanded parameter pack, if it
	    // contains an alias template specialization which ignores one of its
	    // parameters.
	    if (Canon->containsUnexpandedParameterPack()) {
	      Canon = getPackExpansionType(Canon, NumExpansions);

	      // Find the insert position again, in case we inserted an element into
	      // PackExpansionTypes and invalidated our insert position.
	      PackExpansionTypes.FindNodeOrInsertPos(ID, InsertPos);
	    }
	  }

	  T = new (*this, TypeAlignment)
	      PackExpansionType(Pattern, Canon, NumExpansions);
	  Types.push_back(T);
	  PackExpansionTypes.InsertNode(T, InsertPos);
	  return QualType(T, 0);
	}

	/// CmpProtocolNames - Comparison predicate for sorting protocols
	/// alphabetically.
	static int CmpProtocolNames(ObjCProtocolDecl const LHS,
	ObjCProtocolDecl const RHS) {
	return DeclarationName::compare((LHS)->getDeclName(), (RHS)->getDeclName());
	}

	/// Return true if \p Protocols is strictly ordered according to
	/// CmpProtocolNames (sorted with no equal neighbours) and every entry is its
	/// own canonical declaration. An empty list trivially qualifies.
	static bool areSortedAndUniqued(ArrayRef<ObjCProtocolDecl *> Protocols) {
	  if (Protocols.empty()) return true;

	  if (Protocols[0]->getCanonicalDecl() != Protocols[0])
	    return false;

	  // Each element must compare strictly greater than its predecessor and be
	  // canonical.
	  for (unsigned i = 1; i != Protocols.size(); ++i)
	    if (CmpProtocolNames(&Protocols[i - 1], &Protocols[i]) >= 0 ||
	        Protocols[i]->getCanonicalDecl() != Protocols[i])
	      return false;
	  return true;
	}

	/// Sort \p Protocols by name, replace each entry with its canonical
	/// declaration, and drop adjacent duplicates, all in place.
	static void
	SortAndUniqueProtocols(SmallVectorImpl<ObjCProtocolDecl *> &Protocols) {
	  // Order by declaration name first.
	  llvm::array_pod_sort(Protocols.begin(), Protocols.end(), CmpProtocolNames);

	  // Then map every entry to its canonical declaration.
	  for (auto It = Protocols.begin(), End = Protocols.end(); It != End; ++It)
	    *It = (*It)->getCanonicalDecl();

	  // Finally collapse runs of identical pointers.
	  Protocols.erase(std::unique(Protocols.begin(), Protocols.end()),
	                  Protocols.end());
	}

	/// Convenience overload taking the protocol list as pointer plus count.
	/// Forwards to the general overload with no type arguments and with
	/// isKindOf set to false.
	QualType ASTContext::getObjCObjectType(QualType BaseType,
	                                       ObjCProtocolDecl * const *Protocols,
	                                       unsigned NumProtocols) const {
	  return getObjCObjectType(BaseType, {},
	                           llvm::makeArrayRef(Protocols, NumProtocols),
	                           /*isKindOf=*/false);
	}

	/// Return the uniqued Objective-C object type built from \p baseType, the
	/// type arguments \p typeArgs, the protocol qualifiers \p protocols and the
	/// \p isKindOf flag.
	///
	/// If nothing is being added to an interface type, \p baseType is returned
	/// unchanged. The canonical type uses the canonical base type, canonical type
	/// arguments and a sorted, uniqued protocol list.
	QualType ASTContext::getObjCObjectType(
	           QualType baseType,
	           ArrayRef<QualType> typeArgs,
	           ArrayRef<ObjCProtocolDecl *> protocols,
	           bool isKindOf) const {
	  // If the base type is an interface and there aren't any protocols or
	  // type arguments to add, then the interface type will do just fine.
	  if (typeArgs.empty() && protocols.empty() && !isKindOf &&
	      isa<ObjCInterfaceType>(baseType))
	    return baseType;

	  // Look in the folding set for an existing type.
	  llvm::FoldingSetNodeID ID;
	  ObjCObjectTypeImpl::Profile(ID, baseType, typeArgs, protocols, isKindOf);
	  void *InsertPos = nullptr;
	  if (ObjCObjectType *QT = ObjCObjectTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(QT, 0);

	  // Determine the type arguments to be used for canonicalization,
	  // which may be explicitly specified here or written on the base
	  // type.
	  ArrayRef<QualType> effectiveTypeArgs = typeArgs;
	  if (effectiveTypeArgs.empty()) {
	    if (const auto *baseObject = baseType->getAs<ObjCObjectType>())
	      effectiveTypeArgs = baseObject->getTypeArgs();
	  }

	  // Build the canonical type, which has the canonical base type and a
	  // sorted-and-uniqued list of protocols and the type arguments
	  // canonicalized.
	  QualType canonical;
	  bool typeArgsAreCanonical = std::all_of(effectiveTypeArgs.begin(),
	                                          effectiveTypeArgs.end(),
	                                          [&](QualType type) {
	                                            return type.isCanonical();
	                                          });
	  bool protocolsSorted = areSortedAndUniqued(protocols);
	  if (!typeArgsAreCanonical || !protocolsSorted || !baseType.isCanonical()) {
	    // Determine the canonical type arguments.
	    ArrayRef<QualType> canonTypeArgs;
	    SmallVector<QualType, 4> canonTypeArgsVec;
	    if (!typeArgsAreCanonical) {
	      canonTypeArgsVec.reserve(effectiveTypeArgs.size());
	      for (auto typeArg : effectiveTypeArgs)
	        canonTypeArgsVec.push_back(getCanonicalType(typeArg));
	      canonTypeArgs = canonTypeArgsVec;
	    } else {
	      canonTypeArgs = effectiveTypeArgs;
	    }

	    // Determine the canonical protocol list.
	    ArrayRef<ObjCProtocolDecl *> canonProtocols;
	    SmallVector<ObjCProtocolDecl*, 8> canonProtocolsVec;
	    if (!protocolsSorted) {
	      canonProtocolsVec.append(protocols.begin(), protocols.end());
	      SortAndUniqueProtocols(canonProtocolsVec);
	      canonProtocols = canonProtocolsVec;
	    } else {
	      canonProtocols = protocols;
	    }

	    canonical = getObjCObjectType(getCanonicalType(baseType), canonTypeArgs,
	                                  canonProtocols, isKindOf);

	    // Regenerate InsertPos.
	    ObjCObjectTypes.FindNodeOrInsertPos(ID, InsertPos);
	  }

	  // The node is allocated together with room for the type arguments and
	  // protocol pointers as written.
	  unsigned size = sizeof(ObjCObjectTypeImpl);
	  size += typeArgs.size() * sizeof(QualType);
	  size += protocols.size() * sizeof(ObjCProtocolDecl *);
	  void *mem = Allocate(size, TypeAlignment);
	  auto *T =
	    new (mem) ObjCObjectTypeImpl(canonical, baseType, typeArgs, protocols,
	                                 isKindOf);

	  Types.push_back(T);
	  ObjCObjectTypes.InsertNode(T, InsertPos);
	  return QualType(T, 0);
	}

	/// Apply the Objective-C protocol qualifiers \p protocols to \p type.
	///
	/// \p hasError is cleared on entry and set to true only when \p type is of a
	/// kind that cannot take protocol qualifiers; in that case \p type is
	/// returned unchanged. When \p allowOnPointerType is set, an
	/// ObjCObjectPointerType is handled by merging \p protocols into the
	/// protocol list of its pointee.
	QualType
	ASTContext::applyObjCProtocolQualifiers(QualType type,
	                  ArrayRef<ObjCProtocolDecl *> protocols, bool &hasError,
	                  bool allowOnPointerType) const {
	  hasError = false;

	  // Type parameter: rebuild the type parameter type with the protocols.
	  if (const auto *paramTy = dyn_cast<ObjCTypeParamType>(type.getTypePtr()))
	    return getObjCTypeParamType(paramTy->getDecl(), protocols);

	  // Object pointer (only when permitted): qualify the pointee.
	  if (allowOnPointerType) {
	    if (const auto *ptrTy =
	            dyn_cast<ObjCObjectPointerType>(type.getTypePtr())) {
	      const ObjCObjectType *pointee = ptrTy->getObjectType();

	      // Existing qualifiers come first, followed by the new ones.
	      SmallVector<ObjCProtocolDecl*, 8> mergedVec;
	      mergedVec.append(pointee->qual_begin(), pointee->qual_end());
	      mergedVec.append(protocols.begin(), protocols.end());
	      ArrayRef<ObjCProtocolDecl *> merged = mergedVec;

	      type = getObjCObjectType(pointee->getBaseType(),
	                               pointee->getTypeArgsAsWritten(),
	                               merged,
	                               pointee->isKindOfTypeAsWritten());
	      return getObjCObjectPointerType(type);
	    }
	  }

	  // Object type as written: keep its base and type arguments, use the new
	  // protocol list.
	  if (const auto *objectTy = dyn_cast<ObjCObjectType>(type.getTypePtr())) {
	    // FIXME: Check for protocols to which the class type is already
	    // known to conform.
	    return getObjCObjectType(objectTy->getBaseType(),
	                             objectTy->getTypeArgsAsWritten(),
	                             protocols,
	                             objectTy->isKindOfTypeAsWritten());
	  }

	  // The canonical type is an object type.
	  if (type->isObjCObjectType()) {
	    // Silently overwrite any existing protocol qualifiers.
	    // TODO: determine whether that's the right thing to do.

	    // FIXME: Check for protocols to which the class type is already
	    // known to conform.
	    return getObjCObjectType(type, {}, protocols, false);
	  }

	  // id<protocol-list>
	  if (type->isObjCIdType()) {
	    const auto *idPtr = type->castAs<ObjCObjectPointerType>();
	    type = getObjCObjectType(ObjCBuiltinIdTy, {}, protocols,
	                             idPtr->isKindOfType());
	    return getObjCObjectPointerType(type);
	  }

	  // Class<protocol-list>
	  if (type->isObjCClassType()) {
	    const auto *classPtr = type->castAs<ObjCObjectPointerType>();
	    type = getObjCObjectType(ObjCBuiltinClassTy, {}, protocols,
	                             classPtr->isKindOfType());
	    return getObjCObjectPointerType(type);
	  }

	  // Nothing matched: report the failure and hand the type back untouched.
	  hasError = true;
	  return type;
	}

	/// Return the uniqued ObjCTypeParamType for the type parameter \p Decl with
	/// the protocol qualifiers \p protocols.
	///
	/// The canonical type is the canonical underlying type of \p Decl, with the
	/// protocol qualifiers applied to it when \p protocols is non-empty.
	QualType
	ASTContext::getObjCTypeParamType(const ObjCTypeParamDecl *Decl,
	                                 ArrayRef<ObjCProtocolDecl *> protocols) const {
	  // Look in the folding set for an existing type.
	  llvm::FoldingSetNodeID ID;
	  ObjCTypeParamType::Profile(ID, Decl, Decl->getUnderlyingType(), protocols);
	  void *InsertPos = nullptr;
	  if (ObjCTypeParamType *TypeParam =
	      ObjCTypeParamTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(TypeParam, 0);

	  // We canonicalize to the underlying type.
	  QualType Canonical = getCanonicalType(Decl->getUnderlyingType());
	  if (!protocols.empty()) {
	    // Apply the protocol qualifiers.
	    bool hasError = false;
	    Canonical = getCanonicalType(applyObjCProtocolQualifiers(
	        Canonical, protocols, hasError, true /*allowOnPointerType*/));
	    assert(!hasError && "Error when apply protocol qualifier to bound type");
	  }

	  // The node is allocated together with room for the protocol pointers.
	  unsigned size = sizeof(ObjCTypeParamType);
	  size += protocols.size() * sizeof(ObjCProtocolDecl *);
	  void *mem = Allocate(size, TypeAlignment);
	  auto *newType = new (mem) ObjCTypeParamType(Decl, Canonical, protocols);

	  Types.push_back(newType);
	  ObjCTypeParamTypes.InsertNode(newType, InsertPos);
	  return QualType(newType, 0);
	}

	void ASTContext::adjustObjCTypeParamBoundType(const ObjCTypeParamDecl *Orig,
	ObjCTypeParamDecl *New) const {
	New->setTypeSourceInfo(getTrivialTypeSourceInfo(Orig->getUnderlyingType()));
	// Update TypeForDecl after updating TypeSourceInfo.
	auto NewTypeParamTy = cast<ObjCTypeParamType>(New->getTypeForDecl());
	SmallVector<ObjCProtocolDecl *, 8> protocols;
	protocols.append(NewTypeParamTy->qual_begin(), NewTypeParamTy->qual_end());
	QualType UpdatedTy = getObjCTypeParamType(New, protocols);
	New->setTypeForDecl(UpdatedTy.getTypePtr());
	}

	/// ObjCObjectAdoptsQTypeProtocols - Returns true when \p QT is a
	/// qualified-id type and the class \p IC implements every protocol in its
	/// protocol list; returns false otherwise.
	bool ASTContext::ObjCObjectAdoptsQTypeProtocols(QualType QT,
	                                                ObjCInterfaceDecl *IC) {
	  if (!QT->isObjCQualifiedIdType())
	    return false;

	  const auto *QualifiedId = QT->getAs<ObjCObjectPointerType>();
	  if (!QualifiedId)
	    return false;

	  // A single unimplemented protocol is enough to fail.
	  for (auto *Required : QualifiedId->quals())
	    if (!IC->ClassImplementsProtocol(Required, false))
	      return false;

	  return true;
	}

	/// QIdProtocolsAdoptObjCObjectProtocols - Relates the protocol list of the
	/// qualified-id type \p QT to the protocols inherited by \p IDecl.
	///
	/// Returns false when \p QT is not a qualified-id type, when \p IDecl has no
	/// definition, or when \p IDecl inherits no protocols. Otherwise returns true
	/// if either every protocol of \p QT is compatible with some inherited
	/// protocol (and there is at least one), or every inherited protocol is
	/// compatible with some protocol of \p QT.
	bool ASTContext::QIdProtocolsAdoptObjCObjectProtocols(QualType QT,
	                                                ObjCInterfaceDecl *IDecl) {
	  if (!QT->isObjCQualifiedIdType())
	    return false;

	  const auto *QualifiedId = QT->getAs<ObjCObjectPointerType>();
	  if (!QualifiedId || !IDecl->hasDefinition())
	    return false;

	  llvm::SmallPtrSet<ObjCProtocolDecl *, 8> Inherited;
	  CollectInheritedProtocols(IDecl, Inherited);
	  if (Inherited.empty())
	    return false;

	  // First direction: if every protocol in the id<plist> list conforms to a
	  // protocol of IDecl's, then bridge casting is ok.
	  bool EveryQualConforms = false;
	  for (auto *Qual : QualifiedId->quals()) {
	    bool Matched = false;
	    for (auto *Candidate : Inherited) {
	      if (ProtocolCompatibleWithProtocol(Qual, Candidate)) {
	        Matched = true;
	        break;
	      }
	    }
	    EveryQualConforms = Matched;
	    if (!Matched)
	      break;
	  }
	  if (EveryQualConforms)
	    return true;

	  // Second direction: every inherited protocol must be in the inheritance
	  // hierarchy of at least one protocol from the id<plist> list.
	  for (auto *Candidate : Inherited) {
	    bool Adopted = false;
	    for (auto *Qual : QualifiedId->quals()) {
	      Adopted = ProtocolCompatibleWithProtocol(Candidate, Qual);
	      if (Adopted)
	        break;
	    }
	    if (!Adopted)
	      return false;
	  }
	  return true;
	}

	/// getObjCObjectPointerType - Return the uniqued ObjCObjectPointerType that
	/// points to the object type \p ObjectT. For a non-canonical \p ObjectT the
	/// canonical type is the pointer to its canonical object type.
	QualType ASTContext::getObjCObjectPointerType(QualType ObjectT) const {
	  llvm::FoldingSetNodeID NodeID;
	  ObjCObjectPointerType::Profile(NodeID, ObjectT);

	  void *Where = nullptr;
	  ObjCObjectPointerType *Node =
	      ObjCObjectPointerTypes.FindNodeOrInsertPos(NodeID, Where);
	  if (Node)
	    return QualType(Node, 0);

	  // Build the canonical pointer first when the pointee is not canonical.
	  QualType CanonPtr;
	  if (!ObjectT.isCanonical()) {
	    CanonPtr = getObjCObjectPointerType(getCanonicalType(ObjectT));

	    // The recursive call used the same set; look the position up again.
	    ObjCObjectPointerTypes.FindNodeOrInsertPos(NodeID, Where);
	  }

	  // Nothing matched, so create and register a new node.
	  void *Storage = Allocate(sizeof(ObjCObjectPointerType), TypeAlignment);
	  Node = new (Storage) ObjCObjectPointerType(CanonPtr, ObjectT);

	  Types.push_back(Node);
	  ObjCObjectPointerTypes.InsertNode(Node, Where);
	  return QualType(Node, 0);
	}

	/// getObjCInterfaceType - Return the type node for the Objective-C interface
	/// declaration \p Decl.
	///
	/// A node already cached on \p Decl is returned as is. If \p PrevDecl is
	/// given, its node (which must exist) is shared with \p Decl. Otherwise a new
	/// node is created for, and cached on, the definition of the interface when
	/// there is one, or \p Decl itself when there is not.
	QualType ASTContext::getObjCInterfaceType(const ObjCInterfaceDecl *Decl,
	                                          ObjCInterfaceDecl *PrevDecl) const {
	  if (Decl->TypeForDecl)
	    return QualType(Decl->TypeForDecl, 0);

	  if (PrevDecl) {
	    assert(PrevDecl->TypeForDecl && "previous decl has no TypeForDecl");
	    return QualType(Decl->TypeForDecl = PrevDecl->TypeForDecl, 0);
	  }

	  // Prefer the definition, if there is one.
	  const ObjCInterfaceDecl *Owner = Decl;
	  if (const ObjCInterfaceDecl *Definition = Decl->getDefinition())
	    Owner = Definition;

	  void *Storage = Allocate(sizeof(ObjCInterfaceType), TypeAlignment);
	  auto *Created = new (Storage) ObjCInterfaceType(Owner);
	  Owner->TypeForDecl = Created;
	  Types.push_back(Created);
	  return QualType(Created, 0);
	}

	/// getTypeOfExprType - Unlike many "get<Type>" functions, we can't unique
	/// TypeOfExprType ASTs (since expressions are never shared). For example,
	/// multiple declarations that refer to "typeof(x)" all contain different
	/// DeclRefExprs. This doesn't affect the type checker, since it operates
	/// on canonical types (which are always unique).
	///
	/// For a type-dependent expression, a uniqued DependentTypeOfExprType serves
	/// as the canonical type; otherwise the canonical type of the expression's
	/// type is used.
	QualType ASTContext::getTypeOfExprType(Expr *tofExpr) const {
	  TypeOfExprType *toe;
	  if (tofExpr->isTypeDependent()) {
	    llvm::FoldingSetNodeID ID;
	    DependentTypeOfExprType::Profile(ID, *this, tofExpr);

	    void *InsertPos = nullptr;
	    DependentTypeOfExprType *Canon
	      = DependentTypeOfExprTypes.FindNodeOrInsertPos(ID, InsertPos);
	    if (Canon) {
	      // We already have a "canonical" version of an identical, dependent
	      // typeof(expr) type. Use that as our canonical type.
	      toe = new (*this, TypeAlignment) TypeOfExprType(tofExpr,
	                              QualType(static_cast<TypeOfExprType *>(Canon), 0));
	    } else {
	      // Build a new, canonical typeof(expr) type. Both the allocator and the
	      // constructor take the context by reference.
	      Canon
	        = new (*this, TypeAlignment) DependentTypeOfExprType(*this, tofExpr);
	      DependentTypeOfExprTypes.InsertNode(Canon, InsertPos);
	      toe = Canon;
	    }
	  } else {
	    QualType Canonical = getCanonicalType(tofExpr->getType());
	    toe = new (*this, TypeAlignment) TypeOfExprType(tofExpr, Canonical);
	  }
	  Types.push_back(toe);
	  return QualType(toe, 0);
	}

	/// getTypeOfType - Unlike many "get<Type>" functions, we don't unique
	/// TypeOfType nodes. The only motivation to unique these nodes would be
	/// memory savings. Since typeof(t) is fairly uncommon, space shouldn't be
	/// an issue. This doesn't affect the type checker, since it operates
	/// on canonical types (which are always unique).
	QualType ASTContext::getTypeOfType(QualType tofType) const {
	  QualType Canonical = getCanonicalType(tofType);
	  // The placement-new allocator takes the context by reference.
	  auto *tot = new (*this, TypeAlignment) TypeOfType(tofType, Canonical);
	  Types.push_back(tot);
	  return QualType(tot, 0);
	}

	/// Unlike many "get<Type>" functions, we don't unique DecltypeType
	/// nodes. This would never be helpful, since each such type has its own
	/// expression, and would not give a significant memory saving, since there
	/// is an Expr tree under each such type.
	///
	/// For an instantiation-dependent expression, a uniqued DependentDecltypeType
	/// serves as the canonical type; otherwise the canonical form of
	/// \p UnderlyingType is used.
	QualType ASTContext::getDecltypeType(Expr *e, QualType UnderlyingType) const {
	  DecltypeType *dt;

	  // C++11 [temp.type]p2:
	  //   If an expression e involves a template parameter, decltype(e) denotes a
	  //   unique dependent type. Two such decltype-specifiers refer to the same
	  //   type only if their expressions are equivalent (14.5.6.1).
	  if (e->isInstantiationDependent()) {
	    llvm::FoldingSetNodeID ID;
	    DependentDecltypeType::Profile(ID, *this, e);

	    void *InsertPos = nullptr;
	    DependentDecltypeType *Canon
	      = DependentDecltypeTypes.FindNodeOrInsertPos(ID, InsertPos);
	    if (!Canon) {
	      // Build a new, canonical decltype(expr) type. Both the allocator and
	      // the constructor take the context by reference.
	      Canon = new (*this, TypeAlignment) DependentDecltypeType(*this, e);
	      DependentDecltypeTypes.InsertNode(Canon, InsertPos);
	    }
	    dt = new (*this, TypeAlignment)
	        DecltypeType(e, UnderlyingType,
	                     QualType(static_cast<DecltypeType *>(Canon), 0));
	  } else {
	    dt = new (*this, TypeAlignment)
	        DecltypeType(e, UnderlyingType, getCanonicalType(UnderlyingType));
	  }
	  Types.push_back(dt);
	  return QualType(dt, 0);
	}

	/// getUnaryTransformType - Build a UnaryTransformType of kind \p Kind over
	/// \p BaseType. The returned node itself is not uniqued.
	///
	/// For a dependent \p BaseType, a uniqued DependentUnaryTransformType keyed on
	/// the canonical base type serves as the canonical type and the underlying
	/// type is left null. Otherwise the canonical form of \p UnderlyingType is
	/// used.
	QualType ASTContext::getUnaryTransformType(QualType BaseType,
	                                           QualType UnderlyingType,
	                                           UnaryTransformType::UTTKind Kind)
	    const {
	  UnaryTransformType *Result = nullptr;

	  if (!BaseType->isDependentType()) {
	    QualType CanonUnderlying = getCanonicalType(UnderlyingType);
	    Result = new (*this, TypeAlignment)
	        UnaryTransformType(BaseType, UnderlyingType, Kind, CanonUnderlying);
	  } else {
	    QualType CanonBase = getCanonicalType(BaseType);

	    // Look in the folding set for an existing canonical node.
	    llvm::FoldingSetNodeID NodeID;
	    DependentUnaryTransformType::Profile(NodeID, CanonBase, Kind);

	    void *Where = nullptr;
	    DependentUnaryTransformType *CanonNode =
	        DependentUnaryTransformTypes.FindNodeOrInsertPos(NodeID, Where);
	    if (!CanonNode) {
	      // Build a new, canonical __underlying_type(type) type.
	      CanonNode = new (*this, TypeAlignment)
	          DependentUnaryTransformType(*this, CanonBase, Kind);
	      DependentUnaryTransformTypes.InsertNode(CanonNode, Where);
	    }

	    Result = new (*this, TypeAlignment)
	        UnaryTransformType(BaseType, QualType(), Kind, QualType(CanonNode, 0));
	  }

	  Types.push_back(Result);
	  return QualType(Result, 0);
	}

	/// getAutoType - Return the uniqued reference to the 'auto' type which has been
	/// deduced to the given type, or to the canonical undeduced 'auto' type, or the
	/// canonical deduced-but-dependent 'auto' type.
	///
	/// \p IsPack may only be set together with \p IsDependent (asserted). A plain
	/// undeduced, unconstrained, non-dependent 'auto' is served by
	/// getAutoDeductType().
	QualType
	ASTContext::getAutoType(QualType DeducedType, AutoTypeKeyword Keyword,
	                        bool IsDependent, bool IsPack,
	                        ConceptDecl *TypeConstraintConcept,
	                        ArrayRef<TemplateArgument> TypeConstraintArgs) const {
	  assert((!IsPack || IsDependent) && "only use IsPack for a dependent pack");
	  if (DeducedType.isNull() && Keyword == AutoTypeKeyword::Auto &&
	      !TypeConstraintConcept && !IsDependent)
	    return getAutoDeductType();

	  // Look in the folding set for an existing type.
	  void *InsertPos = nullptr;
	  llvm::FoldingSetNodeID ID;
	  AutoType::Profile(ID, *this, DeducedType, Keyword, IsDependent,
	                    TypeConstraintConcept, TypeConstraintArgs);
	  if (AutoType *AT = AutoTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(AT, 0);

	  // The node is allocated together with room for the constraint arguments.
	  void *Mem = Allocate(sizeof(AutoType) +
	                       sizeof(TemplateArgument) * TypeConstraintArgs.size(),
	                       TypeAlignment);
	  // The dependence flags are the bitwise OR of the dependent and pack parts.
	  auto *AT = new (Mem) AutoType(
	      DeducedType, Keyword,
	      (IsDependent ? TypeDependence::DependentInstantiation
	                   : TypeDependence::None) |
	          (IsPack ? TypeDependence::UnexpandedPack : TypeDependence::None),
	      TypeConstraintConcept, TypeConstraintArgs);
	  Types.push_back(AT);
	  if (InsertPos)
	    AutoTypes.InsertNode(AT, InsertPos);
	  return QualType(AT, 0);
	}

	/// Return the uniqued reference to the deduced template specialization type
	/// which has been deduced to the given type, or to the canonical undeduced
	/// such type, or the canonical deduced-but-dependent such type.
	///
	/// \param Template the template name being specialized.
	/// \param DeducedType the deduced type.
	/// \param IsDependent forwarded to the type's profile and constructor.
	QualType ASTContext::getDeducedTemplateSpecializationType(
	    TemplateName Template, QualType DeducedType, bool IsDependent) const {
	  // Look in the folding set for an existing type.
	  void *InsertPos = nullptr;
	  llvm::FoldingSetNodeID ID;
	  DeducedTemplateSpecializationType::Profile(ID, Template, DeducedType,
	                                             IsDependent);
	  if (DeducedTemplateSpecializationType *DTST =
	          DeducedTemplateSpecializationTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(DTST, 0);

	  // No existing node: allocate one in this context (the placement form takes
	  // the context by reference, as elsewhere in this file) and register it.
	  auto *DTST = new (*this, TypeAlignment)
	      DeducedTemplateSpecializationType(Template, DeducedType, IsDependent);
	  Types.push_back(DTST);
	  if (InsertPos)
	    DeducedTemplateSpecializationTypes.InsertNode(DTST, InsertPos);
	  return QualType(DTST, 0);
	}

	/// getAtomicType - Return the uniqued reference to the atomic type for
	/// the given value type.
	///
	/// \param T the value type to wrap.
	/// \returns the existing node for \p T if one is already in the folding set,
	/// otherwise a newly created and registered one.
	QualType ASTContext::getAtomicType(QualType T) const {
	  // Unique pointers, to guarantee there is only one pointer of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  AtomicType::Profile(ID, T);

	  void *InsertPos = nullptr;
	  if (AtomicType *AT = AtomicTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(AT, 0);

	  // If the atomic value type isn't canonical, this won't be a canonical type
	  // either, so fill in the canonical type field.
	  QualType Canonical;
	  if (!T.isCanonical()) {
	    Canonical = getAtomicType(getCanonicalType(T));

	    // Get the new insert position for the node we care about; the recursive
	    // call above may have inserted into the set.
	    AtomicType *NewIP = AtomicTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }
	  // Allocate in this context; the placement form takes the context by
	  // reference, as elsewhere in this file.
	  auto *New = new (*this, TypeAlignment) AtomicType(T, Canonical);
	  Types.push_back(New);
	  AtomicTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// getAutoDeductType - Get type pattern for deducing against 'auto'.
	///
	/// The pattern is created on first use and cached in AutoDeductTy; later
	/// calls return the cached value.
	QualType ASTContext::getAutoDeductType() const {
	  if (AutoDeductTy.isNull())
	    AutoDeductTy = QualType(new (*this, TypeAlignment)
	                                AutoType(QualType(), AutoTypeKeyword::Auto,
	                                         TypeDependence::None,
	                                         /*concept*/ nullptr, /*args*/ {}),
	                            0);
	  return AutoDeductTy;
	}

	/// getAutoRRefDeductType - Get type pattern for deducing against 'auto &&'.
	///
	/// The pattern is built once from the 'auto' deduction pattern and then
	/// cached in AutoRRefDeductTy.
	QualType ASTContext::getAutoRRefDeductType() const {
	  if (AutoRRefDeductTy.isNull()) {
	    QualType AutoPattern = getAutoDeductType();
	    AutoRRefDeductTy = getRValueReferenceType(AutoPattern);
	  }
	  assert(!AutoRRefDeductTy.isNull() && "can't build 'auto &&' pattern");
	  return AutoRRefDeductTy;
	}

	/// getTagDeclType - Return the unique reference to the type for the
	/// specified TagDecl (struct/union/class/enum) decl.
	///
	/// \param Decl the tag declaration; must be non-null (asserted).
	QualType ASTContext::getTagDeclType(const TagDecl *Decl) const {
	  assert(Decl);
	  // FIXME: What is the design on getTagDeclType when it requires casting
	  // away const? mutable?
	  auto *MutableDecl = const_cast<TagDecl*>(Decl);
	  return getTypeDeclType(MutableDecl);
	}

	/// getSizeType - Return the unique type for "size_t" (C99 7.17), the result
	/// of the sizeof operator (C99 6.5.3.4p4). The value is target dependent and
	/// needs to agree with the definition in <stddef.h>.
	CanQualType ASTContext::getSizeType() const {
	  // Ask the target which integer kind it uses, then map it to a type.
	  const auto SizeKind = Target->getSizeType();
	  return getFromTargetType(SizeKind);
	}

	/// Return the unique signed counterpart of the integer type
	/// corresponding to size_t.
	CanQualType ASTContext::getSignedSizeType() const {
	  // Ask the target which integer kind it uses, then map it to a type.
	  const auto SignedSizeKind = Target->getSignedSizeType();
	  return getFromTargetType(SignedSizeKind);
	}

	/// getIntMaxType - Return the unique type for "intmax_t" (C99 7.18.1.5).
	CanQualType ASTContext::getIntMaxType() const {
	  // Ask the target which integer kind it uses, then map it to a type.
	  const auto IntMaxKind = Target->getIntMaxType();
	  return getFromTargetType(IntMaxKind);
	}

	/// getUIntMaxType - Return the unique type for "uintmax_t" (C99 7.18.1.5).
	CanQualType ASTContext::getUIntMaxType() const {
	  // Ask the target which integer kind it uses, then map it to a type.
	  const auto UIntMaxKind = Target->getUIntMaxType();
	  return getFromTargetType(UIntMaxKind);
	}

	/// getSignedWCharType - Return the type of "signed wchar_t".
	/// Used when in C++, as a GCC extension.
	QualType ASTContext::getSignedWCharType() const {
	  // FIXME: derive from "Target" ?
	  QualType SignedWChar = WCharTy;
	  return SignedWChar;
	}

	/// getUnsignedWCharType - Return the type of "unsigned wchar_t".
	/// Used when in C++, as a GCC extension.
	QualType ASTContext::getUnsignedWCharType() const {
	  // FIXME: derive from "Target" ?
	  QualType UnsignedWChar = UnsignedIntTy;
	  return UnsignedWChar;
	}

	/// Return the type the target reports as its intptr type.
	QualType ASTContext::getIntPtrType() const {
	  const auto IntPtrKind = Target->getIntPtrType();
	  return getFromTargetType(IntPtrKind);
	}

	/// Return the unsigned type corresponding to getIntPtrType().
	QualType ASTContext::getUIntPtrType() const {
	  QualType SignedPtrTy = getIntPtrType();
	  return getCorrespondingUnsignedType(SignedPtrTy);
	}

	/// getPointerDiffType - Return the unique type for "ptrdiff_t" (C99 7.17)
	/// defined in <stddef.h>. Pointer - pointer requires this (C99 6.5.6p9).
	QualType ASTContext::getPointerDiffType() const {
	  // The target is queried with argument 0.
	  const auto PtrDiffKind = Target->getPtrDiffType(0);
	  return getFromTargetType(PtrDiffKind);
	}

	/// Return the unique unsigned counterpart of "ptrdiff_t"
	/// integer type. The standard (C11 7.21.6.1p7) refers to this type
	/// in the definition of %tu format specifier.
	QualType ASTContext::getUnsignedPointerDiffType() const {
	  // The target is queried with argument 0.
	  const auto UnsignedPtrDiffKind = Target->getUnsignedPtrDiffType(0);
	  return getFromTargetType(UnsignedPtrDiffKind);
	}

	/// Return the unique type for "pid_t" defined in
	/// <sys/types.h>. We need this to compute the correct type for vfork().
	QualType ASTContext::getProcessIDType() const {
	  const auto PidKind = Target->getProcessIDType();
	  return getFromTargetType(PidKind);
	}

	//===----------------------------------------------------------------------===//
	// Type Operators
	//===----------------------------------------------------------------------===//

	/// Compute the canonical form of a parameter type: canonicalize \p T, decay
	/// variable arrays, drop the remaining top-level qualifiers, and turn array
	/// and function types into the corresponding pointer types.
	CanQualType ASTContext::getCanonicalParamType(QualType T) const {
	  // Push qualifiers into arrays, and then discard any remaining
	  // qualifiers.
	  QualType Canon = getCanonicalType(T);
	  Canon = getVariableArrayDecayedType(Canon);

	  const Type *Bare = Canon.getTypePtr();
	  QualType Unqualified(Bare, 0);

	  // Arrays decay to a pointer to their element type.
	  if (isa<ArrayType>(Bare))
	    return CanQualType::CreateUnsafe(getArrayDecayedType(Unqualified));

	  // Functions become pointers to function.
	  if (isa<FunctionType>(Bare))
	    return CanQualType::CreateUnsafe(getPointerType(Unqualified));

	  // Everything else is used as-is, minus qualifiers.
	  return CanQualType::CreateUnsafe(Unqualified);
	}

	/// Strip qualifiers from \p type, looking through array types so that
	/// qualifiers on the element type are removed as well.
	///
	/// \param type the type to unqualify.
	/// \param quals [out] receives the qualifiers that were removed.
	/// \returns the unqualified type. When stripping changed an array's element
	/// type, a new array type is built around the unqualified element type.
	QualType ASTContext::getUnqualifiedArrayType(QualType type,
	Qualifiers &quals) {
	SplitQualType splitType = type.getSplitUnqualifiedType();

	// FIXME: getSplitUnqualifiedType() actually walks all the way to
	// the unqualified desugared type and then drops it on the floor.
	// We then have to strip that sugar back off with
	// getUnqualifiedDesugaredType(), which is silly.
	const auto *AT =
	dyn_cast<ArrayType>(splitType.Ty->getUnqualifiedDesugaredType());

	// If we don't have an array, just use the results in splitType.
	if (!AT) {
	quals = splitType.Quals;
	return QualType(splitType.Ty, 0);
	}

	// Otherwise, recurse on the array's element type.
	QualType elementType = AT->getElementType();
	QualType unqualElementType = getUnqualifiedArrayType(elementType, quals);

	// If that didn't change the element type, AT has no qualifiers, so we
	// can just use the results in splitType.
	if (elementType == unqualElementType) {
	assert(quals.empty()); // from the recursive call
	quals = splitType.Quals;
	return QualType(splitType.Ty, 0);
	}

	// Otherwise, add in the qualifiers from the outermost type, then
	// build the type back up.
	quals.addConsistentQualifiers(splitType.Quals);

	// Rebuild the same kind of array around the unqualified element type.
	// Constant, incomplete and dependent-sized arrays are rebuilt with index
	// type qualifiers 0; variable arrays keep theirs.
	if (const auto *CAT = dyn_cast<ConstantArrayType>(AT)) {
	return getConstantArrayType(unqualElementType, CAT->getSize(),
	CAT->getSizeExpr(), CAT->getSizeModifier(), 0);
	}

	if (const auto *IAT = dyn_cast<IncompleteArrayType>(AT)) {
	return getIncompleteArrayType(unqualElementType, IAT->getSizeModifier(), 0);
	}

	if (const auto *VAT = dyn_cast<VariableArrayType>(AT)) {
	return getVariableArrayType(unqualElementType,
	VAT->getSizeExpr(),
	VAT->getSizeModifier(),
	VAT->getIndexTypeCVRQualifiers(),
	VAT->getBracketsRange());
	}

	// The only remaining kind handled here is a dependent-sized array; it is
	// rebuilt with an empty source range.
	const auto *DSAT = cast<DependentSizedArrayType>(AT);
	return getDependentSizedArrayType(unqualElementType, DSAT->getSizeExpr(),
	DSAT->getSizeModifier(), 0,
	SourceRange());
	}

	/// Attempt to unwrap two types that may both be array types with the same bound
	/// (or both be array types of unknown bound) for the purpose of comparing the
	/// cv-decomposition of two types per C++ [conv.qual].
	///
	/// \param T1 [in,out] first type; replaced by its element type for every layer
	/// that is unwrapped.
	/// \param T2 [in,out] second type; updated in lockstep with \p T1.
	/// \returns true if at least one array layer was removed from both types.
	bool ASTContext::UnwrapSimilarArrayTypes(QualType &T1, QualType &T2) {
	  bool UnwrappedAny = false;
	  while (true) {
	    auto *AT1 = getAsArrayType(T1);
	    if (!AT1) return UnwrappedAny;

	    auto *AT2 = getAsArrayType(T2);
	    if (!AT2) return UnwrappedAny;

	    // If we don't have two array types with the same constant bound nor two
	    // incomplete array types, we've unwrapped everything we can.
	    if (auto *CAT1 = dyn_cast<ConstantArrayType>(AT1)) {
	      auto *CAT2 = dyn_cast<ConstantArrayType>(AT2);
	      if (!CAT2 || CAT1->getSize() != CAT2->getSize())
	        return UnwrappedAny;
	    } else if (!isa<IncompleteArrayType>(AT1) ||
	               !isa<IncompleteArrayType>(AT2)) {
	      return UnwrappedAny;
	    }

	    T1 = AT1->getElementType();
	    T2 = AT2->getElementType();
	    UnwrappedAny = true;
	  }
	}

	/// Attempt to unwrap two types that may be similar (C++ [conv.qual]).
	///
	/// If T1 and T2 are both pointer types of the same kind, or both array types
	/// with the same bound, unwraps layers from T1 and T2 until a pointer type is
	/// unwrapped. Top-level qualifiers on T1 and T2 are ignored.
	///
	/// This function will typically be called in a loop that successively
	/// "unwraps" pointer and pointer-to-member types to compare them at each
	/// level.
	///
	/// \return \c true if a pointer type was unwrapped, \c false if we reached a
	/// pair of types that can't be unwrapped further.
	bool ASTContext::UnwrapSimilarTypes(QualType &T1, QualType &T2) {
	  // Peel matching array layers first; the result of that step is not used.
	  UnwrapSimilarArrayTypes(T1, T2);

	  // Pointer / pointer.
	  if (const auto *Ptr1 = T1->getAs<PointerType>()) {
	    if (const auto *Ptr2 = T2->getAs<PointerType>()) {
	      T1 = Ptr1->getPointeeType();
	      T2 = Ptr2->getPointeeType();
	      return true;
	    }
	  }

	  // Member pointer / member pointer, provided both name the same class.
	  if (const auto *MemPtr1 = T1->getAs<MemberPointerType>()) {
	    const auto *MemPtr2 = T2->getAs<MemberPointerType>();
	    if (MemPtr2 &&
	        hasSameUnqualifiedType(QualType(MemPtr1->getClass(), 0),
	                               QualType(MemPtr2->getClass(), 0))) {
	      T1 = MemPtr1->getPointeeType();
	      T2 = MemPtr2->getPointeeType();
	      return true;
	    }
	  }

	  // Objective-C object pointers are only considered when ObjC is enabled.
	  if (getLangOpts().ObjC) {
	    if (const auto *ObjCPtr1 = T1->getAs<ObjCObjectPointerType>()) {
	      if (const auto *ObjCPtr2 = T2->getAs<ObjCObjectPointerType>()) {
	        T1 = ObjCPtr1->getPointeeType();
	        T2 = ObjCPtr2->getPointeeType();
	        return true;
	      }
	    }
	  }

	  // FIXME: Block pointers, too?

	  return false;
	}

	/// Determine whether \p T1 and \p T2 become the same type after repeatedly
	/// dropping qualifiers (including those on array elements) and unwrapping
	/// matching pointer-like layers via UnwrapSimilarTypes().
	bool ASTContext::hasSimilarType(QualType T1, QualType T2) {
	  do {
	    // The removed qualifiers are not inspected; one sink serves both calls.
	    Qualifiers Discarded;
	    T1 = getUnqualifiedArrayType(T1, Discarded);
	    T2 = getUnqualifiedArrayType(T2, Discarded);
	    if (hasSameType(T1, T2))
	      return true;
	  } while (UnwrapSimilarTypes(T1, T2));

	  // Nothing further can be unwrapped and the types still differ.
	  return false;
	}

	/// Like hasSimilarType(), but at every level the qualifiers removed from the
	/// two types must agree once their CVR qualifiers have been dropped.
	bool ASTContext::hasCvrSimilarType(QualType T1, QualType T2) {
	  do {
	    Qualifiers Removed1, Removed2;
	    T1 = getUnqualifiedArrayType(T1, Removed1);
	    T2 = getUnqualifiedArrayType(T2, Removed2);

	    // Only the non-CVR part of the qualifiers has to match.
	    Removed1.removeCVRQualifiers();
	    Removed2.removeCVRQualifiers();
	    if (Removed1 != Removed2)
	      return false;

	    if (hasSameType(T1, T2))
	      return true;
	  } while (UnwrapSimilarTypes(T1, T2));

	  return false;
	}

	/// Build the declaration name (with location \p NameLoc) that corresponds to
	/// the template name \p Name, dispatching on the kind of template name.
	DeclarationNameInfo
	ASTContext::getNameForTemplate(TemplateName Name,
	                               SourceLocation NameLoc) const {
	  switch (Name.getKind()) {
	  case TemplateName::QualifiedTemplate:
	  case TemplateName::Template: {
	    // DNInfo work in progress: CHECKME: what about DNLoc?
	    TemplateDecl *TD = Name.getAsTemplateDecl();
	    return DeclarationNameInfo(TD->getDeclName(), NameLoc);
	  }

	  case TemplateName::OverloadedTemplate: {
	    // The name is taken from the first entry of the overload set.
	    OverloadedTemplateStorage *Overloads = Name.getAsOverloadedTemplate();
	    // DNInfo work in progress: CHECKME: what about DNLoc?
	    return DeclarationNameInfo((*Overloads->begin())->getDeclName(), NameLoc);
	  }

	  case TemplateName::AssumedTemplate: {
	    AssumedTemplateStorage *Assumed = Name.getAsAssumedTemplateName();
	    return DeclarationNameInfo(Assumed->getDeclName(), NameLoc);
	  }

	  case TemplateName::DependentTemplate: {
	    DependentTemplateName *Dependent = Name.getAsDependentTemplateName();
	    if (Dependent->isIdentifier()) {
	      DeclarationName IdentName =
	          DeclarationNames.getIdentifier(Dependent->getIdentifier());
	      return DeclarationNameInfo(IdentName, NameLoc);
	    }

	    // Otherwise the dependent name is an overloaded operator.
	    DeclarationName OpName =
	        DeclarationNames.getCXXOperatorName(Dependent->getOperator());
	    // DNInfo work in progress: FIXME: source locations?
	    DeclarationNameLoc OpLoc;
	    OpLoc.CXXOperatorName.BeginOpNameLoc = SourceLocation().getRawEncoding();
	    OpLoc.CXXOperatorName.EndOpNameLoc = SourceLocation().getRawEncoding();
	    return DeclarationNameInfo(OpName, NameLoc, OpLoc);
	  }

	  case TemplateName::SubstTemplateTemplateParm: {
	    SubstTemplateTemplateParmStorage *Subst =
	        Name.getAsSubstTemplateTemplateParm();
	    return DeclarationNameInfo(Subst->getParameter()->getDeclName(), NameLoc);
	  }

	  case TemplateName::SubstTemplateTemplateParmPack: {
	    SubstTemplateTemplateParmPackStorage *SubstPack =
	        Name.getAsSubstTemplateTemplateParmPack();
	    return DeclarationNameInfo(SubstPack->getParameterPack()->getDeclName(),
	                               NameLoc);
	  }
	  }

	  llvm_unreachable("bad template name kind!");
	}

	/// Return the canonical form of the template name \p Name.
	///
	/// Overloaded and assumed template names cannot be canonicalized and are
	/// treated as unreachable.
	TemplateName ASTContext::getCanonicalTemplateName(TemplateName Name) const {
	  switch (Name.getKind()) {
	  case TemplateName::QualifiedTemplate:
	  case TemplateName::Template: {
	    TemplateDecl *TD = Name.getAsTemplateDecl();
	    // Template template parameters are first mapped to their canonical
	    // parameter declaration.
	    if (auto *Param = dyn_cast<TemplateTemplateParmDecl>(TD))
	      TD = getCanonicalTemplateTemplateParmDecl(Param);

	    // The canonical template name is the canonical template declaration.
	    auto *CanonTD = cast<TemplateDecl>(TD->getCanonicalDecl());
	    return TemplateName(CanonTD);
	  }

	  case TemplateName::OverloadedTemplate:
	  case TemplateName::AssumedTemplate:
	    llvm_unreachable("cannot canonicalize unresolved template");

	  case TemplateName::DependentTemplate: {
	    // Dependent names carry their canonical name with them.
	    DependentTemplateName *Dependent = Name.getAsDependentTemplateName();
	    assert(Dependent &&
	           "Non-dependent template names must refer to template decls.");
	    return Dependent->CanonicalTemplateName;
	  }

	  case TemplateName::SubstTemplateTemplateParm: {
	    // Canonicalize whatever the parameter was replaced with.
	    SubstTemplateTemplateParmStorage *Subst =
	        Name.getAsSubstTemplateTemplateParm();
	    return getCanonicalTemplateName(Subst->getReplacement());
	  }

	  case TemplateName::SubstTemplateTemplateParmPack: {
	    // Canonicalize the parameter and the argument pack separately, then
	    // recombine them.
	    SubstTemplateTemplateParmPackStorage *SubstPack =
	        Name.getAsSubstTemplateTemplateParmPack();
	    TemplateTemplateParmDecl *CanonParam =
	        getCanonicalTemplateTemplateParmDecl(SubstPack->getParameterPack());
	    TemplateArgument CanonPack =
	        getCanonicalTemplateArgument(SubstPack->getArgumentPack());
	    return getSubstTemplateTemplateParmPack(CanonParam, CanonPack);
	  }
	  }

	  llvm_unreachable("bad template name!");
	}

	/// Determine whether \p X and \p Y have the same canonical template name, by
	/// comparing the opaque pointers of their canonical forms.
	bool ASTContext::hasSameTemplateName(TemplateName X, TemplateName Y) {
	  TemplateName CanonX = getCanonicalTemplateName(X);
	  TemplateName CanonY = getCanonicalTemplateName(Y);
	  return CanonX.getAsVoidPointer() == CanonY.getAsVoidPointer();
	}

	/// Return the canonical form of the template argument \p Arg.
	///
	/// Null and expression arguments, and empty packs, are returned unchanged.
	/// All other kinds are rebuilt from the canonical form of their type,
	/// declaration or template name; packs are canonicalized element by element.
	TemplateArgument
	ASTContext::getCanonicalTemplateArgument(const TemplateArgument &Arg) const {
	  switch (Arg.getKind()) {
	  case TemplateArgument::Null:
	    return Arg;

	  case TemplateArgument::Expression:
	    return Arg;

	  case TemplateArgument::Declaration: {
	    auto *D = cast<ValueDecl>(Arg.getAsDecl()->getCanonicalDecl());
	    return TemplateArgument(D, Arg.getParamTypeForDecl());
	  }

	  case TemplateArgument::NullPtr:
	    return TemplateArgument(getCanonicalType(Arg.getNullPtrType()),
	                            /*isNullPtr*/true);

	  case TemplateArgument::Template:
	    return TemplateArgument(getCanonicalTemplateName(Arg.getAsTemplate()));

	  case TemplateArgument::TemplateExpansion:
	    return TemplateArgument(getCanonicalTemplateName(
	                                Arg.getAsTemplateOrTemplatePattern()),
	                            Arg.getNumTemplateExpansions());

	  case TemplateArgument::Integral:
	    return TemplateArgument(Arg, getCanonicalType(Arg.getIntegralType()));

	  case TemplateArgument::Type:
	    return TemplateArgument(getCanonicalType(Arg.getAsType()));

	  case TemplateArgument::Pack: {
	    if (Arg.pack_size() == 0)
	      return Arg;

	    // Allocate the canonical element array in this context; the placement
	    // form takes the context by reference.
	    auto *CanonArgs = new (*this) TemplateArgument[Arg.pack_size()];
	    unsigned Idx = 0;
	    for (TemplateArgument::pack_iterator A = Arg.pack_begin(),
	                                         AEnd = Arg.pack_end();
	         A != AEnd; (void)++A, ++Idx)
	      CanonArgs[Idx] = getCanonicalTemplateArgument(*A);

	    return TemplateArgument(llvm::makeArrayRef(CanonArgs, Arg.pack_size()));
	  }
	  }

	  // Silence GCC warning
	  llvm_unreachable("Unhandled template argument kind");
	}

	/// Return the canonical form of the nested-name-specifier \p NNS, or null if
	/// \p NNS is null.
	NestedNameSpecifier *
	ASTContext::getCanonicalNestedNameSpecifier(NestedNameSpecifier *NNS) const {
	  if (!NNS)
	    return nullptr;

	  switch (NNS->getKind()) {
	  case NestedNameSpecifier::Identifier: {
	    // Canonicalize the prefix but keep the identifier the same.
	    NestedNameSpecifier *CanonPrefix =
	        getCanonicalNestedNameSpecifier(NNS->getPrefix());
	    return NestedNameSpecifier::Create(*this, CanonPrefix,
	                                       NNS->getAsIdentifier());
	  }

	  case NestedNameSpecifier::Namespace: {
	    // A namespace is canonical; build a nested-name-specifier with
	    // this namespace and no prefix.
	    auto *OriginalNS = NNS->getAsNamespace()->getOriginalNamespace();
	    return NestedNameSpecifier::Create(*this, nullptr, OriginalNS);
	  }

	  case NestedNameSpecifier::NamespaceAlias: {
	    // An alias is replaced by the original declaration of the namespace it
	    // names, again with no prefix.
	    auto *AliasedNS =
	        NNS->getAsNamespaceAlias()->getNamespace()->getOriginalNamespace();
	    return NestedNameSpecifier::Create(*this, nullptr, AliasedNS);
	  }

	  case NestedNameSpecifier::TypeSpec:
	  case NestedNameSpecifier::TypeSpecWithTemplate: {
	    QualType CanonTy = getCanonicalType(QualType(NNS->getAsType(), 0));

	    // If we have some kind of dependent-named type (e.g., "typename T::type"),
	    // break it apart into its prefix and identifier, then reconstitute those
	    // as the canonical nested-name-specifier. This is required to canonicalize
	    // a dependent nested-name-specifier involving typedefs of dependent-name
	    // types, e.g.,
	    //   typedef typename T::type T1;
	    //   typedef typename T1::type T2;
	    if (const auto *DepName = CanonTy->getAs<DependentNameType>())
	      return NestedNameSpecifier::Create(
	          *this, DepName->getQualifier(),
	          const_cast<IdentifierInfo *>(DepName->getIdentifier()));

	    // Otherwise, just canonicalize the type, and force it to be a TypeSpec.
	    // FIXME: Why are TypeSpec and TypeSpecWithTemplate distinct in the
	    // first place?
	    return NestedNameSpecifier::Create(*this, nullptr, false,
	                                       const_cast<Type *>(CanonTy.getTypePtr()));
	  }

	  case NestedNameSpecifier::Global:
	  case NestedNameSpecifier::Super:
	    // The global specifier and __super specifier are canonical and unique.
	    return NNS;
	  }

	  llvm_unreachable("Invalid NestedNameSpecifier::Kind!");
	}

	/// Return \p T as an array type, or null if its canonical type is not an
	/// array type.
	///
	/// Qualifiers found on the array type are pushed down into the element type,
	/// which may require building a new array type.
	const ArrayType *ASTContext::getAsArrayType(QualType T) const {
	  // Handle the non-qualified case efficiently.
	  if (!T.hasLocalQualifiers()) {
	    // Handle the common positive case fast.
	    if (const auto *AT = dyn_cast<ArrayType>(T))
	      return AT;
	  }

	  // Handle the common negative case fast.
	  if (!isa<ArrayType>(T.getCanonicalType()))
	    return nullptr;

	  // Apply any qualifiers from the array type to the element type. This
	  // implements C99 6.7.3p8: "If the specification of an array type includes
	  // any type qualifiers, the element type is so qualified, not the array type."

	  // If we get here, we either have type qualifiers on the type, or we have
	  // sugar such as a typedef in the way. If we have type qualifiers on the type
	  // we must propagate them down into the element type.

	  SplitQualType split = T.getSplitDesugaredType();
	  Qualifiers qs = split.Quals;

	  // If we have a simple case, just return now.
	  const auto *ATy = dyn_cast<ArrayType>(split.Ty);
	  if (!ATy || qs.empty())
	    return ATy;

	  // Otherwise, we have an array and we have qualifiers on it. Push the
	  // qualifiers into the array element type and return a new array type.
	  QualType NewEltTy = getQualifiedType(ATy->getElementType(), qs);

	  if (const auto *CAT = dyn_cast<ConstantArrayType>(ATy))
	    return cast<ArrayType>(getConstantArrayType(NewEltTy, CAT->getSize(),
	                                                CAT->getSizeExpr(),
	                                                CAT->getSizeModifier(),
	                                                CAT->getIndexTypeCVRQualifiers()));
	  if (const auto *IAT = dyn_cast<IncompleteArrayType>(ATy))
	    return cast<ArrayType>(getIncompleteArrayType(NewEltTy,
	                                                  IAT->getSizeModifier(),
	                                                  IAT->getIndexTypeCVRQualifiers()));

	  if (const auto *DSAT = dyn_cast<DependentSizedArrayType>(ATy))
	    return cast<ArrayType>(
	        getDependentSizedArrayType(NewEltTy,
	                                   DSAT->getSizeExpr(),
	                                   DSAT->getSizeModifier(),
	                                   DSAT->getIndexTypeCVRQualifiers(),
	                                   DSAT->getBracketsRange()));

	  // The only remaining kind handled here is a variable array.
	  const auto *VAT = cast<VariableArrayType>(ATy);
	  return cast<ArrayType>(getVariableArrayType(NewEltTy,
	                                              VAT->getSizeExpr(),
	                                              VAT->getSizeModifier(),
	                                              VAT->getIndexTypeCVRQualifiers(),
	                                              VAT->getBracketsRange()));
	}

	/// Adjust a parameter type: array and function types are replaced by their
	/// decayed type; any other type is returned unchanged.
	QualType ASTContext::getAdjustedParameterType(QualType T) const {
	  if (T->isArrayType() || T->isFunctionType())
	    return getDecayedType(T);
	  return T;
	}

	/// Compute the type a parameter of type \p T has in a function signature:
	/// variable arrays are decayed, the parameter adjustment is applied, and the
	/// result is returned without qualifiers.
	QualType ASTContext::getSignatureParameterType(QualType T) const {
	  QualType Decayed = getVariableArrayDecayedType(T);
	  QualType Adjusted = getAdjustedParameterType(Decayed);
	  return Adjusted.getUnqualifiedType();
	}

	/// Compute the type of the exception object for a throw operand of type
	/// \p T: decay arrays and functions, then drop qualifiers.
	QualType ASTContext::getExceptionObjectType(QualType T) const {
	  // C++ [except.throw]p3:
	  //   A throw-expression initializes a temporary object, called the exception
	  //   object, the type of which is determined by removing any top-level
	  //   cv-qualifiers from the static type of the operand of throw and adjusting
	  //   the type from "array of T" or "function returning T" to "pointer to T"
	  //   or "pointer to function returning T", [...]
	  T = getVariableArrayDecayedType(T);
	  if (T->isArrayType() || T->isFunctionType())
	    T = getDecayedType(T);
	  return T.getUnqualifiedType();
	}

	/// getArrayDecayedType - Return the properly qualified result of decaying the
	/// specified array type to a pointer. This operation is non-trivial when
	/// handling typedefs etc. The canonical type of "T" must be an array type,
	/// this returns a pointer to a properly qualified element of the array.
	///
	/// See C99 6.7.5.3p7 and C99 6.3.2.1p3.
	QualType ASTContext::getArrayDecayedType(QualType Ty) const {
	  // Go through 'getAsArrayType' so typedefs in the element type survive and
	  // qualifiers on the array are moved onto the element type (C99 6.7.3p8).
	  const ArrayType *SugaredArray = getAsArrayType(Ty);
	  assert(SugaredArray && "Not an array type!");

	  QualType ElementPtr = getPointerType(SugaredArray->getElementType());

	  // int x[restrict 4] ->  int *restrict
	  QualType Decayed =
	      getQualifiedType(ElementPtr, SugaredArray->getIndexTypeQualifiers());

	  // int x[_Nullable] -> int * _Nullable
	  auto Nullability = Ty->getNullability(*this);
	  if (!Nullability)
	    return Decayed;

	  auto *MutableThis = const_cast<ASTContext *>(this);
	  return MutableThis->getAttributedType(
	      AttributedType::getNullabilityAttrKind(*Nullability), Decayed, Decayed);
	}

	/// Return the base element type of \p array by delegating to the QualType
	/// overload with the array's element type.
	QualType ASTContext::getBaseElementType(const ArrayType *array) const {
	  QualType Element = array->getElementType();
	  return getBaseElementType(Element);
	}

	/// Strip every array layer from \p type and return the innermost element
	/// type, qualified with the qualifiers collected from the array layers that
	/// were stripped.
	QualType ASTContext::getBaseElementType(QualType type) const {
	  Qualifiers Collected;
	  for (;;) {
	    SplitQualType Split = type.getSplitDesugaredType();
	    const ArrayType *Layer = Split.Ty->getAsArrayTypeUnsafe();
	    // Not an array any more: apply what was gathered and finish.
	    if (!Layer)
	      return getQualifiedType(type, Collected);

	    type = Layer->getElementType();
	    Collected.addConsistentQualifiers(Split.Quals);
	  }
	}

	/// getConstantArrayElementCount - Returns number of constant array elements.
	///
	/// The sizes of \p CA and of each directly nested constant array are
	/// multiplied together.
	uint64_t
	ASTContext::getConstantArrayElementCount(const ConstantArrayType *CA) const {
	  uint64_t Total = 1;
	  const ConstantArrayType *Level = CA;
	  do {
	    Total *= Level->getSize().getZExtValue();
	    // Descend while the element type is itself a constant array.
	    const ArrayType *Inner = Level->getElementType()->getAsArrayTypeUnsafe();
	    Level = dyn_cast_or_null<ConstantArrayType>(Inner);
	  } while (Level);
	  return Total;
	}

	/// getFloatingRank - Return a relative rank for floating point types.
	/// This routine will assert if passed a built-in type that isn't a float.
	static FloatingRank getFloatingRank(QualType T) {
	  // A complex type has the rank of its element type.
	  if (const auto *Complex = T->getAs<ComplexType>())
	    return getFloatingRank(Complex->getElementType());

	  const BuiltinType *Builtin = T->castAs<BuiltinType>();
	  switch (Builtin->getKind()) {
	  case BuiltinType::Float16:
	    return Float16Rank;
	  case BuiltinType::Half:
	    return HalfRank;
	  case BuiltinType::Float:
	    return FloatRank;
	  case BuiltinType::Double:
	    return DoubleRank;
	  case BuiltinType::LongDouble:
	    return LongDoubleRank;
	  case BuiltinType::Float128:
	    return Float128Rank;
	  case BuiltinType::BFloat16:
	    return BFloat16Rank;
	  default:
	    llvm_unreachable("getFloatingRank(): not a floating type");
	  }
	}

	/// getFloatingTypeOfSizeWithinDomain - Returns a real floating
	/// point or a complex type (based on Domain/Size).
	/// 'Size' is a real floating point or complex type whose rank selects the
	/// result; 'Domain' is a real floating point or complex type that selects
	/// whether the result is complex or real.
	QualType ASTContext::getFloatingTypeOfSizeWithinDomain(QualType Size,
	                                                       QualType Domain) const {
	  const FloatingRank Rank = getFloatingRank(Size);

	  // Complex domain: pick the complex type of matching rank.
	  if (Domain->isComplexType()) {
	    switch (Rank) {
	    case BFloat16Rank:
	      llvm_unreachable("Complex bfloat16 is not supported");
	    case Float16Rank:
	    case HalfRank:
	      llvm_unreachable("Complex half is not supported");
	    case FloatRank:
	      return FloatComplexTy;
	    case DoubleRank:
	      return DoubleComplexTy;
	    case LongDoubleRank:
	      return LongDoubleComplexTy;
	    case Float128Rank:
	      return Float128ComplexTy;
	    }
	  }

	  // Real domain: pick the real type of matching rank.
	  assert(Domain->isRealFloatingType() && "Unknown domain!");
	  switch (Rank) {
	  case Float16Rank:
	  case HalfRank:
	    return HalfTy;
	  case BFloat16Rank:
	    return BFloat16Ty;
	  case FloatRank:
	    return FloatTy;
	  case DoubleRank:
	    return DoubleTy;
	  case LongDoubleRank:
	    return LongDoubleTy;
	  case Float128Rank:
	    return Float128Ty;
	  }
	  llvm_unreachable("getFloatingRank(): illegal value for rank");
	}

	/// getFloatingTypeOrder - Compare the rank of the two specified floating
	/// point types, ignoring the domain of the type (i.e. 'double' ==
	/// '_Complex double').  If LHS > RHS, return 1.  If LHS == RHS, return 0. If
	/// LHS < RHS, return -1.
	int ASTContext::getFloatingTypeOrder(QualType LHS, QualType RHS) const {
	  const FloatingRank LeftRank = getFloatingRank(LHS);
	  const FloatingRank RightRank = getFloatingRank(RHS);

	  if (LeftRank > RightRank)
	    return 1;
	  if (LeftRank < RightRank)
	    return -1;
	  return 0;
	}

	/// Compare two floating point types: returns 0 when both map to the same
	/// float semantics object, otherwise the result of getFloatingTypeOrder().
	int ASTContext::getFloatingTypeSemanticOrder(QualType LHS, QualType RHS) const {
	  const auto &LeftSemantics = getFloatTypeSemantics(LHS);
	  const auto &RightSemantics = getFloatTypeSemantics(RHS);
	  // Identity of the semantics objects is what is compared here.
	  if (&LeftSemantics == &RightSemantics)
	    return 0;
	  return getFloatingTypeOrder(LHS, RHS);
	}

	/// getIntegerRank - Return an integer conversion rank (C99 6.3.1.1p1). This
	/// routine will assert if passed a built-in type that isn't an integer or enum,
	/// or if it is not canonicalized.
	unsigned ASTContext::getIntegerRank(const Type *T) const {
	assert(T->isCanonicalUnqualified() && "T should be canonicalized");

	// Results in this 'losing' to any type of the same size, but winning if
	// larger.
	if (const auto *EIT = dyn_cast<ExtIntType>(T))
	return 0 + (EIT->getNumBits() << 3);

	switch (cast<BuiltinType>(T)->getKind()) {
	default: llvm_unreachable("getIntegerRank(): not a built-in integer");
	case BuiltinType::Bool:
	return 1 + (getIntWidth(BoolTy) << 3);
	case BuiltinType::Char_S:
	case BuiltinType::Char_U:
	case BuiltinType::SChar:
	case BuiltinType::UChar:
	return 2 + (getIntWidth(CharTy) << 3);
	case BuiltinType::Short:
	case BuiltinType::UShort:
	return 3 + (getIntWidth(ShortTy) << 3);
	case BuiltinType::Int:
	case BuiltinType::UInt:
	return 4 + (getIntWidth(IntTy) << 3);
	case BuiltinType::Long:
	case BuiltinType::ULong:
	return 5 + (getIntWidth(LongTy) << 3);
	case BuiltinType::LongLong:
	case BuiltinType::ULongLong:
	return 6 + (getIntWidth(LongLongTy) << 3);
	case BuiltinType::Int128:
	case BuiltinType::UInt128:
	return 7 + (getIntWidth(Int128Ty) << 3);
	}
	}

	/// Whether this is a promotable bitfield reference according
	/// to C99 6.3.1.1p2, bullet 2 (and GCC extensions).
	///
	/// \param E the expression to inspect.
	/// \returns the type this bit-field will promote to, or NULL if no
	/// promotion occurs.
	QualType ASTContext::isPromotableBitField(Expr *E) const {
	  // Dependent expressions are never promoted here.
	  if (E->isTypeDependent() || E->isValueDependent())
	    return {};

	  // C++ [conv.prom]p5:
	  //    If the bit-field has an enumerated type, it is treated as any other
	  //    value of that type for promotion purposes.
	  if (getLangOpts().CPlusPlus && E->getType()->isEnumeralType())
	    return {};

	  // FIXME: We should not do this unless E->refersToBitField() is true. This
	  // matters in C where getSourceBitField() will find bit-fields for various
	  // cases where the source expression is not a bit-field designator.

	  FieldDecl *Field = E->getSourceBitField(); // FIXME: conditional bit-fields?
	  if (!Field)
	    return {};

	  QualType FT = Field->getType();

	  uint64_t BitWidth = Field->getBitWidthValue(*this);
	  uint64_t IntSize = getTypeSize(IntTy);
	  // C++ [conv.prom]p5:
	  //   A prvalue for an integral bit-field can be converted to a prvalue of type
	  //   int if int can represent all the values of the bit-field; otherwise, it
	  //   can be converted to unsigned int if unsigned int can represent all the
	  //   values of the bit-field. If the bit-field is larger yet, no integral
	  //   promotion applies to it.
	  // C11 6.3.1.1/2:
	  //   [For a bit-field of type _Bool, int, signed int, or unsigned int:]
	  //   If an int can represent all values of the original type (as restricted by
	  //   the width, for a bit-field), the value is converted to an int; otherwise,
	  //   it is converted to an unsigned int.
	  //
	  // FIXME: C does not permit promotion of a 'long : 3' bitfield to int.
	  //        We perform that promotion here to match GCC and C++.
	  // FIXME: C does not permit promotion of an enum bit-field whose rank is
	  //        greater than that of 'int'. We perform that promotion to match GCC.
	  if (BitWidth < IntSize)
	    return IntTy;

	  if (BitWidth == IntSize)
	    return FT->isSignedIntegerType() ? IntTy : UnsignedIntTy;

	  // Bit-fields wider than int are not subject to promotions, and therefore act
	  // like the base type. GCC has some weird bugs in this area that we
	  // deliberately do not follow (GCC follows a pre-standard resolution to
	  // C's DR315 which treats bit-width as being part of the type, and this leaks
	  // into their semantics in some cases).
	  return {};
	}

	/// getPromotedIntegerType - Returns the type that Promotable will
	/// promote to: C99 6.3.1.1p2, assuming that Promotable is a promotable
	/// integer type.
	///
	/// \param Promotable a non-null promotable integer type (both asserted).
	QualType ASTContext::getPromotedIntegerType(QualType Promotable) const {
	  assert(!Promotable.isNull());
	  assert(Promotable->isPromotableIntegerType());
	  // Enumerations carry their own promotion type.
	  if (const auto *ET = Promotable->getAs<EnumType>())
	    return ET->getDecl()->getPromotionType();

	  if (const auto *BT = Promotable->getAs<BuiltinType>()) {
	    // C++ [conv.prom]: A prvalue of type char16_t, char32_t, or wchar_t
	    // (3.9.1) can be converted to a prvalue of the first of the following
	    // types that can represent all the values of its underlying type:
	    // int, unsigned int, long int, unsigned long int, long long int, or
	    // unsigned long long int [...]
	    // FIXME: Is there some better way to compute this?
	    if (BT->getKind() == BuiltinType::WChar_S ||
	        BT->getKind() == BuiltinType::WChar_U ||
	        BT->getKind() == BuiltinType::Char8 ||
	        BT->getKind() == BuiltinType::Char16 ||
	        BT->getKind() == BuiltinType::Char32) {
	      // Of these kinds only WChar_S is treated as signed.
	      bool FromIsSigned = BT->getKind() == BuiltinType::WChar_S;
	      uint64_t FromSize = getTypeSize(BT);
	      QualType PromoteTypes[] = { IntTy, UnsignedIntTy, LongTy, UnsignedLongTy,
	                                  LongLongTy, UnsignedLongLongTy };
	      // Take the first candidate that is strictly larger, or that has the
	      // same size and the same signedness.
	      for (const QualType &To : PromoteTypes) {
	        uint64_t ToSize = getTypeSize(To);
	        if (FromSize < ToSize ||
	            (FromSize == ToSize &&
	             FromIsSigned == To->isSignedIntegerType()))
	          return To;
	      }
	      llvm_unreachable("char type should fit into long long");
	    }
	  }

	  // At this point, we should have a signed or unsigned integer type.
	  if (Promotable->isSignedIntegerType())
	    return IntTy;
	  uint64_t PromotableSize = getIntWidth(Promotable);
	  uint64_t IntSize = getIntWidth(IntTy);
	  assert(Promotable->isUnsignedIntegerType() && PromotableSize <= IntSize);
	  // An unsigned type as wide as int promotes to unsigned int; a narrower one
	  // promotes to int.
	  return (PromotableSize != IntSize) ? IntTy : UnsignedIntTy;
	}

	/// Recurses in pointer/array types until it finds an objc retainable
	/// type and returns its ownership.
	///
	/// \returns the first lifetime qualifier other than OCL_None found while
	/// walking through array, pointer and reference types; OCL_None otherwise.
	Qualifiers::ObjCLifetime ASTContext::getInnerObjCOwnership(QualType T) const {
	  while (!T.isNull()) {
	    const Qualifiers::ObjCLifetime Lifetime = T.getObjCLifetime();
	    if (Lifetime != Qualifiers::OCL_None)
	      return Lifetime;

	    // Step one layer inwards, or stop if there is nothing to look through.
	    if (T->isArrayType()) {
	      T = getBaseElementType(T);
	      continue;
	    }
	    if (const auto *Ptr = T->getAs<PointerType>()) {
	      T = Ptr->getPointeeType();
	      continue;
	    }
	    if (const auto *Ref = T->getAs<ReferenceType>()) {
	      T = Ref->getPointeeType();
	      continue;
	    }
	    return Qualifiers::OCL_None;
	  }

	  return Qualifiers::OCL_None;
	}

	/// Returns the underlying integer type of \p ET when its declaration is
	/// complete and not scoped; returns nullptr otherwise.
	static const Type *getIntegerTypeForEnum(const EnumType *ET) {
	  // Incomplete enum types are not treated as integer types.
	  // FIXME: In C++, enum types are never integer types.
	  const EnumDecl *ED = ET->getDecl();
	  if (!ED->isComplete() || ED->isScoped())
	    return nullptr;
	  return ED->getIntegerType().getTypePtr();
	}

	/// getIntegerTypeOrder - Orders two integer types by conversion rank
	/// (C99 6.3.1.8p1). Returns 1 if LHS wins, -1 if RHS wins, and 0 if the
	/// canonical types are identical or have the same signedness and rank.
	/// With mixed signedness the unsigned type wins unless the signed type has
	/// a strictly greater rank.
	int ASTContext::getIntegerTypeOrder(QualType LHS, QualType RHS) const {
	  const Type *LHSC = getCanonicalType(LHS).getTypePtr();
	  const Type *RHSC = getCanonicalType(RHS).getTypePtr();

	  // Enums are compared through their underlying integer type.
	  if (const auto *LHSEnum = dyn_cast<EnumType>(LHSC))
	    LHSC = getIntegerTypeForEnum(LHSEnum);
	  if (const auto *RHSEnum = dyn_cast<EnumType>(RHSC))
	    RHSC = getIntegerTypeForEnum(RHSEnum);

	  if (LHSC == RHSC)
	    return 0;

	  const bool LHSUnsigned = LHSC->isUnsignedIntegerType();
	  const bool RHSUnsigned = RHSC->isUnsignedIntegerType();

	  const unsigned LHSRank = getIntegerRank(LHSC);
	  const unsigned RHSRank = getIntegerRank(RHSC);

	  // Both signed or both unsigned: the rank alone decides.
	  if (LHSUnsigned == RHSUnsigned) {
	    if (LHSRank == RHSRank)
	      return 0;
	    return LHSRank > RHSRank ? 1 : -1;
	  }

	  // Otherwise one side is signed and the other unsigned. The unsigned type
	  // wins when its rank is at least that of the signed type. If the signed
	  // type has the greater rank it can represent all values of the unsigned
	  // type and wins; because we are dealing with 2's complement and types that
	  // are powers of two larger than each other, this is always safe.
	  if (LHSUnsigned)
	    return LHSRank >= RHSRank ? 1 : -1;

	  return RHSRank >= LHSRank ? -1 : 1;
	}

	/// Returns the implicit typedef "__NSConstantString", creating it and its
	/// underlying record "__NSConstantString_tag" on first use. The record's
	/// fields depend on the CoreFoundation ABI selected in the language options.
	TypedefDecl *ASTContext::getCFConstantStringDecl() const {
	  if (CFConstantStringTypeDecl)
	    return CFConstantStringTypeDecl;

	  assert(!CFConstantStringTagDecl &&
	         "tag and typedef should be initialized together");
	  CFConstantStringTagDecl = buildImplicitRecord("__NSConstantString_tag");
	  CFConstantStringTagDecl->startDefinition();

	  struct {
	    QualType Type;
	    const char *Name;
	  } Fields[5];
	  unsigned Count = 0;

	  /// Objective-C ABI
	  ///
	  ///    typedef struct __NSConstantString_tag {
	  ///      const int *isa;
	  ///      int flags;
	  ///      const char *str;
	  ///      long length;
	  ///    } __NSConstantString;
	  ///
	  /// Swift ABI (4.1, 4.2)
	  ///
	  ///    typedef struct __NSConstantString_tag {
	  ///      uintptr_t _cfisa;
	  ///      uintptr_t _swift_rc;
	  ///      _Atomic(uint64_t) _cfinfoa;
	  ///      const char *_ptr;
	  ///      uint32_t _length;
	  ///    } __NSConstantString;
	  ///
	  /// Swift ABI (5.0)
	  ///
	  ///    typedef struct __NSConstantString_tag {
	  ///      uintptr_t _cfisa;
	  ///      uintptr_t _swift_rc;
	  ///      _Atomic(uint64_t) _cfinfoa;
	  ///      const char *_ptr;
	  ///      uintptr_t _length;
	  ///    } __NSConstantString;

	  const auto CFRuntime = getLangOpts().CFRuntime;
	  if (static_cast<unsigned>(CFRuntime) <
	      static_cast<unsigned>(LangOptions::CoreFoundationABI::Swift)) {
	    Fields[Count++] = { getPointerType(IntTy.withConst()), "isa" };
	    Fields[Count++] = { IntTy, "flags" };
	    Fields[Count++] = { getPointerType(CharTy.withConst()), "str" };
	    Fields[Count++] = { LongTy, "length" };
	  } else {
	    // Field names follow the layout documented above, so that no two fields
	    // of the record share a name.
	    Fields[Count++] = { getUIntPtrType(), "_cfisa" };
	    Fields[Count++] = { getUIntPtrType(), "_swift_rc" };
	    Fields[Count++] = { getFromTargetType(Target->getUInt64Type()), "_cfinfoa" };
	    Fields[Count++] = { getPointerType(CharTy.withConst()), "_ptr" };
	    if (CFRuntime == LangOptions::CoreFoundationABI::Swift4_1 ||
	        CFRuntime == LangOptions::CoreFoundationABI::Swift4_2)
	      Fields[Count++] = { IntTy, "_length" };
	    else
	      Fields[Count++] = { getUIntPtrType(), "_length" };
	  }

	  // Create fields
	  for (unsigned i = 0; i < Count; ++i) {
	    FieldDecl *Field =
	        FieldDecl::Create(*this, CFConstantStringTagDecl, SourceLocation(),
	                          SourceLocation(), &Idents.get(Fields[i].Name),
	                          Fields[i].Type, /*TInfo=*/nullptr,
	                          /*BitWidth=*/nullptr, /*Mutable=*/false, ICIS_NoInit);
	    Field->setAccess(AS_public);
	    CFConstantStringTagDecl->addDecl(Field);
	  }

	  CFConstantStringTagDecl->completeDefinition();
	  // This type is designed to be compatible with NSConstantString, but cannot
	  // use the same name, since NSConstantString is an interface.
	  auto tagType = getTagDeclType(CFConstantStringTagDecl);
	  CFConstantStringTypeDecl =
	      buildImplicitTypedef(tagType, "__NSConstantString");

	  return CFConstantStringTypeDecl;
	}

	/// Returns the record declaration cached in CFConstantStringTagDecl, asking
	/// getCFConstantStringDecl() to build the tag and the typedef first if the
	/// tag has not been created yet.
	RecordDecl *ASTContext::getCFConstantStringTagDecl() const {
	  if (CFConstantStringTagDecl)
	    return CFConstantStringTagDecl;

	  // Only called for its side effect; the typedef it returns is not needed.
	  getCFConstantStringDecl();
	  return CFConstantStringTagDecl;
	}

	// getCFConstantStringType - Return the type used for constant CFStrings,
	// i.e. the typedef type of the declaration from getCFConstantStringDecl().
	QualType ASTContext::getCFConstantStringType() const {
	  TypedefDecl *CFStringDecl = getCFConstantStringDecl();
	  return getTypedefType(CFStringDecl);
	}

	/// Returns the type of the implicit record "objc_super". On first use the
	/// record is built, added to the translation unit declaration, and its type
	/// is cached in ObjCSuperType.
	QualType ASTContext::getObjCSuperType() const {
	  if (!ObjCSuperType.isNull())
	    return ObjCSuperType;

	  RecordDecl *SuperDecl = buildImplicitRecord("objc_super");
	  TUDecl->addDecl(SuperDecl);
	  ObjCSuperType = getTagDeclType(SuperDecl);
	  return ObjCSuperType;
	}

	/// Records \p T as the constant CFString type. \p T must be a typedef type
	/// whose declaration is a TypedefDecl with a record type underneath; both
	/// the typedef and the record declaration are cached.
	void ASTContext::setCFConstantStringType(QualType T) {
	  const auto *TypedefTy = T->castAs<TypedefType>();
	  auto *Typedef = cast<TypedefDecl>(TypedefTy->getDecl());
	  const auto *RecordTy = Typedef->getUnderlyingType()->castAs<RecordType>();

	  CFConstantStringTypeDecl = Typedef;
	  CFConstantStringTagDecl = RecordTy->getDecl();
	}

	/// Returns the type of the implicit record "__block_descriptor", building
	/// and caching it on first use. The record has two public unsigned long
	/// fields named "reserved" and "Size".
	QualType ASTContext::getBlockDescriptorType() const {
	  if (BlockDescriptorType)
	    return getTagDeclType(BlockDescriptorType);

	  // FIXME: Needs the FlagAppleBlock bit.
	  RecordDecl *RD = buildImplicitRecord("__block_descriptor");
	  RD->startDefinition();

	  QualType FieldTypes[] = {
	    UnsignedLongTy,
	    UnsignedLongTy,
	  };

	  static const char *const FieldNames[] = {
	    "reserved",
	    "Size"
	  };

	  // FieldTypes and FieldNames are parallel arrays; the bound is derived
	  // from the array rather than repeated as a literal.
	  for (size_t i = 0; i < llvm::array_lengthof(FieldNames); ++i) {
	    FieldDecl *Field = FieldDecl::Create(
	        *this, RD, SourceLocation(), SourceLocation(),
	        &Idents.get(FieldNames[i]), FieldTypes[i], /*TInfo=*/nullptr,
	        /*BitWidth=*/nullptr, /*Mutable=*/false, ICIS_NoInit);
	    Field->setAccess(AS_public);
	    RD->addDecl(Field);
	  }

	  RD->completeDefinition();

	  BlockDescriptorType = RD;

	  return getTagDeclType(BlockDescriptorType);
	}

	/// Returns the type of the implicit record
	/// "__block_descriptor_withcopydispose", building and caching it on first
	/// use. The record has the public fields "reserved" and "Size" (unsigned
	/// long) followed by "CopyFuncPtr" and "DestroyFuncPtr" (pointer to
	/// VoidPtrTy).
	QualType ASTContext::getBlockDescriptorExtendedType() const {
	  if (BlockDescriptorExtendedType)
	    return getTagDeclType(BlockDescriptorExtendedType);

	  // FIXME: Needs the FlagAppleBlock bit.
	  RecordDecl *RD = buildImplicitRecord("__block_descriptor_withcopydispose");
	  RD->startDefinition();

	  QualType FieldTypes[] = {
	    UnsignedLongTy,
	    UnsignedLongTy,
	    getPointerType(VoidPtrTy),
	    getPointerType(VoidPtrTy)
	  };

	  static const char *const FieldNames[] = {
	    "reserved",
	    "Size",
	    "CopyFuncPtr",
	    "DestroyFuncPtr"
	  };

	  // FieldTypes and FieldNames are parallel arrays; the bound is derived
	  // from the array rather than repeated as a literal.
	  for (size_t i = 0; i < llvm::array_lengthof(FieldNames); ++i) {
	    FieldDecl *Field = FieldDecl::Create(
	        *this, RD, SourceLocation(), SourceLocation(),
	        &Idents.get(FieldNames[i]), FieldTypes[i], /*TInfo=*/nullptr,
	        /*BitWidth=*/nullptr,
	        /*Mutable=*/false, ICIS_NoInit);
	    Field->setAccess(AS_public);
	    RD->addDecl(Field);
	  }

	  RD->completeDefinition();

	  BlockDescriptorExtendedType = RD;
	  return getTagDeclType(BlockDescriptorExtendedType);
	}

	/// Classifies \p T as an OpenCLTypeKind. Pipe types map to OCLTK_Pipe; the
	/// OpenCL image, clk_event, event, queue, reserve_id and sampler builtin
	/// types map to their respective kinds; everything else is OCLTK_Default.
	OpenCLTypeKind ASTContext::getOpenCLTypeKind(const Type *T) const {
	  const auto *Builtin = dyn_cast<BuiltinType>(T);

	  // Among non-builtin types only pipes get a dedicated kind.
	  if (!Builtin)
	    return isa<PipeType>(T) ? OCLTK_Pipe : OCLTK_Default;

	  switch (Builtin->getKind()) {
	#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
	  case BuiltinType::Id: \
	    return OCLTK_Image;
	#include "clang/Basic/OpenCLImageTypes.def"

	  case BuiltinType::OCLClkEvent:
	    return OCLTK_ClkEvent;

	  case BuiltinType::OCLEvent:
	    return OCLTK_Event;

	  case BuiltinType::OCLQueue:
	    return OCLTK_Queue;

	  case BuiltinType::OCLReserveID:
	    return OCLTK_ReserveID;

	  case BuiltinType::OCLSampler:
	    return OCLTK_Sampler;

	  default:
	    return OCLTK_Default;
	  }
	}

	/// Returns the address space the target assigns to the OpenCL type kind of
	/// \p T.
	LangAS ASTContext::getOpenCLTypeAddrSpace(const Type *T) const {
	  const OpenCLTypeKind Kind = getOpenCLTypeKind(T);
	  return Target->getOpenCLTypeAddrSpace(Kind);
	}

	/// BlockRequiresCopying - Returns true if byref variable "D" of type "Ty"
	/// requires copy/dispose. Note that this must match the logic
	/// in buildByrefHelpers.
	bool ASTContext::BlockRequiresCopying(QualType Ty,
	                                      const VarDecl *D) {
	  if (const CXXRecordDecl *record = Ty->getAsCXXRecordDecl()) {
	    // A C++ record needs helpers unless it has neither a copy expression
	    // nor a non-trivial destructor.
	    const Expr *copyExpr = getBlockVarCopyInit(D).getCopyExpr();
	    if (!copyExpr && record->hasTrivialDestructor()) return false;

	    return true;
	  }

	  // The block needs copy/destroy helpers if Ty is non-trivial to destructively
	  // move or destroy.
	  if (Ty.isNonTrivialToPrimitiveDestructiveMove() || Ty.isDestructedType())
	    return true;

	  if (!Ty->isObjCRetainableType()) return false;

	  Qualifiers qs = Ty.getQualifiers();

	  // If we have lifetime, that dominates.
	  if (Qualifiers::ObjCLifetime lifetime = qs.getObjCLifetime()) {
	    switch (lifetime) {
	      case Qualifiers::OCL_None: llvm_unreachable("impossible");

	      // These are just bits as far as the runtime is concerned.
	      case Qualifiers::OCL_ExplicitNone:
	      case Qualifiers::OCL_Autoreleasing:
	        return false;

	      // These cases should have been taken care of when checking the type's
	      // non-triviality.
	      case Qualifiers::OCL_Weak:
	      case Qualifiers::OCL_Strong:
	        llvm_unreachable("impossible");
	    }
	    llvm_unreachable("fell out of lifetime switch!");
	  }
	  return (Ty->isBlockPointerType() || isObjCNSObjectType(Ty) ||
	          Ty->isObjCObjectPointerType());
	}

	/// Computes the ObjC lifetime and extended-layout flag for a byref variable
	/// of type \p Ty. Returns false, leaving both out-parameters untouched,
	/// unless ObjC is enabled and the GC mode is NonGC; otherwise fills in
	/// \p LifeTime and \p HasByrefExtendedLayout and returns true.
	bool ASTContext::getByrefLifetime(QualType Ty,
	                                  Qualifiers::ObjCLifetime &LifeTime,
	                                  bool &HasByrefExtendedLayout) const {
	  if (!getLangOpts().ObjC ||
	      getLangOpts().getGC() != LangOptions::NonGC)
	    return false;

	  HasByrefExtendedLayout = false;
	  if (Ty->isRecordType()) {
	    // Record types are the only ones flagged as having an extended layout.
	    HasByrefExtendedLayout = true;
	    LifeTime = Qualifiers::OCL_None;
	  } else if ((LifeTime = Ty.getObjCLifetime())) {
	    // Honor the ARC qualifiers.
	  } else if (Ty->isObjCObjectPointerType() || Ty->isBlockPointerType()) {
	    // The MRR rule.
	    LifeTime = Qualifiers::OCL_ExplicitNone;
	  } else {
	    LifeTime = Qualifiers::OCL_None;
	  }
	  return true;
	}

	/// Returns the type used for NSUInteger: unsigned long long on 64-bit
	/// Windows targets, unsigned long everywhere else.
	CanQualType ASTContext::getNSUIntegerType() const {
	  assert(Target && "Expected target to be initialized");
	  const llvm::Triple &TT = Target->getTriple();
	  // Windows is LLP64 rather than LP64
	  const bool IsLLP64 = TT.isOSWindows() && TT.isArch64Bit();
	  return IsLLP64 ? UnsignedLongLongTy : UnsignedLongTy;
	}

	/// Returns the type used for NSInteger: long long on 64-bit Windows
	/// targets, long everywhere else.
	CanQualType ASTContext::getNSIntegerType() const {
	  assert(Target && "Expected target to be initialized");
	  const llvm::Triple &TT = Target->getTriple();
	  // Windows is LLP64 rather than LP64
	  const bool IsLLP64 = TT.isOSWindows() && TT.isArch64Bit();
	  return IsLLP64 ? LongLongTy : LongTy;
	}

	/// Returns the implicit typedef "instancetype" for the ObjC id type,
	/// creating and caching it on first use.
	TypedefDecl *ASTContext::getObjCInstanceTypeDecl() {
	  if (ObjCInstanceTypeDecl)
	    return ObjCInstanceTypeDecl;

	  ObjCInstanceTypeDecl = buildImplicitTypedef(getObjCIdType(), "instancetype");
	  return ObjCInstanceTypeDecl;
	}

	// Returns true if T is written as a typedef whose name is exactly "BOOL":
	//   typedef <type> BOOL;
	// Only T itself is inspected; no sugar is looked through.
	static bool isTypeTypedefedAsBOOL(QualType T) {
	  const auto *Typedef = dyn_cast<TypedefType>(T);
	  if (!Typedef)
	    return false;

	  IdentifierInfo *Name = Typedef->getDecl()->getIdentifier();
	  return Name && Name->isStr("BOOL");
	}

	/// getObjCEncodingTypeSize - Size of \p type as counted by the Objective-C
	/// encodings. Incomplete types other than incomplete arrays count as zero,
	/// integral and enumeration types of positive size are widened to at least
	/// the size of int, and remaining array types count as a pointer.
	CharUnits ASTContext::getObjCEncodingTypeSize(QualType type) const {
	  if (!type->isIncompleteArrayType() && type->isIncompleteType())
	    return CharUnits::Zero();

	  const CharUnits RawSize = getTypeSizeInChars(type);

	  // Make all integer and enum types at least as large as an int
	  if (RawSize.isPositive() && type->isIntegralOrEnumerationType())
	    return std::max(RawSize, getTypeSizeInChars(IntTy));

	  // Treat arrays as pointers, since that's how they're passed in.
	  if (type->isArrayType())
	    return getTypeSizeInChars(VoidPtrTy);

	  return RawSize;
	}

	/// Returns true when the target uses the Microsoft C++ ABI and \p VD is a
	/// static data member of integral or enumeration type whose first
	/// declaration is not out-of-line and has an initializer.
	bool ASTContext::isMSStaticDataMemberInlineDefinition(const VarDecl *VD) const {
	  if (!getTargetInfo().getCXXABI().isMicrosoft())
	    return false;
	  if (!VD->isStaticDataMember())
	    return false;
	  if (!VD->getType()->isIntegralOrEnumerationType())
	    return false;
	  if (VD->getFirstDecl()->isOutOfLine())
	    return false;
	  return VD->getFirstDecl()->hasInit();
	}

	/// Classifies the definition of \p VD: None when it is not inline, Weak
	/// when its first declaration is inline-specified or is not a static data
	/// member, Strong when a suitable file-context redeclaration is found, and
	/// WeakUnknown otherwise.
	ASTContext::InlineVariableDefinitionKind
	ASTContext::getInlineVariableDefinitionKind(const VarDecl *VD) const {
	  if (!VD->isInline())
	    return InlineVariableDefinitionKind::None;

	  // In almost all cases, it's a weak definition.
	  auto *First = VD->getFirstDecl();
	  if (First->isInlineSpecified() || !First->isStaticDataMember())
	    return InlineVariableDefinitionKind::Weak;

	  // If there's a file-context declaration in this translation unit, it's a
	  // non-discardable definition.
	  for (auto *D : VD->redecls())
	    if (D->getLexicalDeclContext()->isFileContext() &&
	        !D->isInlineSpecified() && (D->isConstexpr() || First->isConstexpr()))
	      return InlineVariableDefinitionKind::Strong;

	  // If we've not seen one yet, we don't know.
	  return InlineVariableDefinitionKind::WeakUnknown;
	}

	/// Formats the quantity held by \p CU as a decimal string.
	static std::string charUnitsToString(const CharUnits &CU) {
	  const auto Quantity = CU.getQuantity();
	  return llvm::itostr(Quantity);
	}

	/// getObjCEncodingForBlock - Return the encoded type for this block
	/// declaration. The string consists of the encoded return type, the total
	/// argument frame size, "@?0" for the block pointer at offset 0, and then
	/// each parameter's encoding followed by its offset.
	std::string ASTContext::getObjCEncodingForBlock(const BlockExpr *Expr) const {
	  std::string S;

	  const BlockDecl *Decl = Expr->getBlockDecl();
	  QualType BlockTy =
	      Expr->getType()->castAs<BlockPointerType>()->getPointeeType();
	  QualType BlockReturnTy = BlockTy->castAs<FunctionType>()->getReturnType();
	  // Encode result type.
	  if (getLangOpts().EncodeExtendedBlockSig)
	    getObjCEncodingForMethodParameter(Decl::OBJC_TQ_None, BlockReturnTy, S,
	                                      true /*Extended*/);
	  else
	    getObjCEncodingForType(BlockReturnTy, S);
	  // Compute size of all parameters.
	  // Start with computing size of a pointer in number of bytes.
	  // FIXME: There might(should) be a better way of doing this computation!
	  CharUnits PtrSize = getTypeSizeInChars(VoidPtrTy);
	  CharUnits ParmOffset = PtrSize;
	  for (auto PI : Decl->parameters()) {
	    QualType PType = PI->getType();
	    CharUnits sz = getObjCEncodingTypeSize(PType);
	    if (sz.isZero())
	      continue;
	    assert(sz.isPositive() && "BlockExpr - Incomplete param type");
	    ParmOffset += sz;
	  }
	  // Size of the argument frame
	  S += charUnitsToString(ParmOffset);
	  // Block pointer and offset.
	  S += "@?0";

	  // Argument types.
	  ParmOffset = PtrSize;
	  for (auto PVDecl : Decl->parameters()) {
	    QualType PType = PVDecl->getOriginalType();
	    if (const auto *AT =
	            dyn_cast<ArrayType>(PType->getCanonicalTypeInternal())) {
	      // Use array's original type only if it has known number of
	      // elements.
	      if (!isa<ConstantArrayType>(AT))
	        PType = PVDecl->getType();
	    } else if (PType->isFunctionType())
	      PType = PVDecl->getType();
	    if (getLangOpts().EncodeExtendedBlockSig)
	      getObjCEncodingForMethodParameter(Decl::OBJC_TQ_None, PType,
	                                        S, true /*Extended*/);
	    else
	      getObjCEncodingForType(PType, S);
	    S += charUnitsToString(ParmOffset);
	    ParmOffset += getObjCEncodingTypeSize(PType);
	  }

	  return S;
	}

	std::string
	ASTContext::getObjCEncodingForFunctionDecl(const FunctionDecl *Decl) const {
	std::string S;
	// Encode result type.
	getObjCEncodingForType(Decl->getReturnType(), S);
	CharUnits ParmOffset;
	// Compute size of all parameters.
	for (auto PI : Decl->parameters()) {
	QualType PType = PI->getType();
	CharUnits sz = getObjCEncodingTypeSize(PType);
	if (sz.isZero())
	continue;

	assert(sz.isPositive() &&
	"getObjCEncodingForFunctionDecl - Incomplete param type");
	ParmOffset += sz;
	}
	S += charUnitsToString(ParmOffset);
	ParmOffset = CharUnits::Zero();

	// Argument types.
	for (auto PVDecl : Decl->parameters()) {
	QualType PType = PVDecl->getOriginalType();
	if (const auto *AT =
	dyn_cast<ArrayType>(PType->getCanonicalTypeInternal())) {
	// Use array's original type only if it has known number of
	// elements.
	if (!isa<ConstantArrayType>(AT))
	PType = PVDecl->getType();
	} else if (PType->isFunctionType())
	PType = PVDecl->getType();
	getObjCEncodingForType(PType, S);
	S += charUnitsToString(ParmOffset);
	ParmOffset += getObjCEncodingTypeSize(PType);
	}

	return S;
	}

	/// getObjCEncodingForMethodParameter - Append to \p S the encoded type for a
	/// single method parameter or return type, preceded by the encoding of its
	/// declaration qualifier \p QT. If Extended, include class names and
	/// block object types.
	void ASTContext::getObjCEncodingForMethodParameter(Decl::ObjCDeclQualifier QT,
	                                                   QualType T, std::string& S,
	                                                   bool Extended) const {
	  // Encode type qualifier, 'in', 'inout', etc. for the parameter.
	  getObjCEncodingForTypeQualifier(QT, S);
	  // Encode parameter type.
	  ObjCEncOptions Options = ObjCEncOptions()
	                               .setExpandPointedToStructures()
	                               .setExpandStructures()
	                               .setIsOutermostType();
	  if (Extended)
	    Options.setEncodeBlockParameters().setEncodeClassNames();
	  getObjCEncodingForTypeImpl(T, S, Options, /*Field=*/nullptr);
	}

	/// getObjCEncodingForMethodDecl - Return the encoded type for this method
	/// declaration. The string consists of the encoded return type, the total
	/// argument size, "@0:" plus the pointer size for the implicit self and
	/// _cmd arguments, and then each parameter's encoding followed by its
	/// offset. \p Extended is forwarded to getObjCEncodingForMethodParameter.
	std::string ASTContext::getObjCEncodingForMethodDecl(const ObjCMethodDecl *Decl,
	                                                     bool Extended) const {
	  // FIXME: This is not very efficient.
	  // Encode return type.
	  std::string S;
	  getObjCEncodingForMethodParameter(Decl->getObjCDeclQualifier(),
	                                    Decl->getReturnType(), S, Extended);
	  // Compute size of all parameters.
	  // Start with computing size of a pointer in number of bytes.
	  // FIXME: There might(should) be a better way of doing this computation!
	  CharUnits PtrSize = getTypeSizeInChars(VoidPtrTy);
	  // The first two arguments (self and _cmd) are pointers; account for
	  // their size.
	  CharUnits ParmOffset = 2 * PtrSize;
	  for (ObjCMethodDecl::param_const_iterator PI = Decl->param_begin(),
	       E = Decl->sel_param_end(); PI != E; ++PI) {
	    QualType PType = (*PI)->getType();
	    CharUnits sz = getObjCEncodingTypeSize(PType);
	    if (sz.isZero())
	      continue;

	    assert(sz.isPositive() &&
	           "getObjCEncodingForMethodDecl - Incomplete param type");
	    ParmOffset += sz;
	  }
	  S += charUnitsToString(ParmOffset);
	  S += "@0:";
	  S += charUnitsToString(PtrSize);

	  // Argument types.
	  ParmOffset = 2 * PtrSize;
	  for (ObjCMethodDecl::param_const_iterator PI = Decl->param_begin(),
	       E = Decl->sel_param_end(); PI != E; ++PI) {
	    const ParmVarDecl *PVDecl = *PI;
	    QualType PType = PVDecl->getOriginalType();
	    if (const auto *AT =
	            dyn_cast<ArrayType>(PType->getCanonicalTypeInternal())) {
	      // Use array's original type only if it has known number of
	      // elements.
	      if (!isa<ConstantArrayType>(AT))
	        PType = PVDecl->getType();
	    } else if (PType->isFunctionType())
	      PType = PVDecl->getType();
	    getObjCEncodingForMethodParameter(PVDecl->getObjCDeclQualifier(),
	                                      PType, S, Extended);
	    S += charUnitsToString(ParmOffset);
	    ParmOffset += getObjCEncodingTypeSize(PType);
	  }

	  return S;
	}

	/// Searches the property implementations of \p Container for the one that
	/// implements \p PD. \p Container is treated as an ObjCCategoryImplDecl if
	/// it is one and as an ObjCImplementationDecl otherwise. Returns nullptr
	/// when \p Container is null or no implementation matches.
	ObjCPropertyImplDecl *
	ASTContext::getObjCPropertyImplDeclForPropertyDecl(
	                                      const ObjCPropertyDecl *PD,
	                                      const Decl *Container) const {
	  if (!Container)
	    return nullptr;

	  if (const auto *CategoryImpl = dyn_cast<ObjCCategoryImplDecl>(Container)) {
	    for (auto *Impl : CategoryImpl->property_impls()) {
	      if (Impl->getPropertyDecl() == PD)
	        return Impl;
	    }
	    return nullptr;
	  }

	  const auto *ClassImpl = cast<ObjCImplementationDecl>(Container);
	  for (auto *Impl : ClassImpl->property_impls()) {
	    if (Impl->getPropertyDecl() == PD)
	      return Impl;
	  }
	  return nullptr;
	}

	/// getObjCEncodingForPropertyDecl - Return the encoded type for this
	/// property declaration. If non-NULL, Container must be either an
	/// ObjCCategoryImplDecl or ObjCImplementationDecl; it should only be
	/// NULL when getting encodings for protocol properties.
	/// Property attributes are stored as a comma-delimited C string. The simple
	/// attributes readonly and bycopy are encoded as single characters. The
	/// parametrized attributes, getter=name, setter=name, and ivar=name, are
	/// encoded as single characters, followed by an identifier. Property types
	/// are also encoded as a parametrized attribute. The characters used to encode
	/// these attributes are defined by the following enumeration:
	/// @code
	/// enum PropertyAttributes {
	/// kPropertyReadOnly = 'R', // property is read-only.
	/// kPropertyBycopy = 'C', // property is a copy of the value last assigned
	/// kPropertyByref = '&', // property is a reference to the value last assigned
	/// kPropertyDynamic = 'D', // property is dynamic
	/// kPropertyGetter = 'G', // followed by getter selector name
	/// kPropertySetter = 'S', // followed by setter selector name
	/// kPropertyInstanceVariable = 'V' // followed by instance variable name
	/// kPropertyType = 'T' // followed by old-style type encoding.
	/// kPropertyWeak = 'W' // 'weak' property
	/// kPropertyStrong = 'P' // property GC'able
	/// kPropertyNonAtomic = 'N' // property non-atomic
	/// };
	/// @endcode
	std::string
	ASTContext::getObjCEncodingForPropertyDecl(const ObjCPropertyDecl *PD,
	const Decl *Container) const {
	// Collect information from the property implementation decl(s).
	bool Dynamic = false;
	ObjCPropertyImplDecl *SynthesizePID = nullptr;

	if (ObjCPropertyImplDecl *PropertyImpDecl =
	getObjCPropertyImplDeclForPropertyDecl(PD, Container)) {
	if (PropertyImpDecl->getPropertyImplementation() == ObjCPropertyImplDecl::Dynamic)
	Dynamic = true;
	else
	SynthesizePID = PropertyImpDecl;
	}

	// FIXME: This is not very efficient.
	std::string S = "T";

	// Encode result type.
	// GCC has some special rules regarding encoding of properties which
	// closely resembles encoding of ivars.
	getObjCEncodingForPropertyType(PD->getType(), S);

	if (PD->isReadOnly()) {
	S += ",R";
	if (PD->getPropertyAttributes() & ObjCPropertyAttribute::kind_copy)
	S += ",C";
	if (PD->getPropertyAttributes() & ObjCPropertyAttribute::kind_retain)
	S += ",&";
	if (PD->getPropertyAttributes() & ObjCPropertyAttribute::kind_weak)
	S += ",W";
	} else {
	switch (PD->getSetterKind()) {
	case ObjCPropertyDecl::Assign: break;
	case ObjCPropertyDecl::Copy: S += ",C"; break;
	case ObjCPropertyDecl::Retain: S += ",&"; break;
	case ObjCPropertyDecl::Weak: S += ",W"; break;
	}
	}

	// It really isn't clear at all what this means, since properties
	// are "dynamic by default".
	if (Dynamic)
	S += ",D";

	if (PD->getPropertyAttributes() & ObjCPropertyAttribute::kind_nonatomic)
	S += ",N";

	if (PD->getPropertyAttributes() & ObjCPropertyAttribute::kind_getter) {
	S += ",G";
	S += PD->getGetterName().getAsString();
	}

	if (PD->getPropertyAttributes() & ObjCPropertyAttribute::kind_setter) {
	S += ",S";
	S += PD->getSetterName().getAsString();
	}

	if (SynthesizePID) {
	const ObjCIvarDecl *OID = SynthesizePID->getPropertyIvarDecl();
	S += ",V";
	S += OID->getNameAsString();
	}

	// FIXME: OBJCGC: weak & strong
	return S;
	}

	/// getLegacyIntegralTypeEncoding -
	/// Legacy compatibility rule: 32-bit longs are normally encoded as 'l' or
	/// 'L', but when reached through a typedef (as a struct field or pointee)
	/// they must be encoded as 'i' or 'I'. To that end, a \p PointeeTy that is
	/// a typedef of a 32-bit (unsigned) long is replaced in place by
	/// (unsigned) int; any other type is left unchanged.
	void ASTContext::getLegacyIntegralTypeEncoding (QualType &PointeeTy) const {
	  if (!isa<TypedefType>(PointeeTy.getTypePtr()))
	    return;

	  const auto *Builtin = PointeeTy->getAs<BuiltinType>();
	  if (!Builtin)
	    return;

	  switch (Builtin->getKind()) {
	  case BuiltinType::ULong:
	    if (getIntWidth(PointeeTy) == 32)
	      PointeeTy = UnsignedIntTy;
	    break;
	  case BuiltinType::Long:
	    if (getIntWidth(PointeeTy) == 32)
	      PointeeTy = IntTy;
	    break;
	  default:
	    break;
	  }
	}

	/// Appends the Objective-C encoding of \p T to \p S, treating \p T as the
	/// outermost type. \p Field and \p NotEncodedT are forwarded unchanged to
	/// getObjCEncodingForTypeImpl.
	void ASTContext::getObjCEncodingForType(QualType T, std::string& S,
	                                        const FieldDecl *Field,
	                                        QualType *NotEncodedT) const {
	  // We follow the behavior of gcc, expanding structures which are
	  // directly pointed to, and expanding embedded structures. Note that
	  // these rules are sufficient to prevent recursive encoding of the
	  // same type.
	  ObjCEncOptions Options = ObjCEncOptions()
	                               .setExpandPointedToStructures()
	                               .setExpandStructures()
	                               .setIsOutermostType();
	  getObjCEncodingForTypeImpl(T, S, Options, Field, NotEncodedT);
	}

	/// Appends the Objective-C encoding of property type \p T to \p S. Uses the
	/// same options as getObjCEncodingForType plus the encoding-property
	/// option, and passes no field.
	void ASTContext::getObjCEncodingForPropertyType(QualType T,
	                                                std::string& S) const {
	  // Encode result type.
	  // GCC has some special rules regarding encoding of properties which
	  // closely resembles encoding of ivars.
	  getObjCEncodingForTypeImpl(T, S,
	                             ObjCEncOptions()
	                                 .setExpandPointedToStructures()
	                                 .setExpandStructures()
	                                 .setIsOutermostType()
	                                 .setEncodingProperty(),
	                             /*Field=*/nullptr);
	}

	/// Returns the single-character Objective-C encoding of builtin type
	/// \p BT. Types without an encoding yield ' '; the AArch64 SVE types
	/// additionally report an error diagnostic through \p C. ObjC primitive,
	/// OpenCL and placeholder kinds are not expected here and are unreachable.
	static char getObjCEncodingForPrimitiveType(const ASTContext *C,
	                                            const BuiltinType *BT) {
	    BuiltinType::Kind kind = BT->getKind();
	    switch (kind) {
	    case BuiltinType::Void:       return 'v';
	    case BuiltinType::Bool:       return 'B';
	    case BuiltinType::Char8:
	    case BuiltinType::Char_U:
	    case BuiltinType::UChar:      return 'C';
	    case BuiltinType::Char16:
	    case BuiltinType::UShort:     return 'S';
	    case BuiltinType::Char32:
	    case BuiltinType::UInt:       return 'I';
	    case BuiltinType::ULong:
	        // The encoding of long depends on the target's long width.
	        return C->getTargetInfo().getLongWidth() == 32 ? 'L' : 'Q';
	    case BuiltinType::UInt128:    return 'T';
	    case BuiltinType::ULongLong:  return 'Q';
	    case BuiltinType::Char_S:
	    case BuiltinType::SChar:      return 'c';
	    case BuiltinType::Short:      return 's';
	    case BuiltinType::WChar_S:
	    case BuiltinType::WChar_U:
	    case BuiltinType::Int:        return 'i';
	    case BuiltinType::Long:
	      return C->getTargetInfo().getLongWidth() == 32 ? 'l' : 'q';
	    case BuiltinType::LongLong:   return 'q';
	    case BuiltinType::Int128:     return 't';
	    case BuiltinType::Float:      return 'f';
	    case BuiltinType::Double:     return 'd';
	    case BuiltinType::LongDouble: return 'D';
	    case BuiltinType::NullPtr:    return '*'; // like char*

	    case BuiltinType::BFloat16:
	    case BuiltinType::Float16:
	    case BuiltinType::Float128:
	    case BuiltinType::Half:
	    case BuiltinType::ShortAccum:
	    case BuiltinType::Accum:
	    case BuiltinType::LongAccum:
	    case BuiltinType::UShortAccum:
	    case BuiltinType::UAccum:
	    case BuiltinType::ULongAccum:
	    case BuiltinType::ShortFract:
	    case BuiltinType::Fract:
	    case BuiltinType::LongFract:
	    case BuiltinType::UShortFract:
	    case BuiltinType::UFract:
	    case BuiltinType::ULongFract:
	    case BuiltinType::SatShortAccum:
	    case BuiltinType::SatAccum:
	    case BuiltinType::SatLongAccum:
	    case BuiltinType::SatUShortAccum:
	    case BuiltinType::SatUAccum:
	    case BuiltinType::SatULongAccum:
	    case BuiltinType::SatShortFract:
	    case BuiltinType::SatFract:
	    case BuiltinType::SatLongFract:
	    case BuiltinType::SatUShortFract:
	    case BuiltinType::SatUFract:
	    case BuiltinType::SatULongFract:
	      // FIXME: potentially need @encodes for these!
	      return ' ';

	#define SVE_TYPE(Name, Id, SingletonId) \
	    case BuiltinType::Id:
	#include "clang/Basic/AArch64SVEACLETypes.def"
	    {
	      DiagnosticsEngine &Diags = C->getDiagnostics();
	      unsigned DiagID = Diags.getCustomDiagID(
	          DiagnosticsEngine::Error, "cannot yet @encode type %0");
	      Diags.Report(DiagID) << BT->getName(C->getPrintingPolicy());
	      return ' ';
	    }

	    case BuiltinType::ObjCId:
	    case BuiltinType::ObjCClass:
	    case BuiltinType::ObjCSel:
	      llvm_unreachable("@encoding ObjC primitive type");

	    // OpenCL and placeholder types don't need @encodings.
	#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
	    case BuiltinType::Id:
	#include "clang/Basic/OpenCLImageTypes.def"
	#define EXT_OPAQUE_TYPE(ExtType, Id, Ext) \
	    case BuiltinType::Id:
	#include "clang/Basic/OpenCLExtensionTypes.def"
	    case BuiltinType::OCLEvent:
	    case BuiltinType::OCLClkEvent:
	    case BuiltinType::OCLQueue:
	    case BuiltinType::OCLReserveID:
	    case BuiltinType::OCLSampler:
	    case BuiltinType::Dependent:
	#define BUILTIN_TYPE(KIND, ID)
	#define PLACEHOLDER_TYPE(KIND, ID) \
	    case BuiltinType::KIND:
	#include "clang/AST/BuiltinTypes.def"
	      llvm_unreachable("invalid builtin type for @encode");
	    }
	    llvm_unreachable("invalid BuiltinType::Kind value");
	}

	static char ObjCEncodingForEnumType(const ASTContext C, const EnumType ET) {
	EnumDecl *Enum = ET->getDecl();

	// The encoding of an non-fixed enum type is always 'i', regardless of size.
	if (!Enum->isFixed())
	return 'i';

	// The encoding of a fixed enum type matches its fixed underlying type.
	const auto *BT = Enum->getIntegerType()->castAs<BuiltinType>();
	return getObjCEncodingForPrimitiveType(C, BT);
	}

	static void EncodeBitField(const ASTContext *Ctx, std::string& S,
	QualType T, const FieldDecl *FD) {
	assert(FD->isBitField() && "not a bitfield - getObjCEncodingForTypeImpl");
	S += 'b';
	// The NeXT runtime encodes bit fields as b followed by the number of bits.
	// The GNU runtime requires more information; bitfields are encoded as b,
	// then the offset (in bits) of the first element, then the type of the
	// bitfield, then the size in bits. For example, in this structure:
	//
	// struct
	// {
	// int integer;
	// int flags:2;
	// };
	// On a 32-bit system, the encoding for flags would be b2 for the NeXT
	// runtime, but b32i2 for the GNU runtime. The reason for this extra
	// information is not especially sensible, but we're stuck with it for
	// compatibility with GCC, although providing it breaks anything that
	// actually uses runtime introspection and wants to work on both runtimes...
	if (Ctx->getLangOpts().ObjCRuntime.isGNUFamily()) {
	uint64_t Offset;

	if (const auto *IVD = dyn_cast<ObjCIvarDecl>(FD)) {
	Offset = Ctx->lookupFieldBitOffset(IVD->getContainingInterface(), nullptr,
	IVD);
	} else {
	const RecordDecl *RD = FD->getParent();
	const ASTRecordLayout &RL = Ctx->getASTRecordLayout(RD);
	Offset = RL.getFieldOffset(FD->getFieldIndex());
	}

	S += llvm::utostr(Offset);

	if (const auto *ET = T->getAs<EnumType>())
	S += ObjCEncodingForEnumType(Ctx, ET);
	else {
	const auto *BT = T->castAs<BuiltinType>();
	S += getObjCEncodingForPrimitiveType(Ctx, BT);
	}
	}
	S += llvm::utostr(FD->getBitWidthValue(*Ctx));
	}

	// FIXME: Use SmallString for accumulating string.
	void ASTContext::getObjCEncodingForTypeImpl(QualType T, std::string &S,
	const ObjCEncOptions Options,
	const FieldDecl *FD,
	QualType *NotEncodedT) const {
	CanQualType CT = getCanonicalType(T);
	switch (CT->getTypeClass()) {
	case Type::Builtin:
	case Type::Enum:
	if (FD && FD->isBitField())
	return EncodeBitField(this, S, T, FD);
	if (const auto *BT = dyn_cast<BuiltinType>(CT))
	S += getObjCEncodingForPrimitiveType(this, BT);
	else
	S += ObjCEncodingForEnumType(this, cast<EnumType>(CT));
	return;

	case Type::Complex:
	S += 'j';
	getObjCEncodingForTypeImpl(T->castAs<ComplexType>()->getElementType(), S,
	ObjCEncOptions(),
	/Field=/nullptr);
	return;

	case Type::Atomic:
	S += 'A';
	getObjCEncodingForTypeImpl(T->castAs<AtomicType>()->getValueType(), S,
	ObjCEncOptions(),
	/Field=/nullptr);
	return;

	// encoding for pointer or reference types.
	case Type::Pointer:
	case Type::LValueReference:
	case Type::RValueReference: {
	QualType PointeeTy;
	if (isa<PointerType>(CT)) {
	const auto *PT = T->castAs<PointerType>();
	if (PT->isObjCSelType()) {
	S += ':';
	return;
	}
	PointeeTy = PT->getPointeeType();
	} else {
	PointeeTy = T->castAs<ReferenceType>()->getPointeeType();
	}

	bool isReadOnly = false;
	// For historical/compatibility reasons, the read-only qualifier of the
	// pointee gets emitted _before_ the '^'. The read-only qualifier of
	// the pointer itself gets ignored, _unless_ we are looking at a typedef!
	// Also, do not emit the 'r' for anything but the outermost type!
	if (isa<TypedefType>(T.getTypePtr())) {
	if (Options.IsOutermostType() && T.isConstQualified()) {
	isReadOnly = true;
	S += 'r';
	}
	} else if (Options.IsOutermostType()) {
	QualType P = PointeeTy;
	while (auto PT = P->getAs<PointerType>())
	P = PT->getPointeeType();
	if (P.isConstQualified()) {
	isReadOnly = true;
	S += 'r';
	}
	}
	if (isReadOnly) {
	// Another legacy compatibility encoding. Some ObjC qualifier and type
	// combinations need to be rearranged.
	// Rewrite "in const" from "nr" to "rn"
	if (StringRef(S).endswith("nr"))
	S.replace(S.end()-2, S.end(), "rn");
	}

	if (PointeeTy->isCharType()) {
	// char pointer types should be encoded as '*' unless it is a
	// type that has been typedef'd to 'BOOL'.
	if (!isTypeTypedefedAsBOOL(PointeeTy)) {
	S += '*';
	return;
	}
	} else if (const auto *RTy = PointeeTy->getAs<RecordType>()) {
	// GCC binary compat: Need to convert "struct objc_class *" to "#".
	if (RTy->getDecl()->getIdentifier() == &Idents.get("objc_class")) {
	S += '#';
	return;
	}
	// GCC binary compat: Need to convert "struct objc_object *" to "@".
	if (RTy->getDecl()->getIdentifier() == &Idents.get("objc_object")) {
	S += '@';
	return;
	}
	// fall through...
	}
	S += '^';
	getLegacyIntegralTypeEncoding(PointeeTy);

	ObjCEncOptions NewOptions;
	if (Options.ExpandPointedToStructures())
	NewOptions.setExpandStructures();
	getObjCEncodingForTypeImpl(PointeeTy, S, NewOptions,
	/Field=/nullptr, NotEncodedT);
	return;
	}

	case Type::ConstantArray:
	case Type::IncompleteArray:
	case Type::VariableArray: {
	const auto *AT = cast<ArrayType>(CT);

	if (isa<IncompleteArrayType>(AT) && !Options.IsStructField()) {
	// Incomplete arrays are encoded as a pointer to the array element.
	S += '^';

	getObjCEncodingForTypeImpl(
	AT->getElementType(), S,
	Options.keepingOnly(ObjCEncOptions().setExpandStructures()), FD);
	} else {
	S += '[';

	if (const auto *CAT = dyn_cast<ConstantArrayType>(AT))
	S += llvm::utostr(CAT->getSize().getZExtValue());
	else {
	//Variable length arrays are encoded as a regular array with 0 elements.
	assert((isa<VariableArrayType>(AT) \|\| isa<IncompleteArrayType>(AT)) &&
	"Unknown array type!");
	S += '0';
	}

	getObjCEncodingForTypeImpl(
	AT->getElementType(), S,
	Options.keepingOnly(ObjCEncOptions().setExpandStructures()), FD,
	NotEncodedT);
	S += ']';
	}
	return;
	}

	case Type::FunctionNoProto:
	case Type::FunctionProto:
	S += '?';
	return;

	case Type::Record: {
	RecordDecl *RDecl = cast<RecordType>(CT)->getDecl();
	S += RDecl->isUnion() ? '(' : '{';
	// Anonymous structures print as '?'
	if (const IdentifierInfo *II = RDecl->getIdentifier()) {
	S += II->getName();
	if (const auto *Spec = dyn_cast<ClassTemplateSpecializationDecl>(RDecl)) {
	const TemplateArgumentList &TemplateArgs = Spec->getTemplateArgs();
	llvm::raw_string_ostream OS(S);
	printTemplateArgumentList(OS, TemplateArgs.asArray(),
	getPrintingPolicy());
	}
	} else {
	S += '?';
	}
	if (Options.ExpandStructures()) {
	S += '=';
	if (!RDecl->isUnion()) {
	getObjCEncodingForStructureImpl(RDecl, S, FD, true, NotEncodedT);
	} else {
	for (const auto *Field : RDecl->fields()) {
	if (FD) {
	S += '"';
	S += Field->getNameAsString();
	S += '"';
	}

	// Special case bit-fields.
	if (Field->isBitField()) {
	getObjCEncodingForTypeImpl(Field->getType(), S,
	ObjCEncOptions().setExpandStructures(),
	Field);
	} else {
	QualType qt = Field->getType();
	getLegacyIntegralTypeEncoding(qt);
	getObjCEncodingForTypeImpl(
	qt, S,
	ObjCEncOptions().setExpandStructures().setIsStructField(), FD,
	NotEncodedT);
	}
	}
	}
	}
	S += RDecl->isUnion() ? ')' : '}';
	return;
	}

	case Type::BlockPointer: {
	const auto *BT = T->castAs<BlockPointerType>();
	S += "@?"; // Unlike a pointer-to-function, which is "^?".
	if (Options.EncodeBlockParameters()) {
	const auto *FT = BT->getPointeeType()->castAs<FunctionType>();

	S += '<';
	// Block return type
	getObjCEncodingForTypeImpl(FT->getReturnType(), S,
	Options.forComponentType(), FD, NotEncodedT);
	// Block self
	S += "@?";
	// Block parameters
	if (const auto *FPT = dyn_cast<FunctionProtoType>(FT)) {
	for (const auto &I : FPT->param_types())
	getObjCEncodingForTypeImpl(I, S, Options.forComponentType(), FD,
	NotEncodedT);
	}
	S += '>';
	}
	return;
	}

	case Type::ObjCObject: {
	// hack to match legacy encoding of id and Class
	QualType Ty = getObjCObjectPointerType(CT);
	if (Ty->isObjCIdType()) {
	S += "{objc_object=}";
	return;
	}
	else if (Ty->isObjCClassType()) {
	S += "{objc_class=}";
	return;
	}
	// TODO: Double check to make sure this intentionally falls through.
	LLVM_FALLTHROUGH;
	}

	case Type::ObjCInterface: {
	// Ignore protocol qualifiers when mangling at this level.
	// @encode(class_name)
	ObjCInterfaceDecl *OI = T->castAs<ObjCObjectType>()->getInterface();
	S += '{';
	S += OI->getObjCRuntimeNameAsString();
	if (Options.ExpandStructures()) {
	S += '=';
	SmallVector<const ObjCIvarDecl*, 32> Ivars;
	DeepCollectObjCIvars(OI, true, Ivars);
	for (unsigned i = 0, e = Ivars.size(); i != e; ++i) {
	const FieldDecl *Field = Ivars[i];
	if (Field->isBitField())
	getObjCEncodingForTypeImpl(Field->getType(), S,
	ObjCEncOptions().setExpandStructures(),
	Field);
	else
	getObjCEncodingForTypeImpl(Field->getType(), S,
	ObjCEncOptions().setExpandStructures(), FD,
	NotEncodedT);
	}
	}
	S += '}';
	return;
	}

	case Type::ObjCObjectPointer: {
	const auto *OPT = T->castAs<ObjCObjectPointerType>();
	if (OPT->isObjCIdType()) {
	S += '@';
	return;
	}

	if (OPT->isObjCClassType() \|\| OPT->isObjCQualifiedClassType()) {
	// FIXME: Consider if we need to output qualifiers for 'Class<p>'.
	// Since this is a binary compatibility issue, need to consult with
	// runtime folks. Fortunately, this is a very obscure construct.
	S += '#';
	return;
	}

	if (OPT->isObjCQualifiedIdType()) {
	getObjCEncodingForTypeImpl(
	getObjCIdType(), S,
	Options.keepingOnly(ObjCEncOptions()
	.setExpandPointedToStructures()
	.setExpandStructures()),
	FD);
	if (FD \|\| Options.EncodingProperty() \|\| Options.EncodeClassNames()) {
	// Note that we do extended encoding of protocol qualifer list
	// Only when doing ivar or property encoding.
	S += '"';
	for (const auto *I : OPT->quals()) {
	S += '<';
	S += I->getObjCRuntimeNameAsString();
	S += '>';
	}
	S += '"';
	}
	return;
	}

	S += '@';
	if (OPT->getInterfaceDecl() &&
	(FD \|\| Options.EncodingProperty() \|\| Options.EncodeClassNames())) {
	S += '"';
	S += OPT->getInterfaceDecl()->getObjCRuntimeNameAsString();
	for (const auto *I : OPT->quals()) {
	S += '<';
	S += I->getObjCRuntimeNameAsString();
	S += '>';
	}
	S += '"';
	}
	return;
	}

	// gcc just blithely ignores member pointers.
	// FIXME: we should do better than that. 'M' is available.
	case Type::MemberPointer:
	// This matches gcc's encoding, even though technically it is insufficient.
	//FIXME. We should do a better job than gcc.
	case Type::Vector:
	case Type::ExtVector:
	// Until we have a coherent encoding of these three types, issue warning.
	if (NotEncodedT)
	*NotEncodedT = T;
	return;

	case Type::ConstantMatrix:
	if (NotEncodedT)
	*NotEncodedT = T;
	return;

	// We could see an undeduced auto type here during error recovery.
	// Just ignore it.
	case Type::Auto:
	case Type::DeducedTemplateSpecialization:
	return;

	case Type::Pipe:
	case Type::ExtInt:
	#define ABSTRACT_TYPE(KIND, BASE)
	#define TYPE(KIND, BASE)
	#define DEPENDENT_TYPE(KIND, BASE) \
	case Type::KIND:
	#define NON_CANONICAL_TYPE(KIND, BASE) \
	case Type::KIND:
	#define NON_CANONICAL_UNLESS_DEPENDENT_TYPE(KIND, BASE) \
	case Type::KIND:
	#include "clang/AST/TypeNodes.inc"
	llvm_unreachable("@encode for dependent type!");
	}
	llvm_unreachable("bad type kind!");
	}

	/// Append to \p S the Objective-C type encoding of the members of the
	/// non-union record \p RDecl.
	///
	/// Non-empty bases and all fields are collected into a multimap keyed by
	/// their bit offset in the record layout and are then encoded in ascending
	/// offset order. Nothing is appended when the record has no definition or
	/// its definition is invalid.
	///
	/// \param RDecl Record to encode; must be non-null and must not be a union
	///        (both are asserted).
	/// \param S String the encoding is appended to.
	/// \param FD When non-null, every field encoding is preceded by the quoted
	///        field name, and a "_vptr$<record>" name is emitted before the
	///        vtable-pointer slot.
	/// \param includeVBases When true, non-empty virtual bases are included and
	///        the complete layout size marks the end of the record; otherwise
	///        the non-virtual size is used for C++ records.
	/// \param NotEncodedT Forwarded unchanged to the nested encoding calls.
	void ASTContext::getObjCEncodingForStructureImpl(RecordDecl *RDecl,
	                                                 std::string &S,
	                                                 const FieldDecl *FD,
	                                                 bool includeVBases,
	                                                 QualType *NotEncodedT) const {
	  assert(RDecl && "Expected non-null RecordDecl");
	  assert(!RDecl->isUnion() && "Should not be called for unions");
	  if (!RDecl->getDefinition() || RDecl->getDefinition()->isInvalidDecl())
	    return;

	  const auto *CXXRec = dyn_cast<CXXRecordDecl>(RDecl);
	  std::multimap<uint64_t, NamedDecl *> FieldOrBaseOffsets;
	  const ASTRecordLayout &layout = getASTRecordLayout(RDecl);

	  // Non-virtual, non-empty bases, keyed by their offset in bits.
	  if (CXXRec) {
	    for (const auto &BI : CXXRec->bases()) {
	      if (!BI.isVirtual()) {
	        CXXRecordDecl *base = BI.getType()->getAsCXXRecordDecl();
	        if (base->isEmpty())
	          continue;
	        uint64_t offs = toBits(layout.getBaseClassOffset(base));
	        FieldOrBaseOffsets.insert(FieldOrBaseOffsets.upper_bound(offs),
	                                  std::make_pair(offs, base));
	      }
	    }
	  }

	  // Fields, keyed by the offset the layout reports for their index.
	  unsigned i = 0;
	  for (auto *Field : RDecl->fields()) {
	    uint64_t offs = layout.getFieldOffset(i);
	    FieldOrBaseOffsets.insert(FieldOrBaseOffsets.upper_bound(offs),
	                              std::make_pair(offs, Field));
	    ++i;
	  }

	  // Virtual bases are only added when they lie at or beyond the non-virtual
	  // size and no other entry already occupies the same offset.
	  if (CXXRec && includeVBases) {
	    for (const auto &BI : CXXRec->vbases()) {
	      CXXRecordDecl *base = BI.getType()->getAsCXXRecordDecl();
	      if (base->isEmpty())
	        continue;
	      uint64_t offs = toBits(layout.getVBaseClassOffset(base));
	      if (offs >= uint64_t(toBits(layout.getNonVirtualSize())) &&
	          FieldOrBaseOffsets.find(offs) == FieldOrBaseOffsets.end())
	        FieldOrBaseOffsets.insert(FieldOrBaseOffsets.end(),
	                                  std::make_pair(offs, base));
	    }
	  }

	  CharUnits size;
	  if (CXXRec) {
	    size = includeVBases ? layout.getSize() : layout.getNonVirtualSize();
	  } else {
	    size = layout.getSize();
	  }

	#ifndef NDEBUG
	  // Running bit offset, used only to sanity-check the ordering in asserts.
	  uint64_t CurOffs = 0;
	#endif
	  std::multimap<uint64_t, NamedDecl *>::iterator
	    CurLayObj = FieldOrBaseOffsets.begin();

	  // A dynamic class with nothing recorded at offset 0 gets an explicit
	  // vtable-pointer entry ("^^?") at the front.
	  if (CXXRec && CXXRec->isDynamicClass() &&
	      (CurLayObj == FieldOrBaseOffsets.end() || CurLayObj->first != 0)) {
	    if (FD) {
	      S += "\"_vptr$";
	      std::string recname = CXXRec->getNameAsString();
	      if (recname.empty()) recname = "?";
	      S += recname;
	      S += '"';
	    }
	    S += "^^?";
	#ifndef NDEBUG
	    CurOffs += getTypeSize(VoidPtrTy);
	#endif
	  }

	  if (!RDecl->hasFlexibleArrayMember()) {
	    // Mark the end of the structure.
	    uint64_t offs = toBits(size);
	    FieldOrBaseOffsets.insert(FieldOrBaseOffsets.upper_bound(offs),
	                              std::make_pair(offs, nullptr));
	  }

	  for (; CurLayObj != FieldOrBaseOffsets.end(); ++CurLayObj) {
	#ifndef NDEBUG
	    assert(CurOffs <= CurLayObj->first);
	    if (CurOffs < CurLayObj->first) {
	      uint64_t padding = CurLayObj->first - CurOffs;
	      // FIXME: There doesn't seem to be a way to indicate in the encoding that
	      // packing/alignment of members is different that normal, in which case
	      // the encoding will be out-of-sync with the real layout.
	      // If the runtime switches to just consider the size of types without
	      // taking into account alignment, we could make padding explicit in the
	      // encoding (e.g. using arrays of chars). The encoding strings would be
	      // longer then though.
	      CurOffs += padding;
	    }
	#endif

	    NamedDecl *dcl = CurLayObj->second;
	    if (!dcl)
	      break; // reached end of structure.

	    if (auto *base = dyn_cast<CXXRecordDecl>(dcl)) {
	      // We expand the bases without their virtual bases since those are going
	      // in the initial structure. Note that this differs from gcc which
	      // expands virtual bases each time one is encountered in the hierarchy,
	      // making the encoding type bigger than it really is.
	      getObjCEncodingForStructureImpl(base, S, FD, /*includeVBases*/false,
	                                      NotEncodedT);
	      assert(!base->isEmpty());
	#ifndef NDEBUG
	      CurOffs += toBits(getASTRecordLayout(base).getNonVirtualSize());
	#endif
	    } else {
	      const auto *field = cast<FieldDecl>(dcl);
	      if (FD) {
	        S += '"';
	        S += field->getNameAsString();
	        S += '"';
	      }

	      if (field->isBitField()) {
	        EncodeBitField(this, S, field->getType(), field);
	#ifndef NDEBUG
	        CurOffs += field->getBitWidthValue(*this);
	#endif
	      } else {
	        QualType qt = field->getType();
	        getLegacyIntegralTypeEncoding(qt);
	        getObjCEncodingForTypeImpl(
	            qt, S, ObjCEncOptions().setExpandStructures().setIsStructField(),
	            FD, NotEncodedT);
	#ifndef NDEBUG
	        CurOffs += getTypeSize(field->getType());
	#endif
	      }
	    }
	  }
	}

	/// Append to \p S one encoding character for every Objective-C declaration
	/// qualifier bit that is set in \p QT.
	void ASTContext::getObjCEncodingForTypeQualifier(Decl::ObjCDeclQualifier QT,
	                                                 std::string& S) const {
	  // Qualifier bits paired with the character each contributes, listed in the
	  // order in which they are emitted.
	  static const struct {
	    unsigned Bit;
	    char Code;
	  } QualifierCodes[] = {
	      {Decl::OBJC_TQ_In, 'n'},     {Decl::OBJC_TQ_Inout, 'N'},
	      {Decl::OBJC_TQ_Out, 'o'},    {Decl::OBJC_TQ_Bycopy, 'O'},
	      {Decl::OBJC_TQ_Byref, 'R'},  {Decl::OBJC_TQ_Oneway, 'V'},
	  };

	  for (const auto &Entry : QualifierCodes) {
	    if (QT & Entry.Bit)
	      S += Entry.Code;
	  }
	}

	/// Return the implicit typedef named "id", building and caching it on the
	/// first call.
	TypedefDecl *ASTContext::getObjCIdDecl() const {
	  if (ObjCIdDecl)
	    return ObjCIdDecl;

	  // "id" is a pointer to the builtin id object type.
	  QualType IdObjectTy = getObjCObjectType(ObjCBuiltinIdTy, {}, {});
	  QualType IdPointerTy = getObjCObjectPointerType(IdObjectTy);
	  ObjCIdDecl = buildImplicitTypedef(IdPointerTy, "id");
	  return ObjCIdDecl;
	}

	/// Return the implicit typedef named "SEL", building and caching it on the
	/// first call.
	TypedefDecl *ASTContext::getObjCSelDecl() const {
	  if (ObjCSelDecl)
	    return ObjCSelDecl;

	  // "SEL" is a pointer to the builtin selector type.
	  QualType SelPointerTy = getPointerType(ObjCBuiltinSelTy);
	  ObjCSelDecl = buildImplicitTypedef(SelPointerTy, "SEL");
	  return ObjCSelDecl;
	}

	/// Return the implicit typedef named "Class", building and caching it on the
	/// first call.
	TypedefDecl *ASTContext::getObjCClassDecl() const {
	  if (ObjCClassDecl)
	    return ObjCClassDecl;

	  // "Class" is a pointer to the builtin Class object type.
	  QualType ClassObjectTy = getObjCObjectType(ObjCBuiltinClassTy, {}, {});
	  QualType ClassPointerTy = getObjCObjectPointerType(ClassObjectTy);
	  ObjCClassDecl = buildImplicitTypedef(ClassPointerTy, "Class");
	  return ObjCClassDecl;
	}

	/// Return the interface declaration named "Protocol", creating it in the
	/// translation unit and caching it on the first call.
	ObjCInterfaceDecl *ASTContext::getObjCProtocolDecl() const {
	  if (!ObjCProtocolClassDecl) {
	    ObjCProtocolClassDecl
	      = ObjCInterfaceDecl::Create(*this, getTranslationUnitDecl(),
	                                  SourceLocation(),
	                                  &Idents.get("Protocol"),
	                                  /*typeParamList=*/nullptr,
	                                  /*PrevDecl=*/nullptr,
	                                  SourceLocation(), true);
	  }

	  return ObjCProtocolClassDecl;
	}

	//===----------------------------------------------------------------------===//
	// __builtin_va_list Construction Functions
	//===----------------------------------------------------------------------===//

	/// Build an implicit typedef called \p Name whose underlying type is
	/// 'char *'.
	static TypedefDecl *CreateCharPtrNamedVaListDecl(const ASTContext *Context,
	                                                 StringRef Name) {
	  // typedef char* __builtin[_ms]_va_list;
	  QualType T = Context->getPointerType(Context->CharTy);
	  return Context->buildImplicitTypedef(T, Name);
	}

	/// Build the implicit 'char *' typedef named "__builtin_ms_va_list".
	static TypedefDecl *CreateMSVaListDecl(const ASTContext *Context) {
	  return CreateCharPtrNamedVaListDecl(Context, "__builtin_ms_va_list");
	}

	/// Build the implicit 'char *' typedef named "__builtin_va_list".
	static TypedefDecl *CreateCharPtrBuiltinVaListDecl(const ASTContext *Context) {
	  return CreateCharPtrNamedVaListDecl(Context, "__builtin_va_list");
	}

	/// Build the implicit typedef "__builtin_va_list" whose underlying type is
	/// 'void *'.
	static TypedefDecl *CreateVoidPtrBuiltinVaListDecl(const ASTContext *Context) {
	  // typedef void* __builtin_va_list;
	  QualType T = Context->getPointerType(Context->VoidTy);
	  return Context->buildImplicitTypedef(T, "__builtin_va_list");
	}

	/// Build the implicit record "__va_list" (placed in an implicit namespace
	/// "std" when compiling C++) with five public fields, record it as
	/// Context->VaListTagDecl, and return an implicit typedef
	/// "__builtin_va_list" that names the record type.
	static TypedefDecl *
	CreateAArch64ABIBuiltinVaListDecl(const ASTContext *Context) {
	  // struct __va_list
	  RecordDecl *VaListTagDecl = Context->buildImplicitRecord("__va_list");
	  if (Context->getLangOpts().CPlusPlus) {
	    // namespace std { struct __va_list {
	    NamespaceDecl *NS;
	    NS = NamespaceDecl::Create(const_cast<ASTContext &>(*Context),
	                               Context->getTranslationUnitDecl(),
	                               /*Inline*/ false, SourceLocation(),
	                               SourceLocation(), &Context->Idents.get("std"),
	                               /*PrevDecl*/ nullptr);
	    NS->setImplicit();
	    VaListTagDecl->setDeclContext(NS);
	  }

	  VaListTagDecl->startDefinition();

	  const size_t NumFields = 5;
	  QualType FieldTypes[NumFields];
	  const char *FieldNames[NumFields];

	  // void *__stack;
	  FieldTypes[0] = Context->getPointerType(Context->VoidTy);
	  FieldNames[0] = "__stack";

	  // void *__gr_top;
	  FieldTypes[1] = Context->getPointerType(Context->VoidTy);
	  FieldNames[1] = "__gr_top";

	  // void *__vr_top;
	  FieldTypes[2] = Context->getPointerType(Context->VoidTy);
	  FieldNames[2] = "__vr_top";

	  // int __gr_offs;
	  FieldTypes[3] = Context->IntTy;
	  FieldNames[3] = "__gr_offs";

	  // int __vr_offs;
	  FieldTypes[4] = Context->IntTy;
	  FieldNames[4] = "__vr_offs";

	  // Create fields
	  for (unsigned i = 0; i < NumFields; ++i) {
	    FieldDecl *Field = FieldDecl::Create(const_cast<ASTContext &>(*Context),
	                                         VaListTagDecl,
	                                         SourceLocation(),
	                                         SourceLocation(),
	                                         &Context->Idents.get(FieldNames[i]),
	                                         FieldTypes[i], /*TInfo=*/nullptr,
	                                         /*BitWidth=*/nullptr,
	                                         /*Mutable=*/false,
	                                         ICIS_NoInit);
	    Field->setAccess(AS_public);
	    VaListTagDecl->addDecl(Field);
	  }
	  VaListTagDecl->completeDefinition();
	  Context->VaListTagDecl = VaListTagDecl;
	  QualType VaListTagType = Context->getRecordType(VaListTagDecl);

	  // } __builtin_va_list;
	  return Context->buildImplicitTypedef(VaListTagType, "__builtin_va_list");
	}

	/// Build the implicit record "__va_list_tag" with five public fields,
	/// record it as Context->VaListTagDecl, wrap it in an implicit typedef
	/// "__va_list_tag", and return an implicit typedef "__builtin_va_list" for
	/// a one-element array of that typedef type.
	static TypedefDecl *CreatePowerABIBuiltinVaListDecl(const ASTContext *Context) {
	  // typedef struct __va_list_tag {
	  RecordDecl *VaListTagDecl;

	  VaListTagDecl = Context->buildImplicitRecord("__va_list_tag");
	  VaListTagDecl->startDefinition();

	  const size_t NumFields = 5;
	  QualType FieldTypes[NumFields];
	  const char *FieldNames[NumFields];

	  // unsigned char gpr;
	  FieldTypes[0] = Context->UnsignedCharTy;
	  FieldNames[0] = "gpr";

	  // unsigned char fpr;
	  FieldTypes[1] = Context->UnsignedCharTy;
	  FieldNames[1] = "fpr";

	  // unsigned short reserved;
	  FieldTypes[2] = Context->UnsignedShortTy;
	  FieldNames[2] = "reserved";

	  // void* overflow_arg_area;
	  FieldTypes[3] = Context->getPointerType(Context->VoidTy);
	  FieldNames[3] = "overflow_arg_area";

	  // void* reg_save_area;
	  FieldTypes[4] = Context->getPointerType(Context->VoidTy);
	  FieldNames[4] = "reg_save_area";

	  // Create fields
	  for (unsigned i = 0; i < NumFields; ++i) {
	    FieldDecl *Field = FieldDecl::Create(*Context, VaListTagDecl,
	                                         SourceLocation(),
	                                         SourceLocation(),
	                                         &Context->Idents.get(FieldNames[i]),
	                                         FieldTypes[i], /*TInfo=*/nullptr,
	                                         /*BitWidth=*/nullptr,
	                                         /*Mutable=*/false,
	                                         ICIS_NoInit);
	    Field->setAccess(AS_public);
	    VaListTagDecl->addDecl(Field);
	  }
	  VaListTagDecl->completeDefinition();
	  Context->VaListTagDecl = VaListTagDecl;
	  QualType VaListTagType = Context->getRecordType(VaListTagDecl);

	  // } __va_list_tag;
	  TypedefDecl *VaListTagTypedefDecl =
	      Context->buildImplicitTypedef(VaListTagType, "__va_list_tag");

	  QualType VaListTagTypedefType =
	    Context->getTypedefType(VaListTagTypedefDecl);

	  // typedef __va_list_tag __builtin_va_list[1];
	  llvm::APInt Size(Context->getTypeSize(Context->getSizeType()), 1);
	  QualType VaListTagArrayType
	    = Context->getConstantArrayType(VaListTagTypedefType,
	                                    Size, nullptr, ArrayType::Normal, 0);
	  return Context->buildImplicitTypedef(VaListTagArrayType, "__builtin_va_list");
	}

	/// Build the implicit record "__va_list_tag" with four public fields,
	/// record it as Context->VaListTagDecl, and return an implicit typedef
	/// "__builtin_va_list" for a one-element array of the record type.
	static TypedefDecl *
	CreateX86_64ABIBuiltinVaListDecl(const ASTContext *Context) {
	  // struct __va_list_tag {
	  RecordDecl *VaListTagDecl;
	  VaListTagDecl = Context->buildImplicitRecord("__va_list_tag");
	  VaListTagDecl->startDefinition();

	  const size_t NumFields = 4;
	  QualType FieldTypes[NumFields];
	  const char *FieldNames[NumFields];

	  // unsigned gp_offset;
	  FieldTypes[0] = Context->UnsignedIntTy;
	  FieldNames[0] = "gp_offset";

	  // unsigned fp_offset;
	  FieldTypes[1] = Context->UnsignedIntTy;
	  FieldNames[1] = "fp_offset";

	  // void* overflow_arg_area;
	  FieldTypes[2] = Context->getPointerType(Context->VoidTy);
	  FieldNames[2] = "overflow_arg_area";

	  // void* reg_save_area;
	  FieldTypes[3] = Context->getPointerType(Context->VoidTy);
	  FieldNames[3] = "reg_save_area";

	  // Create fields
	  for (unsigned i = 0; i < NumFields; ++i) {
	    FieldDecl *Field = FieldDecl::Create(const_cast<ASTContext &>(*Context),
	                                         VaListTagDecl,
	                                         SourceLocation(),
	                                         SourceLocation(),
	                                         &Context->Idents.get(FieldNames[i]),
	                                         FieldTypes[i], /*TInfo=*/nullptr,
	                                         /*BitWidth=*/nullptr,
	                                         /*Mutable=*/false,
	                                         ICIS_NoInit);
	    Field->setAccess(AS_public);
	    VaListTagDecl->addDecl(Field);
	  }
	  VaListTagDecl->completeDefinition();
	  Context->VaListTagDecl = VaListTagDecl;
	  QualType VaListTagType = Context->getRecordType(VaListTagDecl);

	  // };

	  // typedef struct __va_list_tag __builtin_va_list[1];
	  llvm::APInt Size(Context->getTypeSize(Context->getSizeType()), 1);
	  QualType VaListTagArrayType = Context->getConstantArrayType(
	      VaListTagType, Size, nullptr, ArrayType::Normal, 0);
	  return Context->buildImplicitTypedef(VaListTagArrayType, "__builtin_va_list");
	}

	/// Build the implicit typedef "__builtin_va_list" for a four-element array
	/// of 'int'.
	static TypedefDecl *CreatePNaClABIBuiltinVaListDecl(const ASTContext *Context) {
	  // typedef int __builtin_va_list[4];
	  llvm::APInt Size(Context->getTypeSize(Context->getSizeType()), 4);
	  QualType IntArrayType = Context->getConstantArrayType(
	      Context->IntTy, Size, nullptr, ArrayType::Normal, 0);
	  return Context->buildImplicitTypedef(IntArrayType, "__builtin_va_list");
	}

	/// Build the implicit record "__va_list" (placed in an implicit namespace
	/// "std" when compiling C++) with a single public 'void *' field "__ap",
	/// record it as Context->VaListTagDecl, and return an implicit typedef
	/// "__builtin_va_list" that names the record type.
	static TypedefDecl *
	CreateAAPCSABIBuiltinVaListDecl(const ASTContext *Context) {
	  // struct __va_list
	  RecordDecl *VaListDecl = Context->buildImplicitRecord("__va_list");
	  if (Context->getLangOpts().CPlusPlus) {
	    // namespace std { struct __va_list {
	    NamespaceDecl *NS;
	    NS = NamespaceDecl::Create(const_cast<ASTContext &>(*Context),
	                               Context->getTranslationUnitDecl(),
	                               /*Inline*/false, SourceLocation(),
	                               SourceLocation(), &Context->Idents.get("std"),
	                               /*PrevDecl*/ nullptr);
	    NS->setImplicit();
	    VaListDecl->setDeclContext(NS);
	  }

	  VaListDecl->startDefinition();

	  // void * __ap;
	  FieldDecl *Field = FieldDecl::Create(const_cast<ASTContext &>(*Context),
	                                       VaListDecl,
	                                       SourceLocation(),
	                                       SourceLocation(),
	                                       &Context->Idents.get("__ap"),
	                                       Context->getPointerType(Context->VoidTy),
	                                       /*TInfo=*/nullptr,
	                                       /*BitWidth=*/nullptr,
	                                       /*Mutable=*/false,
	                                       ICIS_NoInit);
	  Field->setAccess(AS_public);
	  VaListDecl->addDecl(Field);

	  // };
	  VaListDecl->completeDefinition();
	  Context->VaListTagDecl = VaListDecl;

	  // typedef struct __va_list __builtin_va_list;
	  QualType T = Context->getRecordType(VaListDecl);
	  return Context->buildImplicitTypedef(T, "__builtin_va_list");
	}

	/// Build the implicit record "__va_list_tag" with four public fields,
	/// record it as Context->VaListTagDecl, and return an implicit typedef
	/// "__builtin_va_list" for a one-element array of the record type.
	static TypedefDecl *
	CreateSystemZBuiltinVaListDecl(const ASTContext *Context) {
	  // struct __va_list_tag {
	  RecordDecl *VaListTagDecl;
	  VaListTagDecl = Context->buildImplicitRecord("__va_list_tag");
	  VaListTagDecl->startDefinition();

	  const size_t NumFields = 4;
	  QualType FieldTypes[NumFields];
	  const char *FieldNames[NumFields];

	  // long __gpr;
	  FieldTypes[0] = Context->LongTy;
	  FieldNames[0] = "__gpr";

	  // long __fpr;
	  FieldTypes[1] = Context->LongTy;
	  FieldNames[1] = "__fpr";

	  // void *__overflow_arg_area;
	  FieldTypes[2] = Context->getPointerType(Context->VoidTy);
	  FieldNames[2] = "__overflow_arg_area";

	  // void *__reg_save_area;
	  FieldTypes[3] = Context->getPointerType(Context->VoidTy);
	  FieldNames[3] = "__reg_save_area";

	  // Create fields
	  for (unsigned i = 0; i < NumFields; ++i) {
	    FieldDecl *Field = FieldDecl::Create(const_cast<ASTContext &>(*Context),
	                                         VaListTagDecl,
	                                         SourceLocation(),
	                                         SourceLocation(),
	                                         &Context->Idents.get(FieldNames[i]),
	                                         FieldTypes[i], /*TInfo=*/nullptr,
	                                         /*BitWidth=*/nullptr,
	                                         /*Mutable=*/false,
	                                         ICIS_NoInit);
	    Field->setAccess(AS_public);
	    VaListTagDecl->addDecl(Field);
	  }
	  VaListTagDecl->completeDefinition();
	  Context->VaListTagDecl = VaListTagDecl;
	  QualType VaListTagType = Context->getRecordType(VaListTagDecl);

	  // };

	  // typedef __va_list_tag __builtin_va_list[1];
	  llvm::APInt Size(Context->getTypeSize(Context->getSizeType()), 1);
	  QualType VaListTagArrayType = Context->getConstantArrayType(
	      VaListTagType, Size, nullptr, ArrayType::Normal, 0);

	  return Context->buildImplicitTypedef(VaListTagArrayType, "__builtin_va_list");
	}

	/// Build the implicit record "__va_list_tag" with three public 'void *'
	/// fields, record it as Context->VaListTagDecl, wrap it in an implicit
	/// typedef "__va_list_tag", and return an implicit typedef
	/// "__builtin_va_list" for a one-element array of that typedef type.
	static TypedefDecl *CreateHexagonBuiltinVaListDecl(const ASTContext *Context) {
	  // typedef struct __va_list_tag {
	  RecordDecl *VaListTagDecl;
	  VaListTagDecl = Context->buildImplicitRecord("__va_list_tag");
	  VaListTagDecl->startDefinition();

	  const size_t NumFields = 3;
	  QualType FieldTypes[NumFields];
	  const char *FieldNames[NumFields];

	  // void *CurrentSavedRegisterArea;
	  FieldTypes[0] = Context->getPointerType(Context->VoidTy);
	  FieldNames[0] = "__current_saved_reg_area_pointer";

	  // void *SavedRegAreaEnd;
	  FieldTypes[1] = Context->getPointerType(Context->VoidTy);
	  FieldNames[1] = "__saved_reg_area_end_pointer";

	  // void *OverflowArea;
	  FieldTypes[2] = Context->getPointerType(Context->VoidTy);
	  FieldNames[2] = "__overflow_area_pointer";

	  // Create fields
	  for (unsigned i = 0; i < NumFields; ++i) {
	    FieldDecl *Field = FieldDecl::Create(
	        const_cast<ASTContext &>(*Context), VaListTagDecl, SourceLocation(),
	        SourceLocation(), &Context->Idents.get(FieldNames[i]), FieldTypes[i],
	        /*TInfo=*/nullptr,
	        /*BitWidth=*/nullptr,
	        /*Mutable=*/false, ICIS_NoInit);
	    Field->setAccess(AS_public);
	    VaListTagDecl->addDecl(Field);
	  }
	  VaListTagDecl->completeDefinition();
	  Context->VaListTagDecl = VaListTagDecl;
	  QualType VaListTagType = Context->getRecordType(VaListTagDecl);

	  // } __va_list_tag;
	  TypedefDecl *VaListTagTypedefDecl =
	      Context->buildImplicitTypedef(VaListTagType, "__va_list_tag");

	  QualType VaListTagTypedefType = Context->getTypedefType(VaListTagTypedefDecl);

	  // typedef __va_list_tag __builtin_va_list[1];
	  llvm::APInt Size(Context->getTypeSize(Context->getSizeType()), 1);
	  QualType VaListTagArrayType = Context->getConstantArrayType(
	      VaListTagTypedefType, Size, nullptr, ArrayType::Normal, 0);

	  return Context->buildImplicitTypedef(VaListTagArrayType, "__builtin_va_list");
	}

	/// Dispatch on \p Kind to the matching __builtin_va_list builder and return
	/// the typedef it creates. Every enumerator handled here returns from the
	/// switch; falling out of it is reported as unreachable.
	static TypedefDecl *CreateVaListDecl(const ASTContext *Context,
	                                     TargetInfo::BuiltinVaListKind Kind) {
	  switch (Kind) {
	  case TargetInfo::CharPtrBuiltinVaList:
	    return CreateCharPtrBuiltinVaListDecl(Context);
	  case TargetInfo::VoidPtrBuiltinVaList:
	    return CreateVoidPtrBuiltinVaListDecl(Context);
	  case TargetInfo::AArch64ABIBuiltinVaList:
	    return CreateAArch64ABIBuiltinVaListDecl(Context);
	  case TargetInfo::PowerABIBuiltinVaList:
	    return CreatePowerABIBuiltinVaListDecl(Context);
	  case TargetInfo::X86_64ABIBuiltinVaList:
	    return CreateX86_64ABIBuiltinVaListDecl(Context);
	  case TargetInfo::PNaClABIBuiltinVaList:
	    return CreatePNaClABIBuiltinVaListDecl(Context);
	  case TargetInfo::AAPCSABIBuiltinVaList:
	    return CreateAAPCSABIBuiltinVaListDecl(Context);
	  case TargetInfo::SystemZBuiltinVaList:
	    return CreateSystemZBuiltinVaListDecl(Context);
	  case TargetInfo::HexagonBuiltinVaList:
	    return CreateHexagonBuiltinVaListDecl(Context);
	  }

	  llvm_unreachable("Unhandled __builtin_va_list type kind");
	}

	/// Return the __builtin_va_list typedef, creating it for the target's
	/// va_list kind and caching it on the first call.
	TypedefDecl *ASTContext::getBuiltinVaListDecl() const {
	  if (BuiltinVaListDecl)
	    return BuiltinVaListDecl;

	  BuiltinVaListDecl = CreateVaListDecl(this, Target->getBuiltinVaListKind());
	  assert(BuiltinVaListDecl->isImplicit());
	  return BuiltinVaListDecl;
	}

	/// Return VaListTagDecl. If it has not been set yet, the __builtin_va_list
	/// declaration is built first so that the tag declaration gets a chance to
	/// be created along with it.
	Decl *ASTContext::getVaListTagDecl() const {
	  if (VaListTagDecl)
	    return VaListTagDecl;

	  // Building __builtin_va_list is what populates VaListTagDecl.
	  (void)getBuiltinVaListDecl();
	  return VaListTagDecl;
	}

	/// Return the __builtin_ms_va_list typedef, creating and caching it on the
	/// first call.
	TypedefDecl *ASTContext::getBuiltinMSVaListDecl() const {
	  if (BuiltinMSVaListDecl)
	    return BuiltinMSVaListDecl;

	  BuiltinMSVaListDecl = CreateMSVaListDecl(this);
	  return BuiltinMSVaListDecl;
	}

	bool ASTContext::canBuiltinBeRedeclared(const FunctionDecl *FD) const {
	return BuiltinInfo.canBeRedeclared(FD->getBuiltinID());
	}

	/// Set ObjCConstantStringType to the interface type of \p Decl. The type
	/// must not have been set before (asserted).
	void ASTContext::setObjCConstantStringInterface(ObjCInterfaceDecl *Decl) {
	  assert(ObjCConstantStringType.isNull() &&
	         "'NSConstantString' type already set!");

	  QualType InterfaceTy = getObjCInterfaceType(Decl);
	  ObjCConstantStringType = InterfaceTy;
	}

	/// Retrieve the template name that corresponds to a non-empty
	/// lookup.
	///
	/// The declarations in [Begin, End) are copied into the trailing storage of
	/// a newly allocated OverloadedTemplateStorage. The range must contain more
	/// than one declaration (asserted).
	TemplateName
	ASTContext::getOverloadedTemplateName(UnresolvedSetIterator Begin,
	                                      UnresolvedSetIterator End) const {
	  unsigned size = End - Begin;
	  assert(size > 1 && "set is not overloaded!");

	  // One allocation holds the header followed by `size` declaration pointers.
	  void *memory = Allocate(sizeof(OverloadedTemplateStorage) +
	                          size * sizeof(FunctionTemplateDecl*));
	  auto *OT = new (memory) OverloadedTemplateStorage(size);

	  NamedDecl **Storage = OT->getStorage();
	  for (UnresolvedSetIterator I = Begin; I != End; ++I) {
	    NamedDecl *D = *I;
	    assert(isa<FunctionTemplateDecl>(D) ||
	           isa<UnresolvedUsingValueDecl>(D) ||
	           (isa<UsingShadowDecl>(D) &&
	            isa<FunctionTemplateDecl>(D->getUnderlyingDecl())));
	    *Storage++ = D;
	  }

	  return TemplateName(OT);
	}

	/// Retrieve a template name representing an unqualified-id that has been
	/// assumed to name a template for ADL purposes.
	TemplateName ASTContext::getAssumedTemplateName(DeclarationName Name) const {
	  // Allocate through the context's placement operator new; passing the bare
	  // `this` pointer would instead construct the storage on top of the context
	  // object itself.
	  auto *OT = new (*this) AssumedTemplateStorage(Name);
	  return TemplateName(OT);
	}

	/// Retrieve the template name that represents a qualified
	/// template name such as \c std::vector.
	///
	/// Nodes are uniqued in QualifiedTemplateNames: an existing node with the
	/// same profile is reused, otherwise a new one is allocated and inserted.
	TemplateName
	ASTContext::getQualifiedTemplateName(NestedNameSpecifier *NNS,
	                                     bool TemplateKeyword,
	                                     TemplateDecl *Template) const {
	  assert(NNS && "Missing nested-name-specifier in qualified template name");

	  // FIXME: Canonicalization?
	  llvm::FoldingSetNodeID Profile;
	  QualifiedTemplateName::Profile(Profile, NNS, TemplateKeyword, Template);

	  void *Where = nullptr;
	  if (QualifiedTemplateName *Existing =
	          QualifiedTemplateNames.FindNodeOrInsertPos(Profile, Where))
	    return TemplateName(Existing);

	  QualifiedTemplateName *Fresh =
	      new (*this, alignof(QualifiedTemplateName))
	          QualifiedTemplateName(NNS, TemplateKeyword, Template);
	  QualifiedTemplateNames.InsertNode(Fresh, Where);
	  return TemplateName(Fresh);
	}

	/// Retrieve the template name that represents a dependent
	/// template name such as \c MetaFun::template apply.
	///
	/// Nodes are uniqued in DependentTemplateNames. When \p NNS is not already
	/// its own canonical form, the node built here also stores the template
	/// name obtained for the canonical nested-name-specifier.
	TemplateName
	ASTContext::getDependentTemplateName(NestedNameSpecifier *NNS,
	                                     const IdentifierInfo *Name) const {
	  assert((!NNS || NNS->isDependent()) &&
	         "Nested name specifier must be dependent");

	  llvm::FoldingSetNodeID ID;
	  DependentTemplateName::Profile(ID, NNS, Name);

	  void *InsertPos = nullptr;
	  DependentTemplateName *QTN =
	    DependentTemplateNames.FindNodeOrInsertPos(ID, InsertPos);

	  if (QTN)
	    return TemplateName(QTN);

	  NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS);
	  if (CanonNNS == NNS) {
	    QTN = new (*this, alignof(DependentTemplateName))
	        DependentTemplateName(NNS, Name);
	  } else {
	    TemplateName Canon = getDependentTemplateName(CanonNNS, Name);
	    QTN = new (*this, alignof(DependentTemplateName))
	        DependentTemplateName(NNS, Name, Canon);
	    // The recursive call may have inserted into the set, so look the
	    // insertion position up again before inserting this node.
	    DependentTemplateName *CheckQTN =
	      DependentTemplateNames.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!CheckQTN && "Dependent template name canonicalization broken");
	    (void)CheckQTN;
	  }

	  DependentTemplateNames.InsertNode(QTN, InsertPos);
	  return TemplateName(QTN);
	}

	/// Retrieve the template name that represents a dependent
	/// template name such as \c MetaFun::template operator+.
	///
	/// Nodes are uniqued in DependentTemplateNames. When \p NNS is not already
	/// its own canonical form, the node built here also stores the template
	/// name obtained for the canonical nested-name-specifier.
	TemplateName
	ASTContext::getDependentTemplateName(NestedNameSpecifier *NNS,
	                                     OverloadedOperatorKind Operator) const {
	  assert((!NNS || NNS->isDependent()) &&
	         "Nested name specifier must be dependent");

	  llvm::FoldingSetNodeID ID;
	  DependentTemplateName::Profile(ID, NNS, Operator);

	  void *InsertPos = nullptr;
	  DependentTemplateName *QTN
	    = DependentTemplateNames.FindNodeOrInsertPos(ID, InsertPos);

	  if (QTN)
	    return TemplateName(QTN);

	  NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS);
	  if (CanonNNS == NNS) {
	    QTN = new (*this, alignof(DependentTemplateName))
	        DependentTemplateName(NNS, Operator);
	  } else {
	    TemplateName Canon = getDependentTemplateName(CanonNNS, Operator);
	    QTN = new (*this, alignof(DependentTemplateName))
	        DependentTemplateName(NNS, Operator, Canon);

	    // The recursive call may have inserted into the set, so look the
	    // insertion position up again before inserting this node.
	    DependentTemplateName *CheckQTN
	      = DependentTemplateNames.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!CheckQTN && "Dependent template name canonicalization broken");
	    (void)CheckQTN;
	  }

	  DependentTemplateNames.InsertNode(QTN, InsertPos);
	  return TemplateName(QTN);
	}

	/// Return the uniqued template name recording that \p param was replaced by
	/// \p replacement. An existing node in SubstTemplateTemplateParms with the
	/// same profile is reused; otherwise a new one is allocated and inserted.
	TemplateName
	ASTContext::getSubstTemplateTemplateParm(TemplateTemplateParmDecl *param,
	                                         TemplateName replacement) const {
	  llvm::FoldingSetNodeID Profile;
	  SubstTemplateTemplateParmStorage::Profile(Profile, param, replacement);

	  void *Where = nullptr;
	  if (SubstTemplateTemplateParmStorage *Existing =
	          SubstTemplateTemplateParms.FindNodeOrInsertPos(Profile, Where))
	    return TemplateName(Existing);

	  SubstTemplateTemplateParmStorage *Fresh =
	      new (*this) SubstTemplateTemplateParmStorage(param, replacement);
	  SubstTemplateTemplateParms.InsertNode(Fresh, Where);
	  return TemplateName(Fresh);
	}

	/// Return the uniqued template name recording that the parameter pack
	/// \p Param was replaced by the argument pack \p ArgPack. An existing node
	/// in SubstTemplateTemplateParmPacks with the same profile is reused;
	/// otherwise a new one is allocated and inserted.
	TemplateName
	ASTContext::getSubstTemplateTemplateParmPack(TemplateTemplateParmDecl *Param,
	                                       const TemplateArgument &ArgPack) const {
	  // Profiling takes a non-const context.
	  ASTContext &MutableThis = const_cast<ASTContext &>(*this);
	  llvm::FoldingSetNodeID Profile;
	  SubstTemplateTemplateParmPackStorage::Profile(Profile, MutableThis, Param,
	                                                ArgPack);

	  void *Where = nullptr;
	  if (SubstTemplateTemplateParmPackStorage *Existing =
	          SubstTemplateTemplateParmPacks.FindNodeOrInsertPos(Profile, Where))
	    return TemplateName(Existing);

	  SubstTemplateTemplateParmPackStorage *Fresh =
	      new (*this) SubstTemplateTemplateParmPackStorage(
	          Param, ArgPack.pack_size(), ArgPack.pack_begin());
	  SubstTemplateTemplateParmPacks.InsertNode(Fresh, Where);
	  return TemplateName(Fresh);
	}

	/// Map a TargetInfo integer-type enumerator to the corresponding canonical
	/// type of this context.
	///
	/// \param Type A value of type @c TargetInfo::IntType, passed as unsigned.
	/// \returns The matching builtin type, or a default-constructed (null)
	///          CanQualType for TargetInfo::NoInt.
	CanQualType ASTContext::getFromTargetType(unsigned Type) const {
	  switch (Type) {
	  case TargetInfo::NoInt:
	    return {};
	  case TargetInfo::SignedChar:
	    return SignedCharTy;
	  case TargetInfo::UnsignedChar:
	    return UnsignedCharTy;
	  case TargetInfo::SignedShort:
	    return ShortTy;
	  case TargetInfo::UnsignedShort:
	    return UnsignedShortTy;
	  case TargetInfo::SignedInt:
	    return IntTy;
	  case TargetInfo::UnsignedInt:
	    return UnsignedIntTy;
	  case TargetInfo::SignedLong:
	    return LongTy;
	  case TargetInfo::UnsignedLong:
	    return UnsignedLongTy;
	  case TargetInfo::SignedLongLong:
	    return LongLongTy;
	  case TargetInfo::UnsignedLongLong:
	    return UnsignedLongLongTy;
	  }

	  llvm_unreachable("Unhandled TargetInfo::IntType value");
	}

	//===----------------------------------------------------------------------===//
	// Type Predicates.
	//===----------------------------------------------------------------------===//

	/// getObjCGCAttrKind - Returns one of GCNone, Weak or Strong objc's
	/// garbage collection attribute.
	///
	/// Always GCNone when the language options select NonGC. Otherwise an
	/// explicit attribute on \p Ty wins; without one, Objective-C object and
	/// block pointers are Strong, and other pointers take the kind of their
	/// pointee.
	Qualifiers::GC ASTContext::getObjCGCAttrKind(QualType Ty) const {
	  if (getLangOpts().getGC() == LangOptions::NonGC)
	    return Qualifiers::GCNone;

	  assert(getLangOpts().ObjC);
	  Qualifiers::GC GCAttrs = Ty.getObjCGCAttr();

	  // Default behaviour under objective-C's gc is for ObjC pointers
	  // (or pointers to them) be treated as though they were declared
	  // as __strong.
	  if (GCAttrs == Qualifiers::GCNone) {
	    if (Ty->isObjCObjectPointerType() || Ty->isBlockPointerType())
	      return Qualifiers::Strong;
	    else if (Ty->isPointerType())
	      return getObjCGCAttrKind(Ty->castAs<PointerType>()->getPointeeType());
	  } else {
	    // It's not valid to set GC attributes on anything that isn't a
	    // pointer.
	#ifndef NDEBUG
	    QualType CT = Ty->getCanonicalTypeInternal();
	    while (const auto *AT = dyn_cast<ArrayType>(CT))
	      CT = AT->getElementType();
	    assert(CT->isAnyPointerType() || CT->isBlockPointerType());
	#endif
	  }
	  return GCAttrs;
	}

	//===----------------------------------------------------------------------===//
	// Type Compatibility Testing
	//===----------------------------------------------------------------------===//

	/// areCompatVectorTypes - Two canonical, unqualified vector types are
	/// compatible when both their element type and their element count agree.
	static bool areCompatVectorTypes(const VectorType *LHS,
	                                 const VectorType *RHS) {
	  assert(LHS->isCanonicalUnqualified() && RHS->isCanonicalUnqualified());
	  if (LHS->getElementType() != RHS->getElementType())
	    return false;
	  return LHS->getNumElements() == RHS->getNumElements();
	}

	/// areCompatMatrixTypes - Two canonical, unqualified constant matrix types
	/// are compatible when element type, row count and column count all agree.
	static bool areCompatMatrixTypes(const ConstantMatrixType *LHS,
	                                 const ConstantMatrixType *RHS) {
	  assert(LHS->isCanonicalUnqualified() && RHS->isCanonicalUnqualified());
	  if (LHS->getElementType() != RHS->getElementType())
	    return false;
	  if (LHS->getNumRows() != RHS->getNumRows())
	    return false;
	  return LHS->getNumColumns() == RHS->getNumColumns();
	}

	/// Return true if the two vector types are compatible. Both arguments must
	/// be vector types. Types that are the same apart from qualifiers are
	/// always compatible; otherwise the element count and element type must
	/// match and neither side may be an AltiVec pixel or AltiVec bool vector.
	bool ASTContext::areCompatibleVectorTypes(QualType FirstVec,
	                                          QualType SecondVec) {
	  assert(FirstVec->isVectorType() && "FirstVec should be a vector type");
	  assert(SecondVec->isVectorType() && "SecondVec should be a vector type");

	  if (hasSameUnqualifiedType(FirstVec, SecondVec))
	    return true;

	  // Treat Neon vector types and most AltiVec vector types as if they are the
	  // equivalent GCC vector types.
	  const auto *First = FirstVec->castAs<VectorType>();
	  const auto *Second = SecondVec->castAs<VectorType>();

	  // AltiVec 'pixel' and 'bool' vectors never take part in this relaxation.
	  auto isInterchangeableKind = [](const VectorType *VT) {
	    return VT->getVectorKind() != VectorType::AltiVecPixel &&
	           VT->getVectorKind() != VectorType::AltiVecBool;
	  };

	  if (First->getNumElements() != Second->getNumElements())
	    return false;
	  if (!hasSameType(First->getElementType(), Second->getElementType()))
	    return false;
	  return isInterchangeableKind(First) && isInterchangeableKind(Second);
	}

	/// Return true if \p Ty carries an ObjCOwnership attribute that is reachable
	/// by peeling only AttributedType and ParenType sugar. Any other kind of
	/// type node stops the search with a negative answer.
	bool ASTContext::hasDirectOwnershipQualifier(QualType Ty) const {
	  for (;;) {
	    // __strong id
	    if (const auto *Attributed = dyn_cast<AttributedType>(Ty)) {
	      if (Attributed->getAttrKind() == attr::ObjCOwnership)
	        return true;
	      Ty = Attributed->getModifiedType();
	      continue;
	    }

	    // X *__strong (...)
	    if (const auto *Parenthesized = dyn_cast<ParenType>(Ty)) {
	      Ty = Parenthesized->getInnerType();
	      continue;
	    }

	    // We do not want to look through typedefs, typeof(expr),
	    // typeof(type), or any other way that the type is somehow
	    // abstracted.
	    return false;
	  }
	}

	//===----------------------------------------------------------------------===//
	// ObjCQualifiedIdTypesAreCompatible - Compatibility testing for qualified id's.
	//===----------------------------------------------------------------------===//

	/// ProtocolCompatibleWithProtocol - return 'true' if 'lProto' is in the
	/// inheritance hierarchy of 'rProto', i.e. it declares the same entity as
	/// 'rProto' itself or as a protocol reachable through the protocols that
	/// 'rProto' lists.
	bool
	ASTContext::ProtocolCompatibleWithProtocol(ObjCProtocolDecl *lProto,
	                                           ObjCProtocolDecl *rProto) const {
	  if (!declaresSameEntity(lProto, rProto)) {
	    // Not the same protocol: search each protocol rProto refers to.
	    for (auto *Inherited : rProto->protocols()) {
	      if (ProtocolCompatibleWithProtocol(lProto, Inherited))
	        return true;
	    }
	    return false;
	  }
	  return true;
	}

	/// ObjCQualifiedClassTypesAreCompatible - compare Class<pr,...> and
	/// Class<pr1, ...>.
	bool ASTContext::ObjCQualifiedClassTypesAreCompatible(
	const ObjCObjectPointerType lhs, const ObjCObjectPointerType rhs) {
	for (auto *lhsProto : lhs->quals()) {
	bool match = false;
	for (auto *rhsProto : rhs->quals()) {
	if (ProtocolCompatibleWithProtocol(lhsProto, rhsProto)) {
	match = true;
	break;
	}
	}
	if (!match)
	return false;
	}
	return true;
	}

	/// ObjCQualifiedIdTypesAreCompatible - We know that one of lhs/rhs is an
	/// ObjCQualifiedIDType.
	bool ASTContext::ObjCQualifiedIdTypesAreCompatible(
	const ObjCObjectPointerType lhs, const ObjCObjectPointerType rhs,
	bool compare) {
	// Allow id<P..> and an 'id' in all cases.
	if (lhs->isObjCIdType() \|\| rhs->isObjCIdType())
	return true;

	// Don't allow id<P..> to convert to Class or Class<P..> in either direction.
	if (lhs->isObjCClassType() \|\| lhs->isObjCQualifiedClassType() \|\|
	rhs->isObjCClassType() \|\| rhs->isObjCQualifiedClassType())
	return false;

	if (lhs->isObjCQualifiedIdType()) {
	if (rhs->qual_empty()) {
	// If the RHS is a unqualified interface pointer "NSString*",
	// make sure we check the class hierarchy.
	if (ObjCInterfaceDecl *rhsID = rhs->getInterfaceDecl()) {
	for (auto *I : lhs->quals()) {
	// when comparing an id<P> on lhs with a static type on rhs,
	// see if static class implements all of id's protocols, directly or
	// through its super class and categories.
	if (!rhsID->ClassImplementsProtocol(I, true))
	return false;
	}
	}
	// If there are no qualifiers and no interface, we have an 'id'.
	return true;
	}
	// Both the right and left sides have qualifiers.
	for (auto *lhsProto : lhs->quals()) {
	bool match = false;

	// when comparing an id<P> on lhs with a static type on rhs,
	// see if static class implements all of id's protocols, directly or
	// through its super class and categories.
	for (auto *rhsProto : rhs->quals()) {
	if (ProtocolCompatibleWithProtocol(lhsProto, rhsProto) \|\|
	(compare && ProtocolCompatibleWithProtocol(rhsProto, lhsProto))) {
	match = true;
	break;
	}
	}
	// If the RHS is a qualified interface pointer "NSString<P>*",
	// make sure we check the class hierarchy.
	if (ObjCInterfaceDecl *rhsID = rhs->getInterfaceDecl()) {
	for (auto *I : lhs->quals()) {
	// when comparing an id<P> on lhs with a static type on rhs,
	// see if static class implements all of id's protocols, directly or
	// through its super class and categories.
	if (rhsID->ClassImplementsProtocol(I, true)) {
	match = true;
	break;
	}
	}
	}
	if (!match)
	return false;
	}

	return true;
	}

	assert(rhs->isObjCQualifiedIdType() && "One of the LHS/RHS should be id<x>");

	if (lhs->getInterfaceType()) {
	// If both the right and left sides have qualifiers.
	for (auto *lhsProto : lhs->quals()) {
	bool match = false;

	// when comparing an id<P> on rhs with a static type on lhs,
	// see if static class implements all of id's protocols, directly or
	// through its super class and categories.
	// First, lhs protocols in the qualifier list must be found, direct
	// or indirect in rhs's qualifier list or it is a mismatch.
	for (auto *rhsProto : rhs->quals()) {
	if (ProtocolCompatibleWithProtocol(lhsProto, rhsProto) \|\|
	(compare && ProtocolCompatibleWithProtocol(rhsProto, lhsProto))) {
	match = true;
	break;
	}
	}
	if (!match)
	return false;
	}

	// Static class's protocols, or its super class or category protocols
	// must be found, direct or indirect in rhs's qualifier list or it is a mismatch.
	if (ObjCInterfaceDecl *lhsID = lhs->getInterfaceDecl()) {
	llvm::SmallPtrSet<ObjCProtocolDecl *, 8> LHSInheritedProtocols;
	CollectInheritedProtocols(lhsID, LHSInheritedProtocols);
	// This is rather dubious but matches gcc's behavior. If lhs has
	// no type qualifier and its class has no static protocol(s)
	// assume that it is mismatch.
	if (LHSInheritedProtocols.empty() && lhs->qual_empty())
	return false;
	for (auto *lhsProto : LHSInheritedProtocols) {
	bool match = false;
	for (auto *rhsProto : rhs->quals()) {
	if (ProtocolCompatibleWithProtocol(lhsProto, rhsProto) \|\|
	(compare && ProtocolCompatibleWithProtocol(rhsProto, lhsProto))) {
	match = true;
	break;
	}
	}
	if (!match)
	return false;
	}
	}
	return true;
	}
	return false;
	}

	/// canAssignObjCInterfaces - Return true if the two interface types are
	/// compatible for assignment from RHS to LHS. This handles validation of any
	/// protocol qualifiers on the LHS or RHS.
	///
	/// If the direct check fails and the RHS is a __kindof type, the check is
	/// retried in the opposite direction with __kindof and protocol qualifiers
	/// stripped from both sides.
	bool ASTContext::canAssignObjCInterfaces(const ObjCObjectPointerType *LHSOPT,
	                                         const ObjCObjectPointerType *RHSOPT) {
	  const ObjCObjectType* LHS = LHSOPT->getObjectType();
	  const ObjCObjectType* RHS = RHSOPT->getObjectType();

	  // If either type represents the built-in 'id' type, return true.
	  if (LHS->isObjCUnqualifiedId() || RHS->isObjCUnqualifiedId())
	    return true;

	  // Function object that propagates a successful result or handles
	  // __kindof types.
	  auto finish = [&](bool succeeded) -> bool {
	    if (succeeded)
	      return true;

	    if (!RHS->isKindOfType())
	      return false;

	    // Strip off __kindof and protocol qualifiers, then check whether
	    // we can assign the other way.
	    return canAssignObjCInterfaces(RHSOPT->stripObjCKindOfTypeAndQuals(*this),
	                                   LHSOPT->stripObjCKindOfTypeAndQuals(*this));
	  };

	  // Casts from or to id<P> are allowed when the other side has compatible
	  // protocols.
	  if (LHS->isObjCQualifiedId() || RHS->isObjCQualifiedId()) {
	    return finish(ObjCQualifiedIdTypesAreCompatible(LHSOPT, RHSOPT, false));
	  }

	  // Verify protocol compatibility for casts from Class<P1> to Class<P2>.
	  if (LHS->isObjCQualifiedClass() && RHS->isObjCQualifiedClass()) {
	    return finish(ObjCQualifiedClassTypesAreCompatible(LHSOPT, RHSOPT));
	  }

	  // Casts from Class to Class<Foo>, or vice-versa, are allowed.
	  if (LHS->isObjCClass() && RHS->isObjCClass()) {
	    return true;
	  }

	  // If we have 2 user-defined types, fall into that path.
	  if (LHS->getInterface() && RHS->getInterface()) {
	    return finish(canAssignObjCInterfaces(LHS, RHS));
	  }

	  return false;
	}

	/// canAssignObjCInterfacesInBlockPointer - This routine is specifically written
	/// for providing type-safety for objective-c pointers used to pass/return
	/// arguments in block literals. When passed as arguments, passing 'A*' where
	/// 'id' is expected is not OK. Passing 'Sub *' where 'Super *' is expected is
	/// not OK. For the return type, the opposite is not OK.
	///
	/// \p BlockReturnType selects the return-type rules; otherwise the
	/// parameter rules apply.
	bool ASTContext::canAssignObjCInterfacesInBlockPointer(
	    const ObjCObjectPointerType *LHSOPT,
	    const ObjCObjectPointerType *RHSOPT,
	    bool BlockReturnType) {

	  // Function object that propagates a successful result or handles
	  // __kindof types.
	  auto finish = [&](bool succeeded) -> bool {
	    if (succeeded)
	      return true;

	    // Which side must be __kindof for the retry depends on whether a
	    // return type or a parameter type is being checked.
	    const ObjCObjectPointerType *Expected = BlockReturnType ? RHSOPT : LHSOPT;
	    if (!Expected->isKindOfType())
	      return false;

	    // Strip off __kindof and protocol qualifiers, then check whether
	    // we can assign the other way.
	    return canAssignObjCInterfacesInBlockPointer(
	        RHSOPT->stripObjCKindOfTypeAndQuals(*this),
	        LHSOPT->stripObjCKindOfTypeAndQuals(*this),
	        BlockReturnType);
	  };

	  if (RHSOPT->isObjCBuiltinType() || LHSOPT->isObjCIdType())
	    return true;

	  if (LHSOPT->isObjCBuiltinType()) {
	    return finish(RHSOPT->isObjCBuiltinType() ||
	                  RHSOPT->isObjCQualifiedIdType());
	  }

	  if (LHSOPT->isObjCQualifiedIdType() || RHSOPT->isObjCQualifiedIdType()) {
	    if (getLangOpts().CompatibilityQualifiedIdBlockParamTypeChecking)
	      // Use for block parameters previous type checking for compatibility.
	      return finish(ObjCQualifiedIdTypesAreCompatible(LHSOPT, RHSOPT, false) ||
	                    // Or corrected type checking as in non-compat mode.
	                    (!BlockReturnType &&
	                     ObjCQualifiedIdTypesAreCompatible(RHSOPT, LHSOPT, false)));
	    else
	      return finish(ObjCQualifiedIdTypesAreCompatible(
	          (BlockReturnType ? LHSOPT : RHSOPT),
	          (BlockReturnType ? RHSOPT : LHSOPT), false));
	  }

	  const ObjCInterfaceType* LHS = LHSOPT->getInterfaceType();
	  const ObjCInterfaceType* RHS = RHSOPT->getInterfaceType();
	  if (LHS && RHS)  { // We have 2 user-defined types.
	    if (LHS != RHS) {
	      if (LHS->getDecl()->isSuperClassOf(RHS->getDecl()))
	        return finish(BlockReturnType);
	      if (RHS->getDecl()->isSuperClassOf(LHS->getDecl()))
	        return finish(!BlockReturnType);
	    }
	    else
	      return true;
	  }
	  return false;
	}

	/// Comparison routine for Objective-C protocols to be used with
	/// llvm::array_pod_sort.
	///
	/// Both arguments point at array elements, which are themselves protocol
	/// pointers, so each is dereferenced once before its name is read. The
	/// result is that of comparing the left name against the right name.
	static int compareObjCProtocolsByName(ObjCProtocolDecl * const *lhs,
	                                      ObjCProtocolDecl * const *rhs) {
	  return (*lhs)->getName().compare((*rhs)->getName());
	}

	/// getIntersectionOfProtocols - Compute the protocols common to two
	/// Objective-C object pointer types that share the base \p CommonBase.
	/// The result is appended to \p IntersectionSet, with protocols already
	/// implied by \p CommonBase removed and the remainder sorted by name.
	/// It is used to build the composite qualifier list of the composite type
	/// of a conditional expression involving two objective-c pointer objects.
	static void
	getIntersectionOfProtocols(ASTContext &Context,
	                           const ObjCInterfaceDecl *CommonBase,
	                           const ObjCObjectPointerType *LHSOPT,
	                           const ObjCObjectPointerType *RHSOPT,
	                           SmallVectorImpl<ObjCProtocolDecl *> &IntersectionSet) {
	  using ProtocolSet = llvm::SmallPtrSet<ObjCProtocolDecl *, 8>;

	  const ObjCObjectType *LHS = LHSOPT->getObjectType();
	  const ObjCObjectType *RHS = RHSOPT->getObjectType();
	  assert(LHS->getInterface() && "LHS must have an interface base");
	  assert(RHS->getInterface() && "RHS must have an interface base");

	  // Gather everything one side brings along: first its protocol
	  // qualifiers, then the protocols associated with its interface.
	  auto gatherProtocols = [&](const ObjCObjectType *ObjTy, ProtocolSet &Into) {
	    for (auto Qual : ObjTy->quals())
	      Context.CollectInheritedProtocols(Qual, Into);
	    Context.CollectInheritedProtocols(ObjTy->getInterface(), Into);
	  };

	  ProtocolSet LHSProtocolSet;
	  gatherProtocols(LHS, LHSProtocolSet);

	  ProtocolSet RHSProtocolSet;
	  gatherProtocols(RHS, RHSProtocolSet);

	  // Keep every LHS protocol that the RHS also has.
	  for (auto Candidate : LHSProtocolSet)
	    if (RHSProtocolSet.count(Candidate))
	      IntersectionSet.push_back(Candidate);

	  // Protocols implied by the common base carry no extra information.
	  ProtocolSet ImpliedProtocols;
	  Context.CollectInheritedProtocols(CommonBase, ImpliedProtocols);

	  if (!ImpliedProtocols.empty()) {
	    auto FirstRemoved = std::remove_if(
	        IntersectionSet.begin(), IntersectionSet.end(),
	        [&](ObjCProtocolDecl *Proto) -> bool {
	          return ImpliedProtocols.count(Proto) > 0;
	        });
	    IntersectionSet.erase(FirstRemoved, IntersectionSet.end());
	  }

	  // Order what is left by protocol name.
	  llvm::array_pod_sort(IntersectionSet.begin(), IntersectionSet.end(),
	                       compareObjCProtocolsByName);
	}

	/// Determine whether the first type is a subtype of the second.
	///
	/// Handles two Objective-C object pointers, two block pointers, and the
	/// mixed case of an unqualified 'id' paired with a block pointer. Any other
	/// combination is rejected.
	static bool canAssignObjCObjectTypes(ASTContext &ctx, QualType lhs,
	                                     QualType rhs) {
	  // Common case: two object pointers.
	  const auto *lhsOPT = lhs->getAs<ObjCObjectPointerType>();
	  const auto *rhsOPT = rhs->getAs<ObjCObjectPointerType>();
	  if (lhsOPT && rhsOPT)
	    return ctx.canAssignObjCInterfaces(lhsOPT, rhsOPT);

	  // Two block pointers.
	  const auto *lhsBlock = lhs->getAs<BlockPointerType>();
	  const auto *rhsBlock = rhs->getAs<BlockPointerType>();
	  if (lhsBlock && rhsBlock)
	    return ctx.typesAreBlockPointerCompatible(lhs, rhs);

	  // If either is an unqualified 'id' and the other is a block, it's
	  // acceptable.
	  if ((lhsOPT && lhsOPT->isObjCIdType() && rhsBlock) ||
	      (rhsOPT && rhsOPT->isObjCIdType() && lhsBlock))
	    return true;

	  return false;
	}

	// Check that the given Objective-C type argument lists are equivalent.
	//
	// Lists of different length never match. Argument pairs that are not the
	// same type are judged by the variance of the corresponding type parameter
	// of 'iface': invariant parameters only match if 'stripKindOf' is set and
	// the types agree once __kindof is stripped; covariant and contravariant
	// parameters use canAssignObjCObjectTypes in the matching direction.
	static bool sameObjCTypeArgs(ASTContext &ctx,
	                             const ObjCInterfaceDecl *iface,
	                             ArrayRef<QualType> lhsArgs,
	                             ArrayRef<QualType> rhsArgs,
	                             bool stripKindOf) {
	  if (lhsArgs.size() != rhsArgs.size())
	    return false;

	  ObjCTypeParamList *typeParams = iface->getTypeParamList();
	  for (unsigned i = 0, n = lhsArgs.size(); i != n; ++i) {
	    if (ctx.hasSameType(lhsArgs[i], rhsArgs[i]))
	      continue;

	    switch (typeParams->begin()[i]->getVariance()) {
	    case ObjCTypeParamVariance::Invariant:
	      if (!stripKindOf ||
	          !ctx.hasSameType(lhsArgs[i].stripObjCKindOfType(ctx),
	                           rhsArgs[i].stripObjCKindOfType(ctx))) {
	        return false;
	      }
	      break;

	    case ObjCTypeParamVariance::Covariant:
	      if (!canAssignObjCObjectTypes(ctx, lhsArgs[i], rhsArgs[i]))
	        return false;
	      break;

	    case ObjCTypeParamVariance::Contravariant:
	      // Same test as the covariant case with the roles swapped.
	      if (!canAssignObjCObjectTypes(ctx, rhsArgs[i], lhsArgs[i]))
	        return false;
	      break;
	    }
	  }

	  return true;
	}

	/// Find a common base class type of the two Objective-C object pointer
	/// types and return a pointer to it, or a null QualType if either side has
	/// no interface, no common ancestor is found, or the type arguments at the
	/// common class are incompatible.
	///
	/// The result carries the intersection of the two sides' protocols (minus
	/// those implied by the common class), drops type arguments when only one
	/// side is specialized, and is a __kindof type if either input is.
	QualType ASTContext::areCommonBaseCompatible(
	    const ObjCObjectPointerType *Lptr,
	    const ObjCObjectPointerType *Rptr) {
	  const ObjCObjectType *LHS = Lptr->getObjectType();
	  const ObjCObjectType *RHS = Rptr->getObjectType();
	  const ObjCInterfaceDecl* LDecl = LHS->getInterface();
	  const ObjCInterfaceDecl* RDecl = RHS->getInterface();

	  if (!LDecl || !RDecl)
	    return {};

	  // When either LHS or RHS is a kindof type, we should return a kindof type.
	  // For example, for common base of kindof(ASub1) and kindof(ASub2), we return
	  // kindof(A).
	  bool anyKindOf = LHS->isKindOfType() || RHS->isKindOfType();

	  // Follow the left-hand side up the class hierarchy until we either hit a
	  // root or find the RHS. Record the ancestors in case we don't find it.
	  llvm::SmallDenseMap<const ObjCInterfaceDecl *, const ObjCObjectType *, 4>
	      LHSAncestors;
	  while (true) {
	    // Record this ancestor. We'll need this if the common type isn't in the
	    // path from the LHS to the root.
	    LHSAncestors[LHS->getInterface()->getCanonicalDecl()] = LHS;

	    if (declaresSameEntity(LHS->getInterface(), RDecl)) {
	      // Get the type arguments.
	      ArrayRef<QualType> LHSTypeArgs = LHS->getTypeArgsAsWritten();
	      bool anyChanges = false;
	      if (LHS->isSpecialized() && RHS->isSpecialized()) {
	        // Both have type arguments, compare them.
	        if (!sameObjCTypeArgs(*this, LHS->getInterface(),
	                              LHS->getTypeArgs(), RHS->getTypeArgs(),
	                              /*stripKindOf=*/true))
	          return {};
	      } else if (LHS->isSpecialized() != RHS->isSpecialized()) {
	        // If only one has type arguments, the result will not have type
	        // arguments.
	        LHSTypeArgs = {};
	        anyChanges = true;
	      }

	      // Compute the intersection of protocols.
	      SmallVector<ObjCProtocolDecl *, 8> Protocols;
	      getIntersectionOfProtocols(*this, LHS->getInterface(), Lptr, Rptr,
	                                 Protocols);
	      if (!Protocols.empty())
	        anyChanges = true;

	      // If anything in the LHS will have changed, build a new result type.
	      // If we need to return a kindof type but LHS is not a kindof type, we
	      // build a new result type.
	      if (anyChanges || LHS->isKindOfType() != anyKindOf) {
	        QualType Result = getObjCInterfaceType(LHS->getInterface());
	        Result = getObjCObjectType(Result, LHSTypeArgs, Protocols,
	                                   anyKindOf || LHS->isKindOfType());
	        return getObjCObjectPointerType(Result);
	      }

	      return getObjCObjectPointerType(QualType(LHS, 0));
	    }

	    // Find the superclass.
	    QualType LHSSuperType = LHS->getSuperClassType();
	    if (LHSSuperType.isNull())
	      break;

	    LHS = LHSSuperType->castAs<ObjCObjectType>();
	  }

	  // We didn't find anything by following the LHS to its root; now check
	  // the RHS against the cached set of ancestors.
	  while (true) {
	    auto KnownLHS = LHSAncestors.find(RHS->getInterface()->getCanonicalDecl());
	    if (KnownLHS != LHSAncestors.end()) {
	      // Continue with the LHS ancestor recorded for this class.
	      LHS = KnownLHS->second;

	      // Get the type arguments.
	      ArrayRef<QualType> RHSTypeArgs = RHS->getTypeArgsAsWritten();
	      bool anyChanges = false;
	      if (LHS->isSpecialized() && RHS->isSpecialized()) {
	        // Both have type arguments, compare them.
	        if (!sameObjCTypeArgs(*this, LHS->getInterface(),
	                              LHS->getTypeArgs(), RHS->getTypeArgs(),
	                              /*stripKindOf=*/true))
	          return {};
	      } else if (LHS->isSpecialized() != RHS->isSpecialized()) {
	        // If only one has type arguments, the result will not have type
	        // arguments.
	        RHSTypeArgs = {};
	        anyChanges = true;
	      }

	      // Compute the intersection of protocols.
	      SmallVector<ObjCProtocolDecl *, 8> Protocols;
	      getIntersectionOfProtocols(*this, RHS->getInterface(), Lptr, Rptr,
	                                 Protocols);
	      if (!Protocols.empty())
	        anyChanges = true;

	      // If we need to return a kindof type but RHS is not a kindof type, we
	      // build a new result type.
	      if (anyChanges || RHS->isKindOfType() != anyKindOf) {
	        QualType Result = getObjCInterfaceType(RHS->getInterface());
	        Result = getObjCObjectType(Result, RHSTypeArgs, Protocols,
	                                   anyKindOf || RHS->isKindOfType());
	        return getObjCObjectPointerType(Result);
	      }

	      return getObjCObjectPointerType(QualType(RHS, 0));
	    }

	    // Find the superclass of the RHS.
	    QualType RHSSuperType = RHS->getSuperClassType();
	    if (RHSSuperType.isNull())
	      break;

	    RHS = RHSSuperType->castAs<ObjCObjectType>();
	  }

	  // Neither walk found a shared class.
	  return {};
	}

	/// Return true if an object of type \p RHS can be assigned to \p LHS. Both
	/// types must have an interface.
	///
	/// Three checks are made in order: the LHS class must be a superclass of
	/// the RHS class; every LHS protocol qualifier must be found among the
	/// protocols collected from the RHS class and the RHS qualifiers; and, if
	/// the LHS is specialized, its type arguments must match those of the RHS
	/// viewed at the LHS class.
	bool ASTContext::canAssignObjCInterfaces(const ObjCObjectType *LHS,
	                                         const ObjCObjectType *RHS) {
	  assert(LHS->getInterface() && "LHS is not an interface type");
	  assert(RHS->getInterface() && "RHS is not an interface type");

	  // Verify that the base decls are compatible: the RHS must be a subclass of
	  // the LHS.
	  ObjCInterfaceDecl *LHSInterface = LHS->getInterface();
	  bool IsSuperClass = LHSInterface->isSuperClassOf(RHS->getInterface());
	  if (!IsSuperClass)
	    return false;

	  // If the LHS has protocol qualifiers, determine whether all of them are
	  // satisfied by the RHS (i.e., the RHS has a superset of the protocols in the
	  // LHS).
	  if (LHS->getNumProtocols() > 0) {
	    // OK if conversion of LHS to SuperClass results in narrowing of types
	    // ; i.e., SuperClass may implement at least one of the protocols
	    // in LHS's protocol list. Example, SuperObj<P1> = lhs<P1,P2> is ok.
	    // But not SuperObj<P1,P2,P3> = lhs<P1,P2>.
	    llvm::SmallPtrSet<ObjCProtocolDecl *, 8> SuperClassInheritedProtocols;
	    CollectInheritedProtocols(RHS->getInterface(), SuperClassInheritedProtocols);
	    // Also, if RHS has explicit qualifiers, include them for comparing with
	    // LHS's qualifiers.
	    for (auto *RHSPI : RHS->quals())
	      CollectInheritedProtocols(RHSPI, SuperClassInheritedProtocols);
	    // If there is no protocols associated with RHS, it is not a match.
	    if (SuperClassInheritedProtocols.empty())
	      return false;

	    for (const auto *LHSProto : LHS->quals()) {
	      bool SuperImplementsProtocol = false;
	      // The lookup is by identifier on each collected protocol.
	      for (auto *SuperClassProto : SuperClassInheritedProtocols)
	        if (SuperClassProto->lookupProtocolNamed(LHSProto->getIdentifier())) {
	          SuperImplementsProtocol = true;
	          break;
	        }
	      if (!SuperImplementsProtocol)
	        return false;
	    }
	  }

	  // If the LHS is specialized, we may need to check type arguments.
	  if (LHS->isSpecialized()) {
	    // Follow the superclass chain until we've matched the LHS class in the
	    // hierarchy. This substitutes type arguments through.
	    const ObjCObjectType *RHSSuper = RHS;
	    while (!declaresSameEntity(RHSSuper->getInterface(), LHSInterface))
	      RHSSuper = RHSSuper->getSuperClassType()->castAs<ObjCObjectType>();

	    // If the RHS is specialized, compare type arguments.
	    if (RHSSuper->isSpecialized() &&
	        !sameObjCTypeArgs(*this, LHS->getInterface(),
	                          LHS->getTypeArgs(), RHSSuper->getTypeArgs(),
	                          /*stripKindOf=*/true)) {
	      return false;
	    }
	  }

	  return true;
	}

	/// Return true if both types are Objective-C object pointer types and one
	/// of them can be assigned to the other, in either direction.
	bool ASTContext::areComparableObjCPointerTypes(QualType LHS, QualType RHS) {
	  // get the "pointed to" types
	  const auto *LHSOPT = LHS->getAs<ObjCObjectPointerType>();
	  const auto *RHSOPT = RHS->getAs<ObjCObjectPointerType>();

	  // Anything that is not an object pointer is never comparable here.
	  if (!LHSOPT || !RHSOPT)
	    return false;

	  return canAssignObjCInterfaces(LHSOPT, RHSOPT) ||
	         canAssignObjCInterfaces(RHSOPT, LHSOPT);
	}

	/// Return true if a pointer to \p From can be assigned to a pointer to
	/// \p To. Both types are wrapped in Objective-C object pointer types and
	/// the decision is delegated to canAssignObjCInterfaces.
	bool ASTContext::canBindObjCObjectType(QualType To, QualType From) {
	  const auto *ToPointer =
	      getObjCObjectPointerType(To)->castAs<ObjCObjectPointerType>();
	  const auto *FromPointer =
	      getObjCObjectPointerType(From)->castAs<ObjCObjectPointerType>();
	  return canAssignObjCInterfaces(ToPointer, FromPointer);
	}

	/// typesAreCompatible - Decide whether two types are compatible.
	/// C99 6.7.3p9: For two qualified types to be compatible, both shall have
	/// the identically qualified version of a compatible type.
	/// C99 6.2.7p1: Two types have compatible types if their types are the
	/// same. See 6.7.[2,3,5] for additional rules.
	/// In C++ this is plain type identity; otherwise the types are compatible
	/// exactly when merging them yields a non-null type.
	bool ASTContext::typesAreCompatible(QualType LHS, QualType RHS,
	                                    bool CompareUnqualified) {
	  if (!getLangOpts().CPlusPlus) {
	    QualType Merged =
	        mergeTypes(LHS, RHS, /*OfBlockPointer=*/false, CompareUnqualified);
	    return !Merged.isNull();
	  }

	  return hasSameType(LHS, RHS);
	}

	/// Return true if the two property types are compatible. This simply
	/// forwards to typesAreCompatible.
	bool ASTContext::propertyTypesAreCompatible(QualType LHS, QualType RHS) {
	  const bool Compatible = typesAreCompatible(LHS, RHS);
	  return Compatible;
	}

	/// Return true if the two types can be merged under the block-pointer
	/// rules, i.e. mergeTypes with OfBlockPointer set yields a non-null type.
	bool ASTContext::typesAreBlockPointerCompatible(QualType LHS, QualType RHS) {
	  QualType Merged = mergeTypes(LHS, RHS, /*OfBlockPointer=*/true);
	  return !Merged.isNull();
	}

	/// mergeTransparentUnionType - If T is a union type carrying the
	/// transparent_union attribute and one of its fields (taken without
	/// qualifiers) merges with SubType, return the first such merged type.
	/// Otherwise return QualType().
	QualType ASTContext::mergeTransparentUnionType(QualType T, QualType SubType,
	                                               bool OfBlockPointer,
	                                               bool Unqualified) {
	  const RecordType *UnionTy = T->getAsUnionType();
	  if (!UnionTy)
	    return {};

	  RecordDecl *UnionDecl = UnionTy->getDecl();
	  if (!UnionDecl->hasAttr<TransparentUnionAttr>())
	    return {};

	  // Fields are tried in declaration order; the first successful merge wins.
	  for (const auto *Field : UnionDecl->fields()) {
	    QualType FieldTy = Field->getType().getUnqualifiedType();
	    QualType Merged = mergeTypes(FieldTy, SubType, OfBlockPointer, Unqualified);
	    if (!Merged.isNull())
	      return Merged;
	  }

	  return {};
	}

	/// mergeFunctionParameterTypes - Merge two types that appear as function
	/// parameter types. The transparent-union merge is tried with lhs as the
	/// union, then with rhs as the union, before falling back to mergeTypes.
	QualType ASTContext::mergeFunctionParameterTypes(QualType lhs, QualType rhs,
	                                                 bool OfBlockPointer,
	                                                 bool Unqualified) {
	  // GNU extension: two types are compatible if they appear as a function
	  // argument, one of the types is a transparent union type and the other
	  // type is compatible with a union member
	  QualType Merged =
	      mergeTransparentUnionType(lhs, rhs, OfBlockPointer, Unqualified);

	  if (Merged.isNull())
	    Merged = mergeTransparentUnionType(rhs, lhs, OfBlockPointer, Unqualified);

	  // Neither side is a matching transparent union: use the ordinary rules.
	  if (Merged.isNull())
	    Merged = mergeTypes(lhs, rhs, OfBlockPointer, Unqualified);

	  return Merged;
	}

	/// Merge two function types and return the composite type, or a null
	/// QualType if they are incompatible.
	///
	/// The return types are merged first, then the calling-convention related
	/// ExtInfo bits must agree, and finally the parameters are checked. When
	/// the composite is canonically identical to one of the inputs, that input
	/// (\p lhs preferred) is returned unchanged instead of a new type.
	///
	/// \param OfBlockPointer merge under the block-pointer rules.
	/// \param Unqualified ignore qualifiers on return and parameter types.
	/// \param AllowCXX suppress the assertion that neither prototype has an
	/// exception specification.
	QualType ASTContext::mergeFunctionTypes(QualType lhs, QualType rhs,
	                                        bool OfBlockPointer, bool Unqualified,
	                                        bool AllowCXX) {
	  const auto *lbase = lhs->castAs<FunctionType>();
	  const auto *rbase = rhs->castAs<FunctionType>();
	  const auto *lproto = dyn_cast<FunctionProtoType>(lbase);
	  const auto *rproto = dyn_cast<FunctionProtoType>(rbase);
	  // Track whether the composite is still identical to lhs / rhs.
	  bool allLTypes = true;
	  bool allRTypes = true;

	  // Check return type
	  QualType retType;
	  if (OfBlockPointer) {
	    QualType RHS = rbase->getReturnType();
	    QualType LHS = lbase->getReturnType();
	    bool UnqualifiedResult = Unqualified;
	    if (!UnqualifiedResult)
	      UnqualifiedResult = (!RHS.hasQualifiers() && LHS.hasQualifiers());
	    retType = mergeTypes(LHS, RHS, true, UnqualifiedResult, true);
	  }
	  else
	    retType = mergeTypes(lbase->getReturnType(), rbase->getReturnType(), false,
	                         Unqualified);
	  if (retType.isNull())
	    return {};

	  if (Unqualified)
	    retType = retType.getUnqualifiedType();

	  CanQualType LRetType = getCanonicalType(lbase->getReturnType());
	  CanQualType RRetType = getCanonicalType(rbase->getReturnType());
	  if (Unqualified) {
	    LRetType = LRetType.getUnqualifiedType();
	    RRetType = RRetType.getUnqualifiedType();
	  }

	  if (getCanonicalType(retType) != LRetType)
	    allLTypes = false;
	  if (getCanonicalType(retType) != RRetType)
	    allRTypes = false;

	  // FIXME: double check this
	  // FIXME: should we error if lbase->getRegParmAttr() != 0 &&
	  //                           rbase->getRegParmAttr() != 0 &&
	  //                           lbase->getRegParmAttr() != rbase->getRegParmAttr()?
	  FunctionType::ExtInfo lbaseInfo = lbase->getExtInfo();
	  FunctionType::ExtInfo rbaseInfo = rbase->getExtInfo();

	  // Compatible functions must have compatible calling conventions
	  if (lbaseInfo.getCC() != rbaseInfo.getCC())
	    return {};

	  // Regparm is part of the calling convention.
	  if (lbaseInfo.getHasRegParm() != rbaseInfo.getHasRegParm())
	    return {};
	  if (lbaseInfo.getRegParm() != rbaseInfo.getRegParm())
	    return {};

	  if (lbaseInfo.getProducesResult() != rbaseInfo.getProducesResult())
	    return {};
	  if (lbaseInfo.getNoCallerSavedRegs() != rbaseInfo.getNoCallerSavedRegs())
	    return {};
	  if (lbaseInfo.getNoCfCheck() != rbaseInfo.getNoCfCheck())
	    return {};

	  // FIXME: some uses, e.g. conditional exprs, really want this to be 'both'.
	  bool NoReturn = lbaseInfo.getNoReturn() || rbaseInfo.getNoReturn();

	  if (lbaseInfo.getNoReturn() != NoReturn)
	    allLTypes = false;
	  if (rbaseInfo.getNoReturn() != NoReturn)
	    allRTypes = false;

	  FunctionType::ExtInfo einfo = lbaseInfo.withNoReturn(NoReturn);

	  if (lproto && rproto) { // two C99 style function prototypes
	    assert((AllowCXX ||
	            (!lproto->hasExceptionSpec() && !rproto->hasExceptionSpec())) &&
	           "C++ shouldn't be here");
	    // Compatible functions must have the same number of parameters
	    if (lproto->getNumParams() != rproto->getNumParams())
	      return {};

	    // Variadic and non-variadic functions aren't compatible
	    if (lproto->isVariadic() != rproto->isVariadic())
	      return {};

	    if (lproto->getMethodQuals() != rproto->getMethodQuals())
	      return {};

	    SmallVector<FunctionProtoType::ExtParameterInfo, 4> newParamInfos;
	    bool canUseLeft, canUseRight;
	    if (!mergeExtParameterInfo(lproto, rproto, canUseLeft, canUseRight,
	                               newParamInfos))
	      return {};

	    if (!canUseLeft)
	      allLTypes = false;
	    if (!canUseRight)
	      allRTypes = false;

	    // Check parameter type compatibility
	    SmallVector<QualType, 10> types;
	    for (unsigned i = 0, n = lproto->getNumParams(); i < n; i++) {
	      QualType lParamType = lproto->getParamType(i).getUnqualifiedType();
	      QualType rParamType = rproto->getParamType(i).getUnqualifiedType();
	      QualType paramType = mergeFunctionParameterTypes(
	          lParamType, rParamType, OfBlockPointer, Unqualified);
	      if (paramType.isNull())
	        return {};

	      if (Unqualified)
	        paramType = paramType.getUnqualifiedType();

	      types.push_back(paramType);
	      if (Unqualified) {
	        lParamType = lParamType.getUnqualifiedType();
	        rParamType = rParamType.getUnqualifiedType();
	      }

	      if (getCanonicalType(paramType) != getCanonicalType(lParamType))
	        allLTypes = false;
	      if (getCanonicalType(paramType) != getCanonicalType(rParamType))
	        allRTypes = false;
	    }

	    if (allLTypes) return lhs;
	    if (allRTypes) return rhs;

	    // Neither input can be reused: build the composite prototype from the
	    // left prototype's info, the merged ExtInfo and the merged parameters.
	    FunctionProtoType::ExtProtoInfo EPI = lproto->getExtProtoInfo();
	    EPI.ExtInfo = einfo;
	    EPI.ExtParameterInfos =
	        newParamInfos.empty() ? nullptr : newParamInfos.data();
	    return getFunctionType(retType, types, EPI);
	  }

	  // At most one side has a prototype; the side without one cannot be the
	  // result when the other side has one.
	  if (lproto) allRTypes = false;
	  if (rproto) allLTypes = false;

	  const FunctionProtoType *proto = lproto ? lproto : rproto;
	  if (proto) {
	    assert((AllowCXX || !proto->hasExceptionSpec()) && "C++ shouldn't be here");
	    if (proto->isVariadic())
	      return {};
	    // Check that the types are compatible with the types that
	    // would result from default argument promotions (C99 6.7.5.3p15).
	    // The only types actually affected are promotable integer
	    // types and floats, which would be passed as a different
	    // type depending on whether the prototype is visible.
	    for (unsigned i = 0, n = proto->getNumParams(); i < n; ++i) {
	      QualType paramTy = proto->getParamType(i);

	      // Look at the converted type of enum types, since that is the type used
	      // to pass enum values.
	      if (const auto *Enum = paramTy->getAs<EnumType>()) {
	        paramTy = Enum->getDecl()->getIntegerType();
	        if (paramTy.isNull())
	          return {};
	      }

	      if (paramTy->isPromotableIntegerType() ||
	          getCanonicalType(paramTy).getUnqualifiedType() == FloatTy)
	        return {};
	    }

	    if (allLTypes) return lhs;
	    if (allRTypes) return rhs;

	    FunctionProtoType::ExtProtoInfo EPI = proto->getExtProtoInfo();
	    EPI.ExtInfo = einfo;
	    return getFunctionType(retType, proto->getParamTypes(), EPI);
	  }

	  // Neither side has a prototype.
	  if (allLTypes) return lhs;
	  if (allRTypes) return rhs;
	  return getFunctionNoProtoType(retType, einfo);
	}

	/// Try to merge an enum type with a non-enum type. Returns \p other when
	/// the merge succeeds and a null QualType otherwise (including when the
	/// enum has no integer type available).
	static QualType mergeEnumWithInteger(ASTContext &Context, const EnumType *ET,
	                                     QualType other, bool isBlockReturnType) {
	  // C99 6.7.2.2p4: Each enumerated type shall be compatible with char,
	  // a signed integer type, or an unsigned integer type.
	  // Compatibility is based on the underlying type, not the promotion
	  // type.
	  QualType Underlying = ET->getDecl()->getIntegerType();
	  if (Underlying.isNull())
	    return {};

	  if (Context.hasSameType(Underlying, other))
	    return other;

	  // In block return types, we're more permissive and accept any
	  // integral type of the same size.
	  if (!isBlockReturnType || !other->isIntegerType())
	    return {};
	  if (Context.getTypeSize(Underlying) != Context.getTypeSize(other))
	    return {};
	  return other;
	}

	/// Merge two types into a single composite type, or return a null QualType
	/// when they cannot be merged.
	///
	/// \param LHS, RHS        the two types to merge; neither may be a reference
	///                        type (asserted below).
	/// \param OfBlockPointer  forwarded to the block-pointer and function merges;
	///                        when set, 'id' may match a block pointer and
	///                        Objective-C object pointers are compared with
	///                        canAssignObjCInterfacesInBlockPointer.
	/// \param Unqualified     when set, top-level qualifiers are stripped from
	///                        both types (and from pointee/element/value types on
	///                        recursion) before comparing.
	/// \param BlockReturnType enables the relaxed enum-vs-integer rule when the
	///                        enum is on the RHS, and is forwarded to the
	///                        Objective-C block-pointer assignability check.
	/// \returns LHS or RHS when one of them already is the merged type, a newly
	///          built type otherwise, or a null QualType on incompatibility.
	QualType ASTContext::mergeTypes(QualType LHS, QualType RHS,
	bool OfBlockPointer,
	bool Unqualified, bool BlockReturnType) {
	// C++ [expr]: If an expression initially has the type "reference to T", the
	// type is adjusted to "T" prior to any further analysis, the expression
	// designates the object or function denoted by the reference, and the
	// expression is an lvalue unless the reference is an rvalue reference and
	// the expression is a function call (possibly inside parentheses).
	assert(!LHS->getAs<ReferenceType>() && "LHS is a reference type?");
	assert(!RHS->getAs<ReferenceType>() && "RHS is a reference type?");

	// Caller asked us to ignore top-level qualifiers.
	if (Unqualified) {
	LHS = LHS.getUnqualifiedType();
	RHS = RHS.getUnqualifiedType();
	}

	QualType LHSCan = getCanonicalType(LHS),
	RHSCan = getCanonicalType(RHS);

	// If two types are identical, they are compatible.
	if (LHSCan == RHSCan)
	return LHS;

	// If the qualifiers are different, the types aren't compatible... mostly.
	Qualifiers LQuals = LHSCan.getLocalQualifiers();
	Qualifiers RQuals = RHSCan.getLocalQualifiers();
	if (LQuals != RQuals) {
	// If any of these qualifiers are different, we have a type
	// mismatch.
	if (LQuals.getCVRQualifiers() != RQuals.getCVRQualifiers() \|\|
	LQuals.getAddressSpace() != RQuals.getAddressSpace() \|\|
	LQuals.getObjCLifetime() != RQuals.getObjCLifetime() \|\|
	LQuals.hasUnaligned() != RQuals.hasUnaligned())
	return {};

	// Exactly one GC qualifier difference is allowed: __strong is
	// okay if the other type has no GC qualifier but is an Objective
	// C object pointer (i.e. implicitly strong by default). We fix
	// this by pretending that the unqualified type was actually
	// qualified __strong.
	Qualifiers::GC GC_L = LQuals.getObjCGCAttr();
	Qualifiers::GC GC_R = RQuals.getObjCGCAttr();
	assert((GC_L != GC_R) && "unequal qualifier sets had only equal elements");

	// A __weak qualifier on either side can never be reconciled.
	if (GC_L == Qualifiers::Weak \|\| GC_R == Qualifiers::Weak)
	return {};

	// Retry the merge with the unqualified side made explicitly __strong.
	// Note that these recursive calls use the default values of the
	// remaining parameters.
	if (GC_L == Qualifiers::Strong && RHSCan->isObjCObjectPointerType()) {
	return mergeTypes(LHS, getObjCGCQualType(RHS, Qualifiers::Strong));
	}
	if (GC_R == Qualifiers::Strong && LHSCan->isObjCObjectPointerType()) {
	return mergeTypes(getObjCGCQualType(LHS, Qualifiers::Strong), RHS);
	}
	return {};
	}

	// Okay, qualifiers are equal.

	Type::TypeClass LHSClass = LHSCan->getTypeClass();
	Type::TypeClass RHSClass = RHSCan->getTypeClass();

	// We want to consider the two function types to be the same for these
	// comparisons, just force one to the other.
	if (LHSClass == Type::FunctionProto) LHSClass = Type::FunctionNoProto;
	if (RHSClass == Type::FunctionProto) RHSClass = Type::FunctionNoProto;

	// Same as above for arrays
	if (LHSClass == Type::VariableArray \|\| LHSClass == Type::IncompleteArray)
	LHSClass = Type::ConstantArray;
	if (RHSClass == Type::VariableArray \|\| RHSClass == Type::IncompleteArray)
	RHSClass = Type::ConstantArray;

	// ObjCInterfaces are just specialized ObjCObjects.
	if (LHSClass == Type::ObjCInterface) LHSClass = Type::ObjCObject;
	if (RHSClass == Type::ObjCInterface) RHSClass = Type::ObjCObject;

	// Canonicalize ExtVector -> Vector.
	if (LHSClass == Type::ExtVector) LHSClass = Type::Vector;
	if (RHSClass == Type::ExtVector) RHSClass = Type::Vector;

	// If the canonical type classes don't match.
	if (LHSClass != RHSClass) {
	// Note that we only have special rules for turning block enum
	// returns into block int returns, not vice-versa.
	if (const auto *ETy = LHS->getAs<EnumType>()) {
	return mergeEnumWithInteger(*this, ETy, RHS, false);
	}
	if (const EnumType* ETy = RHS->getAs<EnumType>()) {
	return mergeEnumWithInteger(*this, ETy, LHS, BlockReturnType);
	}
	// allow block pointer type to match an 'id' type.
	if (OfBlockPointer && !BlockReturnType) {
	if (LHS->isObjCIdType() && RHS->isBlockPointerType())
	return LHS;
	if (RHS->isObjCIdType() && LHS->isBlockPointerType())
	return RHS;
	}

	return {};
	}

	// The canonical type classes match.
	switch (LHSClass) {
	// The macro definitions below turn the TypeNodes.inc include into a list of
	// case labels for every non-canonical and dependent type class; concrete and
	// abstract classes expand to nothing.
	#define TYPE(Class, Base)
	#define ABSTRACT_TYPE(Class, Base)
	#define NON_CANONICAL_UNLESS_DEPENDENT_TYPE(Class, Base) case Type::Class:
	#define NON_CANONICAL_TYPE(Class, Base) case Type::Class:
	#define DEPENDENT_TYPE(Class, Base) case Type::Class:
	#include "clang/AST/TypeNodes.inc"
	llvm_unreachable("Non-canonical and dependent types shouldn't get here");

	case Type::Auto:
	case Type::DeducedTemplateSpecialization:
	case Type::LValueReference:
	case Type::RValueReference:
	case Type::MemberPointer:
	llvm_unreachable("C++ should never be in mergeTypes");

	// These classes were remapped to another class before the switch.
	case Type::ObjCInterface:
	case Type::IncompleteArray:
	case Type::VariableArray:
	case Type::FunctionProto:
	case Type::ExtVector:
	llvm_unreachable("Types are eliminated above");

	case Type::Pointer:
	{
	// Merge two pointer types, while trying to preserve typedef info
	QualType LHSPointee = LHS->castAs<PointerType>()->getPointeeType();
	QualType RHSPointee = RHS->castAs<PointerType>()->getPointeeType();
	if (Unqualified) {
	LHSPointee = LHSPointee.getUnqualifiedType();
	RHSPointee = RHSPointee.getUnqualifiedType();
	}
	QualType ResultType = mergeTypes(LHSPointee, RHSPointee, false,
	Unqualified);
	if (ResultType.isNull())
	return {};
	// Reuse an input type when its pointee already is the merged pointee.
	if (getCanonicalType(LHSPointee) == getCanonicalType(ResultType))
	return LHS;
	if (getCanonicalType(RHSPointee) == getCanonicalType(ResultType))
	return RHS;
	return getPointerType(ResultType);
	}
	case Type::BlockPointer:
	{
	// Merge two block pointer types, while trying to preserve typedef info
	QualType LHSPointee = LHS->castAs<BlockPointerType>()->getPointeeType();
	QualType RHSPointee = RHS->castAs<BlockPointerType>()->getPointeeType();
	if (Unqualified) {
	LHSPointee = LHSPointee.getUnqualifiedType();
	RHSPointee = RHSPointee.getUnqualifiedType();
	}
	if (getLangOpts().OpenCL) {
	Qualifiers LHSPteeQual = LHSPointee.getQualifiers();
	Qualifiers RHSPteeQual = RHSPointee.getQualifiers();
	// Blocks can't be an expression in a ternary operator (OpenCL v2.0
	// 6.12.5) thus the following check is asymmetric.
	if (!LHSPteeQual.isAddressSpaceSupersetOf(RHSPteeQual))
	return {};
	// The address spaces have been validated; drop them from both pointees so
	// that they do not take part in the recursive merge below.
	LHSPteeQual.removeAddressSpace();
	RHSPteeQual.removeAddressSpace();
	LHSPointee =
	QualType(LHSPointee.getTypePtr(), LHSPteeQual.getAsOpaqueValue());
	RHSPointee =
	QualType(RHSPointee.getTypePtr(), RHSPteeQual.getAsOpaqueValue());
	}
	QualType ResultType = mergeTypes(LHSPointee, RHSPointee, OfBlockPointer,
	Unqualified);
	if (ResultType.isNull())
	return {};
	if (getCanonicalType(LHSPointee) == getCanonicalType(ResultType))
	return LHS;
	if (getCanonicalType(RHSPointee) == getCanonicalType(ResultType))
	return RHS;
	return getBlockPointerType(ResultType);
	}
	case Type::Atomic:
	{
	// Merge two atomic types, while trying to preserve typedef info
	QualType LHSValue = LHS->castAs<AtomicType>()->getValueType();
	QualType RHSValue = RHS->castAs<AtomicType>()->getValueType();
	if (Unqualified) {
	LHSValue = LHSValue.getUnqualifiedType();
	RHSValue = RHSValue.getUnqualifiedType();
	}
	QualType ResultType = mergeTypes(LHSValue, RHSValue, false,
	Unqualified);
	if (ResultType.isNull())
	return {};
	if (getCanonicalType(LHSValue) == getCanonicalType(ResultType))
	return LHS;
	if (getCanonicalType(RHSValue) == getCanonicalType(ResultType))
	return RHS;
	return getAtomicType(ResultType);
	}
	case Type::ConstantArray:
	{
	// This case also handles variable and incomplete arrays, which were
	// remapped to ConstantArray before the switch.
	const ConstantArrayType* LCAT = getAsConstantArrayType(LHS);
	const ConstantArrayType* RCAT = getAsConstantArrayType(RHS);
	// Two constant arrays must agree on their size.
	if (LCAT && RCAT && RCAT->getSize() != LCAT->getSize())
	return {};

	QualType LHSElem = getAsArrayType(LHS)->getElementType();
	QualType RHSElem = getAsArrayType(RHS)->getElementType();
	if (Unqualified) {
	LHSElem = LHSElem.getUnqualifiedType();
	RHSElem = RHSElem.getUnqualifiedType();
	}

	QualType ResultType = mergeTypes(LHSElem, RHSElem, false, Unqualified);
	if (ResultType.isNull())
	return {};

	const VariableArrayType* LVAT = getAsVariableArrayType(LHS);
	const VariableArrayType* RVAT = getAsVariableArrayType(RHS);

	// If either side is a variable array, and both are complete, check whether
	// the current dimension is definite.
	if (LVAT \|\| RVAT) {
	// Returns (true, size) when the dimension is known: either the variable
	// array's size expression is an integer constant expression, or the array
	// is a constant array. Returns (false, ...) otherwise.
	auto SizeFetch = [this](const VariableArrayType* VAT,
	const ConstantArrayType* CAT)
	-> std::pair<bool,llvm::APInt> {
	if (VAT) {
	llvm::APSInt TheInt;
	Expr *E = VAT->getSizeExpr();
	if (E && E->isIntegerConstantExpr(TheInt, *this))
	return std::make_pair(true, TheInt);
	else
	return std::make_pair(false, TheInt);
	} else if (CAT) {
	return std::make_pair(true, CAT->getSize());
	} else {
	return std::make_pair(false, llvm::APInt());
	}
	};

	bool HaveLSize, HaveRSize;
	llvm::APInt LSize, RSize;
	std::tie(HaveLSize, LSize) = SizeFetch(LVAT, LCAT);
	std::tie(HaveRSize, RSize) = SizeFetch(RVAT, RCAT);
	if (HaveLSize && HaveRSize && !llvm::APInt::isSameValue(LSize, RSize))
	return {}; // Definite, but unequal, array dimension
	}

	// Pick the result. Constant arrays are preferred over variable arrays,
	// which are preferred over incomplete arrays; within each kind an input
	// type is reused when its element type already is the merged element type.
	if (LCAT && getCanonicalType(LHSElem) == getCanonicalType(ResultType))
	return LHS;
	if (RCAT && getCanonicalType(RHSElem) == getCanonicalType(ResultType))
	return RHS;
	if (LCAT)
	return getConstantArrayType(ResultType, LCAT->getSize(),
	LCAT->getSizeExpr(),
	ArrayType::ArraySizeModifier(), 0);
	if (RCAT)
	return getConstantArrayType(ResultType, RCAT->getSize(),
	RCAT->getSizeExpr(),
	ArrayType::ArraySizeModifier(), 0);
	if (LVAT && getCanonicalType(LHSElem) == getCanonicalType(ResultType))
	return LHS;
	if (RVAT && getCanonicalType(RHSElem) == getCanonicalType(ResultType))
	return RHS;
	if (LVAT) {
	// FIXME: This isn't correct! But tricky to implement because
	// the array's size has to be the size of LHS, but the type
	// has to be different.
	return LHS;
	}
	if (RVAT) {
	// FIXME: This isn't correct! But tricky to implement because
	// the array's size has to be the size of RHS, but the type
	// has to be different.
	return RHS;
	}
	if (getCanonicalType(LHSElem) == getCanonicalType(ResultType)) return LHS;
	if (getCanonicalType(RHSElem) == getCanonicalType(ResultType)) return RHS;
	return getIncompleteArrayType(ResultType,
	ArrayType::ArraySizeModifier(), 0);
	}
	case Type::FunctionNoProto:
	// Also reached for FunctionProto, which was remapped before the switch.
	return mergeFunctionTypes(LHS, RHS, OfBlockPointer, Unqualified);
	case Type::Record:
	case Type::Enum:
	// Identical canonical types were accepted near the top of the function;
	// anything reaching this point is a different record/enum.
	return {};
	case Type::Builtin:
	// Only exactly equal builtin types are compatible, which is tested above.
	return {};
	case Type::Complex:
	// Distinct complex types are incompatible.
	return {};
	case Type::Vector:
	// FIXME: The merged type should be an ExtVector!
	if (areCompatVectorTypes(LHSCan->castAs<VectorType>(),
	RHSCan->castAs<VectorType>()))
	return LHS;
	return {};
	case Type::ConstantMatrix:
	if (areCompatMatrixTypes(LHSCan->castAs<ConstantMatrixType>(),
	RHSCan->castAs<ConstantMatrixType>()))
	return LHS;
	return {};
	case Type::ObjCObject: {
	// Check if the types are assignment compatible.
	// FIXME: This should be type compatibility, e.g. whether
	// "LHS x; RHS x;" at global scope is legal.
	if (canAssignObjCInterfaces(LHS->castAs<ObjCObjectType>(),
	RHS->castAs<ObjCObjectType>()))
	return LHS;
	return {};
	}
	case Type::ObjCObjectPointer:
	// Inside a block pointer a dedicated assignability check is used, which
	// also takes BlockReturnType into account.
	if (OfBlockPointer) {
	if (canAssignObjCInterfacesInBlockPointer(
	LHS->castAs<ObjCObjectPointerType>(),
	RHS->castAs<ObjCObjectPointerType>(), BlockReturnType))
	return LHS;
	return {};
	}
	if (canAssignObjCInterfaces(LHS->castAs<ObjCObjectPointerType>(),
	RHS->castAs<ObjCObjectPointerType>()))
	return LHS;
	return {};
	case Type::Pipe:
	assert(LHS != RHS &&
	"Equivalent pipe types should have already been handled!");
	return {};
	case Type::ExtInt: {
	// Merge two ext-int types, while trying to preserve typedef info.
	bool LHSUnsigned = LHS->castAs<ExtIntType>()->isUnsigned();
	bool RHSUnsigned = RHS->castAs<ExtIntType>()->isUnsigned();
	unsigned LHSBits = LHS->castAs<ExtIntType>()->getNumBits();
	unsigned RHSBits = RHS->castAs<ExtIntType>()->getNumBits();

	// Like unsigned/int, shouldn't have a type if they don't match.
	if (LHSUnsigned != RHSUnsigned)
	return {};

	// The bit widths must match as well.
	if (LHSBits != RHSBits)
	return {};
	return LHS;
	}
	}

	llvm_unreachable("Invalid Type::Class!");
	}

	bool ASTContext::mergeExtParameterInfo(
	const FunctionProtoType FirstFnType, const FunctionProtoType SecondFnType,
	bool &CanUseFirst, bool &CanUseSecond,
	SmallVectorImpl<FunctionProtoType::ExtParameterInfo> &NewParamInfos) {
	assert(NewParamInfos.empty() && "param info list not empty");
	CanUseFirst = CanUseSecond = true;
	bool FirstHasInfo = FirstFnType->hasExtParameterInfos();
	bool SecondHasInfo = SecondFnType->hasExtParameterInfos();

	// Fast path: if the first type doesn't have ext parameter infos,
	// we match if and only if the second type also doesn't have them.
	if (!FirstHasInfo && !SecondHasInfo)
	return true;

	bool NeedParamInfo = false;
	size_t E = FirstHasInfo ? FirstFnType->getExtParameterInfos().size()
	: SecondFnType->getExtParameterInfos().size();

	for (size_t I = 0; I < E; ++I) {
	FunctionProtoType::ExtParameterInfo FirstParam, SecondParam;
	if (FirstHasInfo)
	FirstParam = FirstFnType->getExtParameterInfo(I);
	if (SecondHasInfo)
	SecondParam = SecondFnType->getExtParameterInfo(I);

	// Cannot merge unless everything except the noescape flag matches.
	if (FirstParam.withIsNoEscape(false) != SecondParam.withIsNoEscape(false))
	return false;

	bool FirstNoEscape = FirstParam.isNoEscape();
	bool SecondNoEscape = SecondParam.isNoEscape();
	bool IsNoEscape = FirstNoEscape && SecondNoEscape;
	NewParamInfos.push_back(FirstParam.withIsNoEscape(IsNoEscape));
	if (NewParamInfos.back().getOpaqueValue())
	NeedParamInfo = true;
	if (FirstNoEscape != IsNoEscape)
	CanUseFirst = false;
	if (SecondNoEscape != IsNoEscape)
	CanUseSecond = false;
	}

	if (!NeedParamInfo)
	NewParamInfos.clear();

	return true;
	}

	/// Reset the ObjCLayouts entry for \p CD to null, creating the entry if the
	/// container has none yet.
	void ASTContext::ResetObjCLayout(const ObjCContainerDecl *CD) {
	  auto &LayoutSlot = ObjCLayouts[CD];
	  LayoutSlot = nullptr;
	}

	/// mergeObjCGCQualifiers - This routine merges ObjC's GC attribute of 'LHS' and
	/// 'RHS' attributes and returns the merged version; including for function
	/// return types.
	///
	/// \returns LHS or RHS when one of them can stand for both, a rebuilt function
	/// type when only the return types' GC qualifiers differ, or a null QualType
	/// when the two types cannot be merged.
	QualType ASTContext::mergeObjCGCQualifiers(QualType LHS, QualType RHS) {
	  QualType LHSCan = getCanonicalType(LHS),
	           RHSCan = getCanonicalType(RHS);
	  // If two types are identical, they are compatible.
	  if (LHSCan == RHSCan)
	    return LHS;

	  if (RHSCan->isFunctionType()) {
	    if (!LHSCan->isFunctionType())
	      return {};
	    // Merge the return types; RHS plays the role of the "old" declaration.
	    QualType OldReturnType =
	        cast<FunctionType>(RHSCan.getTypePtr())->getReturnType();
	    QualType NewReturnType =
	        cast<FunctionType>(LHSCan.getTypePtr())->getReturnType();
	    QualType ResReturnType =
	        mergeObjCGCQualifiers(NewReturnType, OldReturnType);
	    if (ResReturnType.isNull())
	      return {};
	    if (ResReturnType == NewReturnType || ResReturnType == OldReturnType) {
	      // id foo(); ... __strong id foo(); or: __strong id foo(); ... id foo();
	      // In either case, use OldReturnType to build the new function type.
	      const auto *F = LHS->castAs<FunctionType>();
	      // Use dyn_cast<>, not cast<>: cast<> never yields null and asserts when
	      // F is not a FunctionProtoType, so it cannot be used as a test. A
	      // function type without a prototype falls through to the null result.
	      if (const auto *FPT = dyn_cast<FunctionProtoType>(F)) {
	        FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo();
	        EPI.ExtInfo = getFunctionExtInfo(LHS);
	        QualType ResultType =
	            getFunctionType(OldReturnType, FPT->getParamTypes(), EPI);
	        return ResultType;
	      }
	    }
	    return {};
	  }

	  // If the qualifiers are different, the types can still be merged.
	  Qualifiers LQuals = LHSCan.getLocalQualifiers();
	  Qualifiers RQuals = RHSCan.getLocalQualifiers();
	  if (LQuals != RQuals) {
	    // If any of these qualifiers are different, we have a type mismatch.
	    if (LQuals.getCVRQualifiers() != RQuals.getCVRQualifiers() ||
	        LQuals.getAddressSpace() != RQuals.getAddressSpace())
	      return {};

	    // Exactly one GC qualifier difference is allowed: __strong is
	    // okay if the other type has no GC qualifier but is an Objective
	    // C object pointer (i.e. implicitly strong by default). We fix
	    // this by pretending that the unqualified type was actually
	    // qualified __strong.
	    Qualifiers::GC GC_L = LQuals.getObjCGCAttr();
	    Qualifiers::GC GC_R = RQuals.getObjCGCAttr();
	    assert((GC_L != GC_R) && "unequal qualifier sets had only equal elements");

	    // __weak on either side can never be merged.
	    if (GC_L == Qualifiers::Weak || GC_R == Qualifiers::Weak)
	      return {};

	    // The explicitly __strong side is the merged type.
	    if (GC_L == Qualifiers::Strong)
	      return LHS;
	    if (GC_R == Qualifiers::Strong)
	      return RHS;
	    return {};
	  }

	  // Same qualifiers: for Objective-C object pointers, merge the pointees and
	  // return whichever input already has the merged pointee.
	  if (LHSCan->isObjCObjectPointerType() && RHSCan->isObjCObjectPointerType()) {
	    QualType LHSBaseQT = LHS->castAs<ObjCObjectPointerType>()->getPointeeType();
	    QualType RHSBaseQT = RHS->castAs<ObjCObjectPointerType>()->getPointeeType();
	    QualType ResQT = mergeObjCGCQualifiers(LHSBaseQT, RHSBaseQT);
	    if (ResQT == LHSBaseQT)
	      return LHS;
	    if (ResQT == RHSBaseQT)
	      return RHS;
	  }
	  return {};
	}

	//===----------------------------------------------------------------------===//
	// Integer Predicates
	//===----------------------------------------------------------------------===//

	/// Return the width in bits of the integer type \p T.
	///
	/// Enum types are measured through their underlying integer type, boolean
	/// types are one bit wide, and _ExtInt types report their declared bit count.
	/// Every other type falls back to getTypeSize().
	unsigned ASTContext::getIntWidth(QualType T) const {
	  // Look through an enum to the integer type that represents it.
	  if (const auto *EnumTy = T->getAs<EnumType>())
	    T = EnumTy->getDecl()->getIntegerType();

	  if (T->isBooleanType())
	    return 1;

	  const auto *ExtIntTy = T->getAs<ExtIntType>();
	  // For builtin types, just use the standard type sizing method.
	  return ExtIntTy ? ExtIntTy->getNumBits()
	                  : static_cast<unsigned>(getTypeSize(T));
	}

	/// Return the unsigned counterpart of the signed integer, signed fixed-point,
	/// signed enum, signed _ExtInt, or signed-integer vector type \p T.
	///
	/// \param T a type with a signed integer representation or a signed
	///          fixed-point type (asserted).
	/// \returns the unsigned type of the same kind and width; for vectors the
	///          element type is converted and the element count and vector kind
	///          are preserved.
	QualType ASTContext::getCorrespondingUnsignedType(QualType T) const {
	  assert((T->hasSignedIntegerRepresentation() || T->isSignedFixedPointType()) &&
	         "Unexpected type");

	  // Turn <4 x signed int> -> <4 x unsigned int>
	  if (const auto *VTy = T->getAs<VectorType>())
	    return getVectorType(getCorrespondingUnsignedType(VTy->getElementType()),
	                         VTy->getNumElements(), VTy->getVectorKind());

	  // For enums, we return the unsigned version of the base type.
	  if (const auto *ETy = T->getAs<EnumType>())
	    T = ETy->getDecl()->getIntegerType();

	  // A signed _ExtInt passes the assertion above but is not a BuiltinType, so
	  // it must be handled before the castAs<BuiltinType>() below. The result is
	  // the unsigned _ExtInt of the same width.
	  if (const auto *EITy = T->getAs<ExtIntType>())
	    return getExtIntType(/*Unsigned=*/true, EITy->getNumBits());

	  switch (T->castAs<BuiltinType>()->getKind()) {
	  // Plain integer types.
	  case BuiltinType::Char_S:
	  case BuiltinType::SChar:
	    return UnsignedCharTy;
	  case BuiltinType::Short:
	    return UnsignedShortTy;
	  case BuiltinType::Int:
	    return UnsignedIntTy;
	  case BuiltinType::Long:
	    return UnsignedLongTy;
	  case BuiltinType::LongLong:
	    return UnsignedLongLongTy;
	  case BuiltinType::Int128:
	    return UnsignedInt128Ty;

	  // Fixed-point accumulator types, plain and saturating.
	  case BuiltinType::ShortAccum:
	    return UnsignedShortAccumTy;
	  case BuiltinType::Accum:
	    return UnsignedAccumTy;
	  case BuiltinType::LongAccum:
	    return UnsignedLongAccumTy;
	  case BuiltinType::SatShortAccum:
	    return SatUnsignedShortAccumTy;
	  case BuiltinType::SatAccum:
	    return SatUnsignedAccumTy;
	  case BuiltinType::SatLongAccum:
	    return SatUnsignedLongAccumTy;

	  // Fixed-point fractional types, plain and saturating.
	  case BuiltinType::ShortFract:
	    return UnsignedShortFractTy;
	  case BuiltinType::Fract:
	    return UnsignedFractTy;
	  case BuiltinType::LongFract:
	    return UnsignedLongFractTy;
	  case BuiltinType::SatShortFract:
	    return SatUnsignedShortFractTy;
	  case BuiltinType::SatFract:
	    return SatUnsignedFractTy;
	  case BuiltinType::SatLongFract:
	    return SatUnsignedLongFractTy;
	  default:
	    llvm_unreachable("Unexpected signed integer or fixed point type");
	  }
	}

	// Out-of-line definition of the compiler-generated (defaulted) destructor.
	ASTMutationListener::~ASTMutationListener() = default;

	/// Base-class implementation of the DeducedReturnType notification: the body
	/// is empty, so both \p FD and \p ReturnType are ignored.
	void ASTMutationListener::DeducedReturnType(const FunctionDecl *FD,
	QualType ReturnType) {}

	//===----------------------------------------------------------------------===//
	// Builtin Type Computation
	//===----------------------------------------------------------------------===//

	/// DecodeTypeFromStr - This decodes one type descriptor from Str, advancing the
	/// pointer over the consumed characters. This returns the resultant type. If
	/// AllowTypeModifiers is false then modifier like * are not parsed, just basic
	/// types. This allows "v2i*" to be parsed as a pointer to a v2i instead of
	/// a vector of "i*".
	///
	/// RequiresICE is filled in on return to indicate whether the value is required
	/// to be an Integer Constant Expression.
	///
	/// A descriptor is parsed in three steps: prefix modifiers (I, S, U, L, N, W,
	/// Z, O), one base-type letter, and - only when AllowTypeModifiers is true -
	/// suffix modifiers (*, &, C, D, R).
	///
	/// Error is only written when a type that must be declared by the user
	/// (FILE, jmp_buf/sigjmp_buf, ucontext_t) is missing; in that case a null
	/// QualType is returned. Error is left untouched on success.
	static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
	ASTContext::GetBuiltinTypeError &Error,
	bool &RequiresICE,
	bool AllowTypeModifiers) {
	// Modifiers.
	// HowLong counts the effective number of 'long' prefixes (0..3).
	int HowLong = 0;
	bool Signed = false, Unsigned = false;
	RequiresICE = false;

	// Read the prefixed modifiers first.
	bool Done = false;
	// IsSpecial only feeds assertions, so it exists in assert-enabled builds
	// only. It records that one of 'N', 'W', 'Z' or 'O' has been seen.
	#ifndef NDEBUG
	bool IsSpecial = false;
	#endif
	while (!Done) {
	switch (*Str++) {
	// Not a prefix modifier: un-consume the character and stop.
	default: Done = true; --Str; break;
	case 'I':
	// The value must be an integer constant expression.
	RequiresICE = true;
	break;
	case 'S':
	assert(!Unsigned && "Can't use both 'S' and 'U' modifiers!");
	assert(!Signed && "Can't use 'S' modifier multiple times!");
	Signed = true;
	break;
	case 'U':
	assert(!Signed && "Can't use both 'S' and 'U' modifiers!");
	assert(!Unsigned && "Can't use 'U' modifier multiple times!");
	Unsigned = true;
	break;
	case 'L':
	assert(!IsSpecial && "Can't use 'L' with 'W', 'N', 'Z' or 'O' modifiers");
	assert(HowLong <= 2 && "Can't have LLLL modifier");
	++HowLong;
	break;
	case 'N':
	// 'N' behaves like 'L' for all non LP64 targets and 'int' otherwise.
	assert(!IsSpecial && "Can't use two 'N', 'W', 'Z' or 'O' modifiers!");
	assert(HowLong == 0 && "Can't use both 'L' and 'N' modifiers!");
	#ifndef NDEBUG
	IsSpecial = true;
	#endif
	if (Context.getTargetInfo().getLongWidth() == 32)
	++HowLong;
	break;
	case 'W':
	// This modifier represents int64 type.
	assert(!IsSpecial && "Can't use two 'N', 'W', 'Z' or 'O' modifiers!");
	assert(HowLong == 0 && "Can't use both 'L' and 'W' modifiers!");
	#ifndef NDEBUG
	IsSpecial = true;
	#endif
	// Map the target's 64-bit integer type onto the matching 'long' count.
	switch (Context.getTargetInfo().getInt64Type()) {
	default:
	llvm_unreachable("Unexpected integer type");
	case TargetInfo::SignedLong:
	HowLong = 1;
	break;
	case TargetInfo::SignedLongLong:
	HowLong = 2;
	break;
	}
	break;
	case 'Z':
	// This modifier represents int32 type.
	assert(!IsSpecial && "Can't use two 'N', 'W', 'Z' or 'O' modifiers!");
	assert(HowLong == 0 && "Can't use both 'L' and 'Z' modifiers!");
	#ifndef NDEBUG
	IsSpecial = true;
	#endif
	// Map the target's signed 32-bit integer type onto the matching 'long'
	// count.
	switch (Context.getTargetInfo().getIntTypeByWidth(32, true)) {
	default:
	llvm_unreachable("Unexpected integer type");
	case TargetInfo::SignedInt:
	HowLong = 0;
	break;
	case TargetInfo::SignedLong:
	HowLong = 1;
	break;
	case TargetInfo::SignedLongLong:
	HowLong = 2;
	break;
	}
	break;
	case 'O':
	// Sets HowLong to 1 when compiling OpenCL and to 2 otherwise.
	assert(!IsSpecial && "Can't use two 'N', 'W', 'Z' or 'O' modifiers!");
	assert(HowLong == 0 && "Can't use both 'L' and 'O' modifiers!");
	#ifndef NDEBUG
	IsSpecial = true;
	#endif
	if (Context.getLangOpts().OpenCL)
	HowLong = 1;
	else
	HowLong = 2;
	break;
	}
	}

	QualType Type;

	// Read the base type.
	switch (*Str++) {
	default: llvm_unreachable("Unknown builtin type letter!");
	case 'y':
	// bfloat16
	assert(HowLong == 0 && !Signed && !Unsigned &&
	"Bad modifiers used with 'y'!");
	Type = Context.BFloat16Ty;
	break;
	case 'v':
	// void
	assert(HowLong == 0 && !Signed && !Unsigned &&
	"Bad modifiers used with 'v'!");
	Type = Context.VoidTy;
	break;
	case 'h':
	// half
	assert(HowLong == 0 && !Signed && !Unsigned &&
	"Bad modifiers used with 'h'!");
	Type = Context.HalfTy;
	break;
	case 'f':
	// float
	assert(HowLong == 0 && !Signed && !Unsigned &&
	"Bad modifiers used with 'f'!");
	Type = Context.FloatTy;
	break;
	case 'd':
	// double; one 'long' selects long double, two select __float128.
	assert(HowLong < 3 && !Signed && !Unsigned &&
	"Bad modifiers used with 'd'!");
	if (HowLong == 1)
	Type = Context.LongDoubleTy;
	else if (HowLong == 2)
	Type = Context.Float128Ty;
	else
	Type = Context.DoubleTy;
	break;
	case 's':
	// short / unsigned short
	assert(HowLong == 0 && "Bad modifiers used with 's'!");
	if (Unsigned)
	Type = Context.UnsignedShortTy;
	else
	Type = Context.ShortTy;
	break;
	case 'i':
	// int, widened by HowLong: 1 = long, 2 = long long, 3 = __int128.
	if (HowLong == 3)
	Type = Unsigned ? Context.UnsignedInt128Ty : Context.Int128Ty;
	else if (HowLong == 2)
	Type = Unsigned ? Context.UnsignedLongLongTy : Context.LongLongTy;
	else if (HowLong == 1)
	Type = Unsigned ? Context.UnsignedLongTy : Context.LongTy;
	else
	Type = Unsigned ? Context.UnsignedIntTy : Context.IntTy;
	break;
	case 'c':
	// char; 'S'/'U' select signed char / unsigned char explicitly.
	assert(HowLong == 0 && "Bad modifiers used with 'c'!");
	if (Signed)
	Type = Context.SignedCharTy;
	else if (Unsigned)
	Type = Context.UnsignedCharTy;
	else
	Type = Context.CharTy;
	break;
	case 'b': // boolean
	assert(HowLong == 0 && !Signed && !Unsigned && "Bad modifiers for 'b'!");
	Type = Context.BoolTy;
	break;
	case 'z': // size_t.
	assert(HowLong == 0 && !Signed && !Unsigned && "Bad modifiers for 'z'!");
	Type = Context.getSizeType();
	break;
	case 'w': // wchar_t.
	assert(HowLong == 0 && !Signed && !Unsigned && "Bad modifiers for 'w'!");
	Type = Context.getWideCharType();
	break;
	case 'F':
	// CF constant string type.
	Type = Context.getCFConstantStringType();
	break;
	case 'G':
	// Objective-C 'id'.
	Type = Context.getObjCIdType();
	break;
	case 'H':
	// Objective-C 'SEL'.
	Type = Context.getObjCSelType();
	break;
	case 'M':
	// Objective-C super type.
	Type = Context.getObjCSuperType();
	break;
	case 'a':
	// The builtin va_list type, passed as-is.
	Type = Context.getBuiltinVaListType();
	assert(!Type.isNull() && "builtin va list type not initialized!");
	break;
	case 'A':
	// This is a "reference" to a va_list; however, what exactly
	// this means depends on how va_list is defined. There are two
	// different kinds of va_list: ones passed by value, and ones
	// passed by reference. An example of a by-value va_list is
	// x86, where va_list is a char*. An example of by-ref va_list
	// is x86-64, where va_list is a __va_list_tag[1]. For x86,
	// we want this argument to be a char*&; for x86-64, we want
	// it to be a __va_list_tag*.
	Type = Context.getBuiltinVaListType();
	assert(!Type.isNull() && "builtin va list type not initialized!");
	if (Type->isArrayType())
	Type = Context.getArrayDecayedType(Type);
	else
	Type = Context.getLValueReferenceType(Type);
	break;
	case 'q': {
	// Scalable vector: a decimal element count followed by the element type.
	char *End;
	unsigned NumElements = strtoul(Str, &End, 10);
	assert(End != Str && "Missing vector size");
	Str = End;

	// The element type is parsed without suffix modifiers.
	QualType ElementType = DecodeTypeFromStr(Str, Context, Error,
	RequiresICE, false);
	assert(!RequiresICE && "Can't require vector ICE");

	Type = Context.getScalableVectorType(ElementType, NumElements);
	break;
	}
	case 'V': {
	// Generic vector: a decimal element count followed by the element type.
	char *End;
	unsigned NumElements = strtoul(Str, &End, 10);
	assert(End != Str && "Missing vector size");
	Str = End;

	QualType ElementType = DecodeTypeFromStr(Str, Context, Error,
	RequiresICE, false);
	assert(!RequiresICE && "Can't require vector ICE");

	// TODO: No way to make AltiVec vectors in builtins yet.
	Type = Context.getVectorType(ElementType, NumElements,
	VectorType::GenericVector);
	break;
	}
	case 'E': {
	// Ext vector: a decimal element count followed by the element type.
	char *End;

	unsigned NumElements = strtoul(Str, &End, 10);
	assert(End != Str && "Missing vector size");

	Str = End;

	QualType ElementType = DecodeTypeFromStr(Str, Context, Error, RequiresICE,
	false);
	Type = Context.getExtVectorType(ElementType, NumElements);
	break;
	}
	case 'X': {
	// Complex type of the element type that follows.
	QualType ElementType = DecodeTypeFromStr(Str, Context, Error, RequiresICE,
	false);
	assert(!RequiresICE && "Can't require complex ICE");
	Type = Context.getComplexType(ElementType);
	break;
	}
	case 'Y':
	// ptrdiff_t
	Type = Context.getPointerDiffType();
	break;
	case 'P':
	// FILE; reported as an error when the type is not available.
	Type = Context.getFILEType();
	if (Type.isNull()) {
	Error = ASTContext::GE_Missing_stdio;
	return {};
	}
	break;
	case 'J':
	// jmp_buf, or sigjmp_buf when prefixed with 'S'; reported as an error when
	// the type is not available.
	if (Signed)
	Type = Context.getsigjmp_bufType();
	else
	Type = Context.getjmp_bufType();

	if (Type.isNull()) {
	Error = ASTContext::GE_Missing_setjmp;
	return {};
	}
	break;
	case 'K':
	// ucontext_t; reported as an error when the type is not available.
	assert(HowLong == 0 && !Signed && !Unsigned && "Bad modifiers for 'K'!");
	Type = Context.getucontext_tType();

	if (Type.isNull()) {
	Error = ASTContext::GE_Missing_ucontext;
	return {};
	}
	break;
	case 'p':
	// Process ID type.
	Type = Context.getProcessIDType();
	break;
	}

	// If there are modifiers and if we're allowed to parse them, go for it.
	Done = !AllowTypeModifiers;
	while (!Done) {
	switch (char c = *Str++) {
	// Not a suffix modifier: un-consume the character and stop.
	default: Done = true; --Str; break;
	case '*':
	case '&': {
	// Both pointers and references can have their pointee types
	// qualified with an address space.
	char *End;
	unsigned AddrSpace = strtoul(Str, &End, 10);
	// End != Str means at least one digit was consumed.
	if (End != Str) {
	// Note AddrSpace == 0 is not the same as an unspecified address space.
	Type = Context.getAddrSpaceQualType(
	Type,
	Context.getLangASForBuiltinAddressSpace(AddrSpace));
	Str = End;
	}
	if (c == '*')
	Type = Context.getPointerType(Type);
	else
	Type = Context.getLValueReferenceType(Type);
	break;
	}
	// FIXME: There's no way to have a built-in with an rvalue ref arg.
	case 'C':
	// const
	Type = Type.withConst();
	break;
	case 'D':
	// volatile
	Type = Context.getVolatileType(Type);
	break;
	case 'R':
	// restrict
	Type = Type.withRestrict();
	break;
	}
	}

	assert((!RequiresICE \|\| Type->isIntegralOrEnumerationType()) &&
	"Integer constant 'I' type must be an integer");

	return Type;
	}

	/// GetBuiltinType - Return the type for the specified builtin.
	///
	/// \param Id    the builtin whose type string is decoded.
	/// \param Error set to GE_None on success, or to the reason the type could
	///              not be built (empty type string, or a missing user-declared
	///              type reported by DecodeTypeFromStr).
	/// \param IntegerConstantArgs optional bitmask; bit N is OR-ed in when
	///              argument N must be an integer constant expression. The mask
	///              is only ever OR-ed into here, never cleared.
	/// \returns the function type of the builtin, or a null QualType on error
	///          and for __GetExceptionInfo.
	QualType ASTContext::GetBuiltinType(unsigned Id,
	                                    GetBuiltinTypeError &Error,
	                                    unsigned *IntegerConstantArgs) const {
	  const char *TypeStr = BuiltinInfo.getTypeString(Id);
	  if (TypeStr[0] == '\0') {
	    Error = GE_Missing_type;
	    return {};
	  }

	  SmallVector<QualType, 8> ArgTypes;

	  bool RequiresICE = false;
	  Error = GE_None;
	  // The first descriptor in the string is the result type.
	  QualType ResType = DecodeTypeFromStr(TypeStr, *this, Error,
	                                       RequiresICE, true);
	  if (Error != GE_None)
	    return {};

	  assert(!RequiresICE && "Result of intrinsic cannot be required to be an ICE");

	  // The remaining descriptors, up to the end or a '.', are the parameters.
	  while (TypeStr[0] && TypeStr[0] != '.') {
	    QualType Ty = DecodeTypeFromStr(TypeStr, *this, Error, RequiresICE, true);
	    if (Error != GE_None)
	      return {};

	    // If this argument is required to be an IntegerConstantExpression and the
	    // caller cares, fill in the bitmask we return. The mask is unsigned, so
	    // shift an unsigned one: shifting a signed 1 into the sign bit is
	    // undefined behavior.
	    if (RequiresICE && IntegerConstantArgs)
	      *IntegerConstantArgs |= 1u << ArgTypes.size();

	    // Do array -> pointer decay. The builtin should use the decayed type.
	    if (Ty->isArrayType())
	      Ty = getArrayDecayedType(Ty);

	    ArgTypes.push_back(Ty);
	  }

	  // No function type is built for __GetExceptionInfo.
	  if (Id == Builtin::BI__GetExceptionInfo)
	    return {};

	  assert((TypeStr[0] != '.' || TypeStr[1] == 0) &&
	         "'.' should only occur at end of builtin type list!");

	  bool Variadic = (TypeStr[0] == '.');

	  FunctionType::ExtInfo EI(getDefaultCallingConvention(
	      Variadic, /*IsCXXMethod=*/false, /*IsBuiltin=*/true));
	  if (BuiltinInfo.isNoReturn(Id)) EI = EI.withNoReturn(true);

	  // We really shouldn't be making a no-proto type here.
	  if (ArgTypes.empty() && Variadic && !getLangOpts().CPlusPlus)
	    return getFunctionNoProtoType(ResType, EI);

	  FunctionProtoType::ExtProtoInfo EPI;
	  EPI.ExtInfo = EI;
	  EPI.Variadic = Variadic;
	  // In C++ a nothrow builtin gets a non-throwing exception specification:
	  // noexcept from C++11 on, throw() before that.
	  if (getLangOpts().CPlusPlus && BuiltinInfo.isNoThrow(Id))
	    EPI.ExceptionSpec.Type =
	        getLangOpts().CPlusPlus11 ? EST_BasicNoexcept : EST_DynamicNone;

	  return getFunctionType(ResType, ArgTypes, EPI);
	}

	/// Compute the GVA linkage of \p FD from its visibility, its template
	/// specialization kind and its inline semantics, before any attribute or
	/// external-source adjustment.
	static GVALinkage basicGVALinkageForFunction(const ASTContext &Context,
	                                             const FunctionDecl *FD) {
	  if (!FD->isExternallyVisible())
	    return GVA_Internal;

	  // Non-user-provided functions get emitted as weak definitions with every
	  // use, no matter whether they've been explicitly instantiated etc.
	  if (const auto *Method = dyn_cast<CXXMethodDecl>(FD)) {
	    if (!Method->isUserProvided())
	      return GVA_DiscardableODR;
	  }

	  // Linkage to use whenever the definition is treated as an ordinary external
	  // definition (not inlined, or an externally visible inline definition).
	  GVALinkage OutOfLineLinkage;
	  switch (FD->getTemplateSpecializationKind()) {
	  case TSK_ExplicitInstantiationDefinition:
	    return GVA_StrongODR;

	  // C++11 [temp.explicit]p10:
	  //   [ Note: The intent is that an inline function that is the subject of
	  //   an explicit instantiation declaration will still be implicitly
	  //   instantiated when used so that the body can be considered for
	  //   inlining, but that no out-of-line copy of the inline function would be
	  //   generated in the translation unit. -- end note ]
	  case TSK_ExplicitInstantiationDeclaration:
	    return GVA_AvailableExternally;

	  case TSK_Undeclared:
	  case TSK_ExplicitSpecialization:
	    OutOfLineLinkage = GVA_StrongExternal;
	    break;

	  case TSK_ImplicitInstantiation:
	    OutOfLineLinkage = GVA_DiscardableODR;
	    break;
	  }

	  if (!FD->isInlined())
	    return OutOfLineLinkage;

	  // GNU or C99 inline semantics apply to non-C++ code (unless the target uses
	  // the Microsoft ABI or the function is dllexport'ed) and to any function
	  // marked gnu_inline.
	  // FIXME: This doesn't match gcc's behavior for dllexport inline functions.
	  const bool UsesCInlineRules =
	      !Context.getLangOpts().CPlusPlus &&
	      !Context.getTargetInfo().getCXXABI().isMicrosoft() &&
	      !FD->hasAttr<DLLExportAttr>();
	  if (UsesCInlineRules || FD->hasAttr<GNUInlineAttr>()) {
	    // Externally visible inline definitions keep the out-of-line linkage;
	    // under C99 inline semantics the symbol is otherwise not externally
	    // visible.
	    return FD->isInlineDefinitionExternallyVisible() ? OutOfLineLinkage
	                                                     : GVA_AvailableExternally;
	  }

	  // Functions specified with extern and inline in -fms-compatibility mode
	  // forcibly get emitted.  While the body of the function cannot be later
	  // replaced, the function definition cannot be discarded.
	  return FD->isMSExternInline() ? GVA_StrongODR : GVA_DiscardableODR;
	}

	/// Adjust a previously computed linkage for dllimport, dllexport and CUDA
	/// __global__ attributes on the declaration.
	///
	/// \param Context supplies the language options (CUDA device mode).
	/// \param D the declaration whose attributes are inspected.
	/// \param L the linkage computed so far.
	/// \returns the adjusted linkage, or \p L unchanged when no rule applies.
	static GVALinkage adjustGVALinkageForAttributes(const ASTContext &Context,
	                                                const Decl *D, GVALinkage L) {
	  // See http://msdn.microsoft.com/en-us/library/xa0d9ste.aspx
	  // dllexport/dllimport on inline functions.
	  if (D->hasAttr<DLLImportAttr>()) {
	    if (L == GVA_DiscardableODR || L == GVA_StrongODR)
	      return GVA_AvailableExternally;
	  } else if (D->hasAttr<DLLExportAttr>()) {
	    if (L == GVA_DiscardableODR)
	      return GVA_StrongODR;
	  } else if (Context.getLangOpts().CUDA && Context.getLangOpts().CUDAIsDevice &&
	             D->hasAttr<CUDAGlobalAttr>()) {
	    // Device-side functions with __global__ attribute must always be
	    // visible externally so they can be launched from host.
	    if (L == GVA_DiscardableODR || L == GVA_Internal)
	      return GVA_StrongODR;
	  }
	  return L;
	}

	/// Refine a linkage using what the external AST source (if any) reports about
	/// definitions of \p D existing elsewhere. Without an external source the
	/// linkage \p L is returned untouched.
	static GVALinkage
	adjustGVALinkageForExternalDefinitionKind(const ASTContext &Ctx, const Decl *D,
	                                          GVALinkage L) {
	  ExternalASTSource *ExtSource = Ctx.getExternalSource();
	  if (!ExtSource)
	    return L;

	  switch (ExtSource->hasExternalDefinitions(D)) {
	  case ExternalASTSource::EK_Always:
	    return GVA_AvailableExternally;

	  case ExternalASTSource::EK_Never:
	    // Other translation units rely on us to provide the definition, so a
	    // discardable definition is promoted to a strong one.
	    return L == GVA_DiscardableODR ? GVA_StrongODR : L;

	  case ExternalASTSource::EK_ReplyHazy:
	    return L;
	  }
	  return L;
	}

	/// Linkage of a function: the basic linkage, then the attribute adjustment,
	/// then the external-definition adjustment, applied in that order.
	GVALinkage ASTContext::GetGVALinkageForFunction(const FunctionDecl *FD) const {
	  const GVALinkage Basic = basicGVALinkageForFunction(*this, FD);
	  const GVALinkage WithAttrs = adjustGVALinkageForAttributes(*this, FD, Basic);
	  return adjustGVALinkageForExternalDefinitionKind(*this, FD, WithAttrs);
	}

	/// Compute the base linkage of a variable from its visibility, whether it is
	/// a static local, its inline-variable kind and its template specialization
	/// kind. Attribute- and external-source-based adjustments are applied
	/// separately by the caller.
	///
	/// \param Context the AST context used for function linkage, inline-variable
	///        classification and target ABI queries.
	/// \param VD the variable being classified.
	static GVALinkage basicGVALinkageForVariable(const ASTContext &Context,
	                                             const VarDecl *VD) {
	  if (!VD->isExternallyVisible())
	    return GVA_Internal;

	  if (VD->isStaticLocal()) {
	    // Walk outwards through lexical parents until a FunctionDecl is found.
	    const DeclContext *LexicalContext = VD->getParentFunctionOrMethod();
	    while (LexicalContext && !isa<FunctionDecl>(LexicalContext))
	      LexicalContext = LexicalContext->getLexicalParent();

	    // ObjC Blocks can create local variables that don't have a FunctionDecl
	    // LexicalContext.
	    if (!LexicalContext)
	      return GVA_DiscardableODR;

	    // Otherwise, let the static local variable inherit its linkage from the
	    // nearest enclosing function.
	    auto StaticLocalLinkage =
	        Context.GetGVALinkageForFunction(cast<FunctionDecl>(LexicalContext));

	    // Itanium ABI 5.2.2: "Each COMDAT group [for a static local variable] must
	    // be emitted in any object with references to the symbol for the object it
	    // contains, whether inline or out-of-line."
	    // Similar behavior is observed with MSVC. An alternative ABI could use
	    // StrongODR/AvailableExternally to match the function, but none are
	    // known/supported currently.
	    if (StaticLocalLinkage == GVA_StrongODR ||
	        StaticLocalLinkage == GVA_AvailableExternally)
	      return GVA_DiscardableODR;
	    return StaticLocalLinkage;
	  }

	  // MSVC treats in-class initialized static data members as definitions.
	  // By giving them non-strong linkage, out-of-line definitions won't
	  // cause link errors.
	  if (Context.isMSStaticDataMemberInlineDefinition(VD))
	    return GVA_DiscardableODR;

	  // Most non-template variables have strong linkage; inline variables are
	  // linkonce_odr or (occasionally, for compatibility) weak_odr.
	  GVALinkage StrongLinkage;
	  switch (Context.getInlineVariableDefinitionKind(VD)) {
	  case ASTContext::InlineVariableDefinitionKind::None:
	    StrongLinkage = GVA_StrongExternal;
	    break;
	  case ASTContext::InlineVariableDefinitionKind::Weak:
	  case ASTContext::InlineVariableDefinitionKind::WeakUnknown:
	    StrongLinkage = GVA_DiscardableODR;
	    break;
	  case ASTContext::InlineVariableDefinitionKind::Strong:
	    StrongLinkage = GVA_StrongODR;
	    break;
	  }

	  switch (VD->getTemplateSpecializationKind()) {
	  case TSK_Undeclared:
	    return StrongLinkage;

	  case TSK_ExplicitSpecialization:
	    return Context.getTargetInfo().getCXXABI().isMicrosoft() &&
	                   VD->isStaticDataMember()
	               ? GVA_StrongODR
	               : StrongLinkage;

	  case TSK_ExplicitInstantiationDefinition:
	    return GVA_StrongODR;

	  case TSK_ExplicitInstantiationDeclaration:
	    return GVA_AvailableExternally;

	  case TSK_ImplicitInstantiation:
	    return GVA_DiscardableODR;
	  }

	  llvm_unreachable("Invalid Linkage!");
	}

	/// Linkage of a variable: the basic linkage, then the attribute adjustment,
	/// then the external-definition adjustment, applied in that order.
	GVALinkage ASTContext::GetGVALinkageForVariable(const VarDecl *VD) {
	  const GVALinkage Basic = basicGVALinkageForVariable(*this, VD);
	  const GVALinkage WithAttrs = adjustGVALinkageForAttributes(*this, VD, Basic);
	  return adjustGVALinkageForExternalDefinitionKind(*this, VD, WithAttrs);
	}

	bool ASTContext::DeclMustBeEmitted(const Decl *D) {
	if (const auto *VD = dyn_cast<VarDecl>(D)) {
	if (!VD->isFileVarDecl())
	return false;
	// Global named register variables (GNU extension) are never emitted.
	if (VD->getStorageClass() == SC_Register)
	return false;
	if (VD->getDescribedVarTemplate() \|\|
	isa<VarTemplatePartialSpecializationDecl>(VD))
	return false;
	} else if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
	// We never need to emit an uninstantiated function template.
	if (FD->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate)
	return false;
	} else if (isa<PragmaCommentDecl>(D))
	return true;
	else if (isa<PragmaDetectMismatchDecl>(D))
	return true;
	else if (isa<OMPRequiresDecl>(D))
	return true;
	else if (isa<OMPThreadPrivateDecl>(D))
	return !D->getDeclContext()->isDependentContext();
	else if (isa<OMPAllocateDecl>(D))
	return !D->getDeclContext()->isDependentContext();
	else if (isa<OMPDeclareReductionDecl>(D) \|\| isa<OMPDeclareMapperDecl>(D))
	return !D->getDeclContext()->isDependentContext();
	else if (isa<ImportDecl>(D))
	return true;
	else
	return false;

	if (D->isFromASTFile() && !LangOpts.BuildingPCHWithObjectFile) {
	assert(getExternalSource() && "It's from an AST file; must have a source.");
	// On Windows, PCH files are built together with an object file. If this
	// declaration comes from such a PCH and DeclMustBeEmitted would return
	// true, it would have returned true and the decl would have been emitted
	// into that object file, so it doesn't need to be emitted here.
	// Note that decls are still emitted if they're referenced, as usual;
	// DeclMustBeEmitted is used to decide whether a decl must be emitted even
	// if it's not referenced.
	//
	// Explicit template instantiation definitions are tricky. If there was an
	// explicit template instantiation decl in the PCH before, it will look like
	// the definition comes from there, even if that was just the declaration.
	// (Explicit instantiation defs of variable templates always get emitted.)
	bool IsExpInstDef =
	isa<FunctionDecl>(D) &&
	cast<FunctionDecl>(D)->getTemplateSpecializationKind() ==
	TSK_ExplicitInstantiationDefinition;

	// Implicit member function definitions, such as operator= might not be
	// marked as template specializations, since they're not coming from a
	// template but synthesized directly on the class.
	IsExpInstDef \|=
	isa<CXXMethodDecl>(D) &&
	cast<CXXMethodDecl>(D)->getParent()->getTemplateSpecializationKind() ==
	TSK_ExplicitInstantiationDefinition;

	if (getExternalSource()->DeclIsFromPCHWithObjectFile(D) && !IsExpInstDef)
	return false;
	}

	// If this is a member of a class template, we do not need to emit it.
	if (D->getDeclContext()->isDependentContext())
	return false;

	// Weak references don't produce any output by themselves.
	if (D->hasAttr<WeakRefAttr>())
	return false;

	// Aliases and used decls are required.
	if (D->hasAttr<AliasAttr>() \|\| D->hasAttr<UsedAttr>())
	return true;

	if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
	// Forward declarations aren't required.
	if (!FD->doesThisDeclarationHaveABody())
	return FD->doesDeclarationForceExternallyVisibleDefinition();

	// Constructors and destructors are required.
	if (FD->hasAttr<ConstructorAttr>() \|\| FD->hasAttr<DestructorAttr>())
	return true;

	// The key function for a class is required. This rule only comes
	// into play when inline functions can be key functions, though.
	if (getTargetInfo().getCXXABI().canKeyFunctionBeInline()) {
	if (const auto *MD = dyn_cast<CXXMethodDecl>(FD)) {
	const CXXRecordDecl *RD = MD->getParent();
	if (MD->isOutOfLine() && RD->isDynamicClass()) {
	const CXXMethodDecl *KeyFunc = getCurrentKeyFunction(RD);
	if (KeyFunc && KeyFunc->getCanonicalDecl() == MD->getCanonicalDecl())
	return true;
	}
	}
	}

	GVALinkage Linkage = GetGVALinkageForFunction(FD);

	// static, static inline, always_inline, and extern inline functions can
	// always be deferred. Normal inline functions can be deferred in C99/C++.
	// Implicit template instantiations can also be deferred in C++.
	return !isDiscardableGVALinkage(Linkage);
	}

	const auto *VD = cast<VarDecl>(D);
	assert(VD->isFileVarDecl() && "Expected file scoped var");

	// If the decl is marked as `declare target to`, it should be emitted for the
	// host and for the device.
	if (LangOpts.OpenMP &&
	OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
	return true;

	if (VD->isThisDeclarationADefinition() == VarDecl::DeclarationOnly &&
	!isMSStaticDataMemberInlineDefinition(VD))
	return false;

	// Variables that can be needed in other TUs are required.
	auto Linkage = GetGVALinkageForVariable(VD);
	if (!isDiscardableGVALinkage(Linkage))
	return true;

	// We never need to emit a variable that is available in another TU.
	if (Linkage == GVA_AvailableExternally)
	return false;

	// Variables that have destruction with side-effects are required.
	if (VD->needsDestruction(*this))
	return true;

	// Variables that have initialization with side-effects are required.
	if (VD->getInit() && VD->getInit()->HasSideEffects(*this) &&
	// We can get a value-dependent initializer during error recovery.
	(VD->getInit()->isValueDependent() \|\| !VD->evaluateValue()))
	return true;

	// Likewise, variables with tuple-like bindings are required if their
	// bindings have side-effects.
	if (const auto *DD = dyn_cast<DecompositionDecl>(VD))
	for (const auto *BD : DD->bindings())
	if (const auto *BindingVD = BD->getHoldingVar())
	if (DeclMustBeEmitted(BindingVD))
	return true;

	return false;
	}

	/// Invoke \p Pred once for each distinct version of the multiversioned
	/// function \p FD.
	///
	/// Versions are found by looking up FD's name in its redeclaration context;
	/// a lookup result is visited when it is a function whose type matches FD's
	/// type and whose most recent declaration has not been visited yet.
	///
	/// \param FD a multiversioned function (asserted).
	/// \param Pred callback receiving the most recent declaration of each version.
	void ASTContext::forEachMultiversionedFunctionVersion(
	    const FunctionDecl *FD,
	    llvm::function_ref<void(FunctionDecl *)> Pred) const {
	  assert(FD->isMultiVersion() && "Only valid for multiversioned functions");
	  llvm::SmallDenseSet<const FunctionDecl *, 4> SeenDecls;
	  FD = FD->getMostRecentDecl();
	  for (auto *CurDecl :
	       FD->getDeclContext()->getRedeclContext()->lookup(FD->getDeclName())) {
	    // Check the result of getAsFunction() before dereferencing it; the
	    // previous null check only ran after getMostRecentDecl() had already
	    // been called through the pointer.
	    FunctionDecl *CurFD = CurDecl->getAsFunction();
	    if (!CurFD)
	      continue;
	    CurFD = CurFD->getMostRecentDecl();

	    // insert() reports whether the element was newly added, which replaces
	    // the separate linear search over the set followed by an insert.
	    if (hasSameType(CurFD->getType(), FD->getType()) &&
	        SeenDecls.insert(CurFD).second)
	      Pred(CurFD);
	  }
	}

	/// Pick the default calling convention for a function.
	///
	/// C++ methods defer to the C++ ABI object. Builtins always use the target's
	/// default. Everything else honours the user-selected default convention
	/// when it is applicable, falling back to the target's default otherwise.
	CallingConv ASTContext::getDefaultCallingConvention(bool IsVariadic,
	                                                    bool IsCXXMethod,
	                                                    bool IsBuiltin) const {
	  // Pass through to the C++ ABI object
	  if (IsCXXMethod)
	    return ABI->getDefaultMethodCallConv(IsVariadic);

	  // Builtins ignore user-specified default calling convention and remain the
	  // Target's default calling convention.
	  if (IsBuiltin)
	    return Target->getDefaultCallingConv();

	  switch (LangOpts.getDefaultCallingConv()) {
	  case LangOptions::DCC_None:
	    break;

	  case LangOptions::DCC_CDecl:
	    return CC_C;

	  case LangOptions::DCC_FastCall: {
	    if (getTargetInfo().hasFeature("sse2") && !IsVariadic)
	      return CC_X86FastCall;
	    break;
	  }

	  case LangOptions::DCC_StdCall: {
	    if (!IsVariadic)
	      return CC_X86StdCall;
	    break;
	  }

	  case LangOptions::DCC_VectorCall: {
	    // __vectorcall cannot be applied to variadic functions.
	    if (!IsVariadic)
	      return CC_X86VectorCall;
	    break;
	  }

	  case LangOptions::DCC_RegCall: {
	    // __regcall cannot be applied to variadic functions.
	    if (!IsVariadic)
	      return CC_X86RegCall;
	    break;
	  }
	  }

	  return Target->getDefaultCallingConv();
	}

	/// Ask the C++ ABI object whether \p RD is "nearly empty".
	bool ASTContext::isNearlyEmpty(const CXXRecordDecl *RD) const {
	  const bool NearlyEmpty = ABI->isNearlyEmpty(RD);
	  return NearlyEmpty;
	}

	/// Return the vtable context, creating it on first use: a Microsoft context
	/// for the Microsoft C++ ABI, otherwise an Itanium context whose component
	/// layout follows the RelativeCXXABIVTables language option.
	VTableContextBase *ASTContext::getVTableContext() {
	  if (VTContext.get())
	    return VTContext.get();

	  if (Target->getCXXABI().isMicrosoft()) {
	    VTContext.reset(new MicrosoftVTableContext(*this));
	  } else {
	    auto Layout = getLangOpts().RelativeCXXABIVTables
	                      ? ItaniumVTableContext::Relative
	                      : ItaniumVTableContext::Pointer;
	    VTContext.reset(new ItaniumVTableContext(*this, Layout));
	  }
	  return VTContext.get();
	}

	/// Create a name-mangling context matching the C++ ABI of a target.
	///
	/// \param T the target whose ABI selects the mangler; when null, this
	///        context's own target is used.
	/// \returns the result of ItaniumMangleContext::create or
	///          MicrosoftMangleContext::create, depending on the ABI kind.
	MangleContext *ASTContext::createMangleContext(const TargetInfo *T) {
	  if (!T)
	    T = Target;
	  switch (T->getCXXABI().getKind()) {
	  case TargetCXXABI::Fuchsia:
	  case TargetCXXABI::GenericAArch64:
	  case TargetCXXABI::GenericItanium:
	  case TargetCXXABI::GenericARM:
	  case TargetCXXABI::GenericMIPS:
	  case TargetCXXABI::iOS:
	  case TargetCXXABI::iOS64:
	  case TargetCXXABI::WebAssembly:
	  case TargetCXXABI::WatchOS:
	  case TargetCXXABI::XL:
	    return ItaniumMangleContext::create(*this, getDiagnostics());
	  case TargetCXXABI::Microsoft:
	    return MicrosoftMangleContext::create(*this, getDiagnostics());
	  }
	  llvm_unreachable("Unsupported ABI");
	}

	// Out-of-line, defaulted definition of the CXXABI destructor.
	CXXABI::~CXXABI() = default;

	/// Sum the memory held by the context's side tables: the record-layout map
	/// plus the capacity of each auxiliary container listed below.
	size_t ASTContext::getSideTableAllocatedMemory() const {
	  size_t Total = ASTRecordLayouts.getMemorySize();
	  Total += llvm::capacity_in_bytes(ObjCLayouts);
	  Total += llvm::capacity_in_bytes(KeyFunctions);
	  Total += llvm::capacity_in_bytes(ObjCImpls);
	  Total += llvm::capacity_in_bytes(BlockVarCopyInits);
	  Total += llvm::capacity_in_bytes(DeclAttrs);
	  Total += llvm::capacity_in_bytes(TemplateOrInstantiation);
	  Total += llvm::capacity_in_bytes(InstantiatedFromUsingDecl);
	  Total += llvm::capacity_in_bytes(InstantiatedFromUsingShadowDecl);
	  Total += llvm::capacity_in_bytes(InstantiatedFromUnnamedFieldDecl);
	  Total += llvm::capacity_in_bytes(OverriddenMethods);
	  Total += llvm::capacity_in_bytes(Types);
	  Total += llvm::capacity_in_bytes(VariableArrayTypes);
	  return Total;
	}

	/// getIntTypeForBitwidth -
	/// Look up the target integer type with the requested bit width and
	/// signedness. If the target offers none and the width is 128, the
	/// context's 128-bit integer types are used instead.
	/// Returns an empty type if no appropriate type exists.
	QualType ASTContext::getIntTypeForBitwidth(unsigned DestWidth,
	                                           unsigned Signed) const {
	  CanQualType QualTy =
	      getFromTargetType(getTargetInfo().getIntTypeByWidth(DestWidth, Signed));

	  const bool UseInt128Fallback = !QualTy && DestWidth == 128;
	  if (UseInt128Fallback)
	    return Signed ? Int128Ty : UnsignedInt128Ty;

	  return QualTy;
	}

	/// getRealTypeForBitwidth -
	/// Map the target's real type for the requested bit width onto the
	/// corresponding floating-point QualType.
	/// Returns an empty type if the target reports no suitable type.
	QualType ASTContext::getRealTypeForBitwidth(unsigned DestWidth,
	                                            bool ExplicitIEEE) const {
	  switch (getTargetInfo().getRealTypeByWidth(DestWidth, ExplicitIEEE)) {
	  case TargetInfo::NoFloat:
	    return {};
	  case TargetInfo::Float:
	    return FloatTy;
	  case TargetInfo::Double:
	    return DoubleTy;
	  case TargetInfo::LongDouble:
	    return LongDoubleTy;
	  case TargetInfo::Float128:
	    return Float128Ty;
	  }

	  llvm_unreachable("Unhandled TargetInfo::RealType value");
	}

	/// Record the mangling number of \p ND. Numbers of 1 or less are not stored;
	/// getManglingNumber() yields 1 for declarations without an entry.
	void ASTContext::setManglingNumber(const NamedDecl *ND, unsigned Number) {
	  if (Number <= 1)
	    return;
	  MangleNumbers[ND] = Number;
	}

	/// Return the stored mangling number of \p ND, or 1 when none was recorded.
	unsigned ASTContext::getManglingNumber(const NamedDecl *ND) const {
	  auto Entry = MangleNumbers.find(ND);
	  if (Entry == MangleNumbers.end())
	    return 1;
	  return Entry->second;
	}

	/// Record the static-local number of \p VD. Numbers of 1 or less are not
	/// stored; getStaticLocalNumber() yields 1 for variables without an entry.
	void ASTContext::setStaticLocalNumber(const VarDecl *VD, unsigned Number) {
	  if (Number <= 1)
	    return;
	  StaticLocalNumbers[VD] = Number;
	}

	/// Return the stored static-local number of \p VD, or 1 when none was
	/// recorded.
	unsigned ASTContext::getStaticLocalNumber(const VarDecl *VD) const {
	  auto Entry = StaticLocalNumbers.find(VD);
	  if (Entry == StaticLocalNumbers.end())
	    return 1;
	  return Entry->second;
	}

	/// Return the mangle-numbering context associated with \p DC, creating it on
	/// first request.
	MangleNumberingContext &
	ASTContext::getManglingNumberContext(const DeclContext *DC) {
	  assert(LangOpts.CPlusPlus); // We don't need mangling numbers for plain C.
	  auto &Slot = MangleNumberingContexts[DC];
	  if (!Slot)
	    Slot = createMangleNumberingContext();
	  return *Slot;
	}

	/// Return the extra mangle-numbering context associated with the declaration
	/// \p D, creating it on first request.
	MangleNumberingContext &
	ASTContext::getManglingNumberContext(NeedExtraManglingDecl_t, const Decl *D) {
	  assert(LangOpts.CPlusPlus); // We don't need mangling numbers for plain C.
	  auto &Slot = ExtraMangleNumberingContexts[D];
	  if (!Slot)
	    Slot = createMangleNumberingContext();
	  return *Slot;
	}

	/// Obtain a fresh mangle-numbering context from the C++ ABI object.
	std::unique_ptr<MangleNumberingContext>
	ASTContext::createMangleNumberingContext() const {
	  std::unique_ptr<MangleNumberingContext> Fresh =
	      ABI->createMangleNumberingContext();
	  return Fresh;
	}

	/// Query the C++ ABI object for the exception-object copy constructor
	/// registered for \p RD; the lookup is keyed on RD's first declaration.
	const CXXConstructorDecl *
	ASTContext::getCopyConstructorForExceptionObject(CXXRecordDecl *RD) {
	  auto *FirstRD = cast<CXXRecordDecl>(RD->getFirstDecl());
	  return ABI->getCopyConstructorForExceptionObject(FirstRD);
	}

	/// Register \p CD with the C++ ABI object as the exception-object copy
	/// constructor of \p RD; both are passed as their first declarations.
	void ASTContext::addCopyConstructorForExceptionObject(CXXRecordDecl *RD,
	                                                      CXXConstructorDecl *CD) {
	  auto *FirstRD = cast<CXXRecordDecl>(RD->getFirstDecl());
	  auto *FirstCD = cast<CXXConstructorDecl>(CD->getFirstDecl());
	  ABI->addCopyConstructorForExceptionObject(FirstRD, FirstCD);
	}

	/// Forward the (unnamed tag, typedef name) association to the C++ ABI object.
	void ASTContext::addTypedefNameForUnnamedTagDecl(TagDecl *TD,
	                                                 TypedefNameDecl *DD) {
	  ABI->addTypedefNameForUnnamedTagDecl(TD, DD);
	}

	/// Ask the C++ ABI object for the typedef name associated with the unnamed
	/// tag \p TD.
	TypedefNameDecl *
	ASTContext::getTypedefNameForUnnamedTagDecl(const TagDecl *TD) {
	  TypedefNameDecl *Name = ABI->getTypedefNameForUnnamedTagDecl(TD);
	  return Name;
	}

	/// Forward the (unnamed tag, declarator) association to the C++ ABI object.
	void ASTContext::addDeclaratorForUnnamedTagDecl(TagDecl *TD,
	                                                DeclaratorDecl *DD) {
	  ABI->addDeclaratorForUnnamedTagDecl(TD, DD);
	}

	/// Ask the C++ ABI object for the declarator associated with the unnamed tag
	/// \p TD. The pointer declarators follow the sibling
	/// getTypedefNameForUnnamedTagDecl signature.
	DeclaratorDecl *ASTContext::getDeclaratorForUnnamedTagDecl(const TagDecl *TD) {
	  return ABI->getDeclaratorForUnnamedTagDecl(TD);
	}

	/// Store \p index as the parameter index of \p D, replacing any earlier
	/// entry.
	void ASTContext::setParameterIndex(const ParmVarDecl *D, unsigned int index) {
	  auto &Slot = ParamIndices[D];
	  Slot = index;
	}

	/// Return the parameter index stored for \p D. An entry must already exist
	/// (asserted).
	unsigned ASTContext::getParameterIndex(const ParmVarDecl *D) const {
	  auto Entry = ParamIndices.find(D);
	  assert(Entry != ParamIndices.end() &&
	         "ParmIndices lacks entry set by ParmVarDecl");
	  return Entry->second;
	}

	/// Build the constant array type of a string literal.
	///
	/// \param EltTy the element (character) type of the literal.
	/// \param Length the number of characters, excluding the null terminator.
	/// \returns an array of Length + 1 elements of the adjusted element type.
	QualType ASTContext::getStringLiteralArrayType(QualType EltTy,
	                                               unsigned Length) const {
	  // A C++ string literal has a const-qualified element type (C++ 2.13.4p1).
	  if (getLangOpts().CPlusPlus || getLangOpts().ConstStrings)
	    EltTy = EltTy.withConst();

	  EltTy = adjustStringLiteralBaseType(EltTy);

	  // Get an array type for the string, according to C99 6.4.5.  This includes
	  // the null terminator character.
	  return getConstantArrayType(EltTy, llvm::APInt(32, Length + 1), nullptr,
	                              ArrayType::Normal, /*IndexTypeQuals*/ 0);
	}

	/// Return the cached StringLiteral for \p Key, creating and caching an Ascii,
	/// non-Pascal literal of char element type on first request.
	StringLiteral *
	ASTContext::getPredefinedStringLiteralFromCache(StringRef Key) const {
	  // The reference aliases the cache slot, so assigning Result fills the cache.
	  StringLiteral *&Result = StringLiteralCache[Key];
	  if (!Result)
	    Result = StringLiteral::Create(
	        *this, Key, StringLiteral::Ascii,
	        /*Pascal*/ false, getStringLiteralArrayType(CharTy, Key.size()),
	        SourceLocation());
	  return Result;
	}

	/// Return the unique MSGuidDecl for the given GUID parts, creating it and
	/// inserting it into the folding set when no matching node exists yet.
	///
	/// \param Parts the GUID components used both as the profile key and to
	///        construct a new declaration.
	MSGuidDecl *
	ASTContext::getMSGuidDecl(MSGuidDecl::Parts Parts) const {
	  assert(MSGuidTagDecl && "building MS GUID without MS extensions?");

	  llvm::FoldingSetNodeID ID;
	  MSGuidDecl::Profile(ID, Parts);

	  // On a miss, FindNodeOrInsertPos fills InsertPos for the InsertNode call.
	  void *InsertPos;
	  if (MSGuidDecl *Existing = MSGuidDecls.FindNodeOrInsertPos(ID, InsertPos))
	    return Existing;

	  QualType GUIDType = getMSGuidType().withConst();
	  MSGuidDecl *New = MSGuidDecl::Create(*this, GUIDType, Parts);
	  MSGuidDecls.InsertNode(New, InsertPos);
	  return New;
	}

	/// Report whether the atomic expression \p E would need a libcall on a
	/// Darwin OS version covered by the checks below.
	///
	/// Only iOS before version 7 and Mac OS X before 10.9 are considered; on
	/// any other target the result is false. On those targets the result is true
	/// when the pointee type's size differs from its alignment or exceeds the
	/// target's maximum inline atomic width.
	bool ASTContext::AtomicUsesUnsupportedLibcall(const AtomicExpr *E) const {
	  const llvm::Triple &T = getTargetInfo().getTriple();
	  if (!T.isOSDarwin())
	    return false;

	  if (!(T.isiOS() && T.isOSVersionLT(7)) &&
	      !(T.isMacOSX() && T.isOSVersionLT(10, 9)))
	    return false;

	  QualType AtomicTy = E->getPtr()->getType()->getPointeeType();
	  CharUnits sizeChars = getTypeSizeInChars(AtomicTy);
	  uint64_t Size = sizeChars.getQuantity();
	  CharUnits alignChars = getTypeAlignInChars(AtomicTy);
	  unsigned Align = alignChars.getQuantity();
	  unsigned MaxInlineWidthInBits = getTargetInfo().getMaxAtomicInlineWidth();
	  return (Size != Align || toBits(sizeChars) > MaxInlineWidthInBits);
	}

	/// Compare an Objective-C method declaration with an implementation.
	///
	/// The methods are equal when the declaration is neither unavailable nor
	/// deprecated and both agree on declaration qualifier, return type, number
	/// of parameters, each parameter's qualifier and type, and variadic-ness.
	///
	/// \param MethodDecl the declared method.
	/// \param MethodImpl the implementing method.
	bool
	ASTContext::ObjCMethodsAreEqual(const ObjCMethodDecl *MethodDecl,
	                                const ObjCMethodDecl *MethodImpl) {
	  // No point trying to match an unavailable/deprecated method.
	  if (MethodDecl->hasAttr<UnavailableAttr>()
	      || MethodDecl->hasAttr<DeprecatedAttr>())
	    return false;
	  if (MethodDecl->getObjCDeclQualifier() !=
	      MethodImpl->getObjCDeclQualifier())
	    return false;
	  if (!hasSameType(MethodDecl->getReturnType(), MethodImpl->getReturnType()))
	    return false;

	  if (MethodDecl->param_size() != MethodImpl->param_size())
	    return false;

	  // Walk both parameter lists in lock step (IM: implementation, IF:
	  // declaration).
	  for (ObjCMethodDecl::param_const_iterator IM = MethodImpl->param_begin(),
	       IF = MethodDecl->param_begin(), EM = MethodImpl->param_end(),
	       EF = MethodDecl->param_end();
	       IM != EM && IF != EF; ++IM, ++IF) {
	    const ParmVarDecl *DeclVar = (*IF);
	    const ParmVarDecl *ImplVar = (*IM);
	    if (ImplVar->getObjCDeclQualifier() != DeclVar->getObjCDeclQualifier())
	      return false;
	    if (!hasSameType(DeclVar->getType(), ImplVar->getType()))
	      return false;
	  }

	  return (MethodDecl->isVariadic() == MethodImpl->isVariadic());
	}

	/// Return the target's null pointer value for the address space of \p QT:
	/// the default address space for nullptr_t, otherwise the pointee's.
	uint64_t ASTContext::getTargetNullPointerValue(QualType QT) const {
	  const LangAS AddrSpace =
	      QT->getUnqualifiedDesugaredType()->isNullPtrType()
	          ? LangAS::Default
	          : QT->getPointeeType().getAddressSpace();
	  return getTargetInfo().getNullPointerValue(AddrSpace);
	}

	/// Translate a language address space into a target address space number:
	/// target address spaces convert directly, all others go through the
	/// address-space map.
	unsigned ASTContext::getTargetAddressSpace(LangAS AS) const {
	  if (!isTargetAddressSpace(AS))
	    return (*AddrSpaceMap)[(unsigned)AS];
	  return toTargetAddressSpace(AS);
	}

	/// Map a fixed-point type to its saturated counterpart. A type that is
	/// already saturated is returned unchanged.
	QualType ASTContext::getCorrespondingSaturatedType(QualType Ty) const {
	  assert(Ty->isFixedPointType());

	  if (Ty->isSaturatedFixedPointType())
	    return Ty;

	  const auto Kind = Ty->castAs<BuiltinType>()->getKind();
	  switch (Kind) {
	  // Signed accumulator types.
	  case BuiltinType::ShortAccum:
	    return SatShortAccumTy;
	  case BuiltinType::Accum:
	    return SatAccumTy;
	  case BuiltinType::LongAccum:
	    return SatLongAccumTy;
	  // Unsigned accumulator types.
	  case BuiltinType::UShortAccum:
	    return SatUnsignedShortAccumTy;
	  case BuiltinType::UAccum:
	    return SatUnsignedAccumTy;
	  case BuiltinType::ULongAccum:
	    return SatUnsignedLongAccumTy;
	  // Signed fractional types.
	  case BuiltinType::ShortFract:
	    return SatShortFractTy;
	  case BuiltinType::Fract:
	    return SatFractTy;
	  case BuiltinType::LongFract:
	    return SatLongFractTy;
	  // Unsigned fractional types.
	  case BuiltinType::UShortFract:
	    return SatUnsignedShortFractTy;
	  case BuiltinType::UFract:
	    return SatUnsignedFractTy;
	  case BuiltinType::ULongFract:
	    return SatUnsignedLongFractTy;
	  default:
	    llvm_unreachable("Not a fixed point type!");
	  }
	}

	/// Convert a builtin's address space number to a language address space.
	/// OpenCL is checked first, then CUDA; otherwise the number is treated as a
	/// target address space.
	LangAS ASTContext::getLangASForBuiltinAddressSpace(unsigned AS) const {
	  if (LangOpts.OpenCL) {
	    return getTargetInfo().getOpenCLBuiltinAddressSpace(AS);
	  } else if (LangOpts.CUDA) {
	    return getTargetInfo().getCUDABuiltinAddressSpace(AS);
	  }
	  return getLangASFromTargetAS(AS);
	}

	// Explicitly instantiate this in case a Redeclarable<T> is used from a TU that
	// doesn't include ASTContext.h
	template
	clang::LazyGenerationalUpdatePtr<
	    const Decl *, Decl *, &ExternalASTSource::CompleteRedeclChain>::ValueType
	clang::LazyGenerationalUpdatePtr<
	    const Decl *, Decl *, &ExternalASTSource::CompleteRedeclChain>::makeValue(
	        const clang::ASTContext &Ctx, Decl *Value);

	/// Return the scale the target reports for the fixed-point type \p Ty. A
	/// saturated type shares the scale of its unsaturated counterpart.
	unsigned char ASTContext::getFixedPointScale(QualType Ty) const {
	  assert(Ty->isFixedPointType());

	  const TargetInfo &TI = getTargetInfo();
	  const auto Kind = Ty->castAs<BuiltinType>()->getKind();
	  switch (Kind) {
	  // Accumulator types.
	  case BuiltinType::ShortAccum:
	  case BuiltinType::SatShortAccum:
	    return TI.getShortAccumScale();
	  case BuiltinType::Accum:
	  case BuiltinType::SatAccum:
	    return TI.getAccumScale();
	  case BuiltinType::LongAccum:
	  case BuiltinType::SatLongAccum:
	    return TI.getLongAccumScale();
	  case BuiltinType::UShortAccum:
	  case BuiltinType::SatUShortAccum:
	    return TI.getUnsignedShortAccumScale();
	  case BuiltinType::UAccum:
	  case BuiltinType::SatUAccum:
	    return TI.getUnsignedAccumScale();
	  case BuiltinType::ULongAccum:
	  case BuiltinType::SatULongAccum:
	    return TI.getUnsignedLongAccumScale();

	  // Fractional types.
	  case BuiltinType::ShortFract:
	  case BuiltinType::SatShortFract:
	    return TI.getShortFractScale();
	  case BuiltinType::Fract:
	  case BuiltinType::SatFract:
	    return TI.getFractScale();
	  case BuiltinType::LongFract:
	  case BuiltinType::SatLongFract:
	    return TI.getLongFractScale();
	  case BuiltinType::UShortFract:
	  case BuiltinType::SatUShortFract:
	    return TI.getUnsignedShortFractScale();
	  case BuiltinType::UFract:
	  case BuiltinType::SatUFract:
	    return TI.getUnsignedFractScale();
	  case BuiltinType::ULongFract:
	  case BuiltinType::SatULongFract:
	    return TI.getUnsignedLongFractScale();

	  default:
	    llvm_unreachable("Not a fixed point type!");
	  }
	}

	/// Return the number of integral bits of the fixed-point type \p Ty.
	/// Accumulator types use the target's value; every fractional type has 0.
	unsigned char ASTContext::getFixedPointIBits(QualType Ty) const {
	  assert(Ty->isFixedPointType());

	  const TargetInfo &TI = getTargetInfo();
	  const auto Kind = Ty->castAs<BuiltinType>()->getKind();
	  switch (Kind) {
	  // Accumulator types: ask the target.
	  case BuiltinType::ShortAccum:
	  case BuiltinType::SatShortAccum:
	    return TI.getShortAccumIBits();
	  case BuiltinType::Accum:
	  case BuiltinType::SatAccum:
	    return TI.getAccumIBits();
	  case BuiltinType::LongAccum:
	  case BuiltinType::SatLongAccum:
	    return TI.getLongAccumIBits();
	  case BuiltinType::UShortAccum:
	  case BuiltinType::SatUShortAccum:
	    return TI.getUnsignedShortAccumIBits();
	  case BuiltinType::UAccum:
	  case BuiltinType::SatUAccum:
	    return TI.getUnsignedAccumIBits();
	  case BuiltinType::ULongAccum:
	  case BuiltinType::SatULongAccum:
	    return TI.getUnsignedLongAccumIBits();

	  // Fractional types have no integral bits.
	  case BuiltinType::ShortFract:
	  case BuiltinType::SatShortFract:
	  case BuiltinType::Fract:
	  case BuiltinType::SatFract:
	  case BuiltinType::LongFract:
	  case BuiltinType::SatLongFract:
	  case BuiltinType::UShortFract:
	  case BuiltinType::SatUShortFract:
	  case BuiltinType::UFract:
	  case BuiltinType::SatUFract:
	  case BuiltinType::ULongFract:
	  case BuiltinType::SatULongFract:
	    return 0;

	  default:
	    llvm_unreachable("Not a fixed point type!");
	  }
	}

	/// Build the FixedPointSemantics describing \p Ty.
	///
	/// Integer types get integer semantics from their width and signedness.
	/// Fixed-point types get semantics from their size, scale, signedness and
	/// saturation; the final argument is true only for unsigned types on
	/// targets whose unsigned fixed-point types have padding.
	///
	/// \param Ty a fixed-point or integer type (asserted).
	FixedPointSemantics ASTContext::getFixedPointSemantics(QualType Ty) const {
	  assert((Ty->isFixedPointType() || Ty->isIntegerType()) &&
	         "Can only get the fixed point semantics for a "
	         "fixed point or integer type.");
	  if (Ty->isIntegerType())
	    return FixedPointSemantics::GetIntegerSemantics(getIntWidth(Ty),
	                                                    Ty->isSignedIntegerType());

	  bool isSigned = Ty->isSignedFixedPointType();
	  return FixedPointSemantics(
	      static_cast<unsigned>(getTypeSize(Ty)), getFixedPointScale(Ty), isSigned,
	      Ty->isSaturatedFixedPointType(),
	      !isSigned && getTargetInfo().doUnsignedFixedPointTypesHavePadding());
	}

	/// Largest value representable in the fixed-point type \p Ty.
	APFixedPoint ASTContext::getFixedPointMax(QualType Ty) const {
	  assert(Ty->isFixedPointType());
	  const FixedPointSemantics Semantics = getFixedPointSemantics(Ty);
	  return APFixedPoint::getMax(Semantics);
	}

	/// Smallest value representable in the fixed-point type \p Ty.
	APFixedPoint ASTContext::getFixedPointMin(QualType Ty) const {
	  assert(Ty->isFixedPointType());
	  const FixedPointSemantics Semantics = getFixedPointSemantics(Ty);
	  return APFixedPoint::getMin(Semantics);
	}

	/// Map an unsigned fixed-point type to the signed type of the same kind and
	/// saturation.
	QualType ASTContext::getCorrespondingSignedFixedPointType(QualType Ty) const {
	  assert(Ty->isUnsignedFixedPointType() &&
	         "Expected unsigned fixed point type");

	  const auto Kind = Ty->castAs<BuiltinType>()->getKind();
	  switch (Kind) {
	  // Accumulator types.
	  case BuiltinType::UShortAccum:
	    return ShortAccumTy;
	  case BuiltinType::UAccum:
	    return AccumTy;
	  case BuiltinType::ULongAccum:
	    return LongAccumTy;
	  // Saturated accumulator types.
	  case BuiltinType::SatUShortAccum:
	    return SatShortAccumTy;
	  case BuiltinType::SatUAccum:
	    return SatAccumTy;
	  case BuiltinType::SatULongAccum:
	    return SatLongAccumTy;
	  // Fractional types.
	  case BuiltinType::UShortFract:
	    return ShortFractTy;
	  case BuiltinType::UFract:
	    return FractTy;
	  case BuiltinType::ULongFract:
	    return LongFractTy;
	  // Saturated fractional types.
	  case BuiltinType::SatUShortFract:
	    return SatShortFractTy;
	  case BuiltinType::SatUFract:
	    return SatFractTy;
	  case BuiltinType::SatULongFract:
	    return SatLongFractTy;
	  default:
	    llvm_unreachable("Unexpected unsigned fixed point type");
	  }
	}

	/// Parse a target attribute and drop the features whose name is not valid
	/// for the current target.
	ParsedTargetAttr
	ASTContext::filterFunctionTargetAttrs(const TargetAttr *TD) const {
	  assert(TD != nullptr);
	  ParsedTargetAttr ParsedAttr = TD->parse();

	  // The first character of each entry is skipped before the name is
	  // validated against the target.
	  auto IsUnknownFeature = [&](const std::string &Feat) {
	    return !Target->isValidFeatureName(StringRef{Feat}.substr(1));
	  };

	  auto &Features = ParsedAttr.Features;
	  Features.erase(llvm::remove_if(Features, IsUnknownFeature), Features.end());
	  return ParsedAttr;
	}

	/// Fill \p FeatureMap for \p FD. Without a function, the map is initialised
	/// from the target options' CPU and feature list; with one, the work is
	/// delegated to the GlobalDecl overload.
	void ASTContext::getFunctionFeatureMap(llvm::StringMap<bool> &FeatureMap,
	                                       const FunctionDecl *FD) const {
	  if (!FD) {
	    Target->initFeatureMap(FeatureMap, getDiagnostics(),
	                           Target->getTargetOpts().CPU,
	                           Target->getTargetOpts().Features);
	    return;
	  }
	  getFunctionFeatureMap(FeatureMap, GlobalDecl().getWithDecl(FD));
	}

	// Fills in the supplied string map with the set of target features for the
	// passed in function.
	//
	// Three cases are distinguished by the attribute found on the function:
	//  - TargetAttr: command-line features followed by the attribute's filtered
	//    features, with the CPU optionally overridden by the attribute.
	//  - CPUSpecificAttr: the features of the CPU named for this multiversion
	//    index.
	//  - neither: the target options' feature map (see the hunk at the end).
	void ASTContext::getFunctionFeatureMap(llvm::StringMap<bool> &FeatureMap,
	GlobalDecl GD) const {
	StringRef TargetCPU = Target->getTargetOpts().CPU;
	// NOTE(review): FD is dereferenced below without a null check; this assumes
	// GD always wraps a function or function template -- confirm with callers.
	const FunctionDecl *FD = GD.getDecl()->getAsFunction();
	if (const auto *TD = FD->getAttr<TargetAttr>()) {
	ParsedTargetAttr ParsedAttr = filterFunctionTargetAttrs(TD);

	// Make a copy of the features as passed on the command line into the
	// beginning of the additional features from the function to override.
	ParsedAttr.Features.insert(
	ParsedAttr.Features.begin(),
	Target->getTargetOpts().FeaturesAsWritten.begin(),
	Target->getTargetOpts().FeaturesAsWritten.end());

	// Only a non-empty architecture that the target accepts as a CPU name
	// replaces the default CPU.
	if (ParsedAttr.Architecture != "" &&
	Target->isValidCPUName(ParsedAttr.Architecture))
	TargetCPU = ParsedAttr.Architecture;

	// Now populate the feature map, first with the TargetCPU which is either
	// the default or a new one from the target attribute string. Then we'll use
	// the passed in features (FeaturesAsWritten) along with the new ones from
	// the attribute.
	Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU,
	ParsedAttr.Features);
	} else if (const auto *SD = FD->getAttr<CPUSpecificAttr>()) {
	// Copy the StringRef feature names into owning strings before handing them
	// to initFeatureMap.
	llvm::SmallVector<StringRef, 32> FeaturesTmp;
	Target->getCPUSpecificCPUDispatchFeatures(
	SD->getCPUName(GD.getMultiVersionIndex())->getName(), FeaturesTmp);
	std::vector<std::string> Features(FeaturesTmp.begin(), FeaturesTmp.end());
	Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features);
	} else {
	// Diff hunk: the removed lines recomputed the map with initFeatureMap; the
	// added line copies the FeatureMap stored in the target options instead.
	- Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU,
	- Target->getTargetOpts().Features);
	+ FeatureMap = Target->getTargetOpts().FeatureMap;
	}
	}

	/// Allocate a new OMPTraitInfo, append it to OMPTraitInfoVector and return a
	/// reference to it.
	OMPTraitInfo &ASTContext::getNewOMPTraitInfo() {
	  OMPTraitInfoVector.emplace_back(new OMPTraitInfo());
	  OMPTraitInfo &Newest = *OMPTraitInfoVector.back();
	  return Newest;
	}

	/// Stream a SectionInfo into a diagnostic: the declaration when there is one,
	/// otherwise a fixed description of a prior pragma.
	const DiagnosticBuilder &
	clang::operator<<(const DiagnosticBuilder &DB,
	                  const ASTContext::SectionInfo &Section) {
	  if (!Section.Decl)
	    return DB << "a prior #pragma section";
	  return DB << Section.Decl;
	}
	diff --git a/contrib/llvm-project/clang/lib/Basic/Targets.cpp b/contrib/llvm-project/clang/lib/Basic/Targets.cpp
	index 6bbcafa27dfe..818133f66f3f 100644
	--- a/contrib/llvm-project/clang/lib/Basic/Targets.cpp
	+++ b/contrib/llvm-project/clang/lib/Basic/Targets.cpp
	@@ -1,691 +1,696 @@
	//===--- Targets.cpp - Implement target feature support -------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements construction of a TargetInfo object from a
	// target triple.
	//
	//===----------------------------------------------------------------------===//

	#include "Targets.h"

	#include "Targets/AArch64.h"
	#include "Targets/AMDGPU.h"
	#include "Targets/ARC.h"
	#include "Targets/ARM.h"
	#include "Targets/AVR.h"
	#include "Targets/BPF.h"
	#include "Targets/Hexagon.h"
	#include "Targets/Lanai.h"
	#include "Targets/Le64.h"
	#include "Targets/MSP430.h"
	#include "Targets/Mips.h"
	#include "Targets/NVPTX.h"
	#include "Targets/OSTargets.h"
	#include "Targets/PNaCl.h"
	#include "Targets/PPC.h"
	#include "Targets/RISCV.h"
	#include "Targets/SPIR.h"
	#include "Targets/Sparc.h"
	#include "Targets/SystemZ.h"
	#include "Targets/TCE.h"
	#include "Targets/VE.h"
	#include "Targets/WebAssembly.h"
	#include "Targets/X86.h"
	#include "Targets/XCore.h"
	#include "clang/Basic/Diagnostic.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/Triple.h"

	using namespace clang;

	namespace clang {
	namespace targets {
	//===----------------------------------------------------------------------===//
	// Common code shared among targets.
	//===----------------------------------------------------------------------===//

	/// DefineStd - Emit the conventional spellings of a platform macro. Given the
	/// MacroName "unix" this always defines "__unix" and "__unix__", and also the
	/// plain "unix" when GNU mode is enabled.
	void DefineStd(MacroBuilder &Builder, StringRef MacroName,
	const LangOptions &Opts) {
	assert(MacroName[0] != '_' && "Identifier should be in the user's namespace");

	// The bare spelling lives in the user's namespace, so it is only provided in
	// GNU mode (e.g. -std=gnu99 but not -std=c99).
	if (Opts.GNUMode)
	Builder.defineMacro(MacroName);

	// Reserved spellings: "__unix" first, then "__unix__".
	const std::string Prefixed = ("__" + MacroName).str();
	Builder.defineMacro(Prefixed);
	Builder.defineMacro(Prefixed + "__");
	}

	/// Define "__<cpu>" and "__<cpu>__" for CPUName, followed by "__tune_<cpu>__"
	/// when Tuning is set.
	void defineCPUMacros(MacroBuilder &Builder, StringRef CPUName, bool Tuning) {
	const std::string Prefixed = ("__" + CPUName).str();
	Builder.defineMacro(Prefixed);
	Builder.defineMacro(Prefixed + "__");
	if (!Tuning)
	return;
	Builder.defineMacro("__tune_" + CPUName + "__");
	}

	/// Provide the __declspec and calling-convention macros for Cygwin/MinGW.
	void addCygMingDefines(const LangOptions &Opts, MacroBuilder &Builder) {
	// Clang supports __declspec natively under -fms-extensions; a no-op
	// __declspec macro is still defined for pre-processor compatibility, and no
	// calling-convention macros are added in that mode.
	if (Opts.MicrosoftExt) {
	Builder.defineMacro("__declspec", "__declspec");
	return;
	}

	// Mingw and cygwin define __declspec(a) to __attribute__((a)).
	Builder.defineMacro("__declspec(a)", "__attribute__((a))");

	// Map every calling convention keyword, in both its single and double
	// underscore spelling, onto the matching GCC attribute. These are available
	// on x64 as well as x86, even though they have no effect.
	const char *const Conventions[] = {"cdecl", "stdcall", "fastcall", "thiscall",
	"pascal"};
	for (const char *Name : Conventions) {
	const std::string Attr = std::string("__attribute__((__") + Name + "__))";
	Builder.defineMacro(Twine("_") + Name, Attr);
	Builder.defineMacro(Twine("__") + Name, Attr);
	}
	}

	//===----------------------------------------------------------------------===//
	// Driver code
	//===----------------------------------------------------------------------===//

	/// Allocate the TargetInfo subclass that matches \p Triple.
	///
	/// Dispatches on the architecture first and then, where an architecture has
	/// OS-specific variants, on the OS, environment or object format. Returns
	/// nullptr when no target exists for the combination: an architecture not
	/// listed here, aarch64_32 outside Darwin, le32 outside NaCl, SPIR with a
	/// known OS or environment, or a WebAssembly triple with a sub-arch, a known
	/// vendor, a non-wasm object format or an unsupported OS.
	///
	/// The object is created with `new`; the caller takes ownership of it.
	TargetInfo *AllocateTarget(const llvm::Triple &Triple,
	const TargetOptions &Opts) {
	llvm::Triple::OSType os = Triple.getOS();

	switch (Triple.getArch()) {
	default:
	return nullptr;

	case llvm::Triple::arc:
	return new ARCTargetInfo(Triple, Opts);

	case llvm::Triple::xcore:
	return new XCoreTargetInfo(Triple, Opts);

	case llvm::Triple::hexagon:
	if (os == llvm::Triple::Linux &&
	Triple.getEnvironment() == llvm::Triple::Musl)
	return new LinuxTargetInfo<HexagonTargetInfo>(Triple, Opts);
	return new HexagonTargetInfo(Triple, Opts);

	case llvm::Triple::lanai:
	return new LanaiTargetInfo(Triple, Opts);

	case llvm::Triple::aarch64_32:
	if (Triple.isOSDarwin())
	return new DarwinAArch64TargetInfo(Triple, Opts);

	return nullptr;
	case llvm::Triple::aarch64:
	if (Triple.isOSDarwin())
	return new DarwinAArch64TargetInfo(Triple, Opts);

	switch (os) {
	case llvm::Triple::CloudABI:
	return new CloudABITargetInfo<AArch64leTargetInfo>(Triple, Opts);
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<AArch64leTargetInfo>(Triple, Opts);
	case llvm::Triple::Fuchsia:
	return new FuchsiaTargetInfo<AArch64leTargetInfo>(Triple, Opts);
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<AArch64leTargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<AArch64leTargetInfo>(Triple, Opts);
	case llvm::Triple::OpenBSD:
	return new OpenBSDTargetInfo<AArch64leTargetInfo>(Triple, Opts);
	case llvm::Triple::Win32:
	switch (Triple.getEnvironment()) {
	case llvm::Triple::GNU:
	return new MinGWARM64TargetInfo(Triple, Opts);
	case llvm::Triple::MSVC:
	default: // Assume MSVC for unknown environments
	return new MicrosoftARM64TargetInfo(Triple, Opts);
	}
	default:
	return new AArch64leTargetInfo(Triple, Opts);
	}

	case llvm::Triple::aarch64_be:
	switch (os) {
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<AArch64beTargetInfo>(Triple, Opts);
	case llvm::Triple::Fuchsia:
	return new FuchsiaTargetInfo<AArch64beTargetInfo>(Triple, Opts);
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<AArch64beTargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<AArch64beTargetInfo>(Triple, Opts);
	default:
	return new AArch64beTargetInfo(Triple, Opts);
	}

	case llvm::Triple::arm:
	case llvm::Triple::thumb:
	if (Triple.isOSBinFormatMachO())
	return new DarwinARMTargetInfo(Triple, Opts);

	switch (os) {
	case llvm::Triple::CloudABI:
	return new CloudABITargetInfo<ARMleTargetInfo>(Triple, Opts);
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<ARMleTargetInfo>(Triple, Opts);
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<ARMleTargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<ARMleTargetInfo>(Triple, Opts);
	case llvm::Triple::OpenBSD:
	return new OpenBSDTargetInfo<ARMleTargetInfo>(Triple, Opts);
	case llvm::Triple::RTEMS:
	return new RTEMSTargetInfo<ARMleTargetInfo>(Triple, Opts);
	case llvm::Triple::NaCl:
	return new NaClTargetInfo<ARMleTargetInfo>(Triple, Opts);
	case llvm::Triple::Win32:
	switch (Triple.getEnvironment()) {
	case llvm::Triple::Cygnus:
	return new CygwinARMTargetInfo(Triple, Opts);
	case llvm::Triple::GNU:
	return new MinGWARMTargetInfo(Triple, Opts);
	case llvm::Triple::Itanium:
	return new ItaniumWindowsARMleTargetInfo(Triple, Opts);
	case llvm::Triple::MSVC:
	default: // Assume MSVC for unknown environments
	return new MicrosoftARMleTargetInfo(Triple, Opts);
	}
	default:
	return new ARMleTargetInfo(Triple, Opts);
	}

	case llvm::Triple::armeb:
	case llvm::Triple::thumbeb:
	if (Triple.isOSDarwin())
	return new DarwinARMTargetInfo(Triple, Opts);

	switch (os) {
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<ARMbeTargetInfo>(Triple, Opts);
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<ARMbeTargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<ARMbeTargetInfo>(Triple, Opts);
	case llvm::Triple::OpenBSD:
	return new OpenBSDTargetInfo<ARMbeTargetInfo>(Triple, Opts);
	case llvm::Triple::RTEMS:
	return new RTEMSTargetInfo<ARMbeTargetInfo>(Triple, Opts);
	case llvm::Triple::NaCl:
	return new NaClTargetInfo<ARMbeTargetInfo>(Triple, Opts);
	default:
	return new ARMbeTargetInfo(Triple, Opts);
	}

	case llvm::Triple::avr:
	return new AVRTargetInfo(Triple, Opts);
	case llvm::Triple::bpfeb:
	case llvm::Triple::bpfel:
	return new BPFTargetInfo(Triple, Opts);

	case llvm::Triple::msp430:
	return new MSP430TargetInfo(Triple, Opts);

	case llvm::Triple::mips:
	switch (os) {
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::RTEMS:
	return new RTEMSTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	default:
	return new MipsTargetInfo(Triple, Opts);
	}

	case llvm::Triple::mipsel:
	switch (os) {
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::RTEMS:
	return new RTEMSTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::NaCl:
	return new NaClTargetInfo<NaClMips32TargetInfo>(Triple, Opts);
	default:
	return new MipsTargetInfo(Triple, Opts);
	}

	case llvm::Triple::mips64:
	switch (os) {
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::RTEMS:
	return new RTEMSTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::OpenBSD:
	return new OpenBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	default:
	return new MipsTargetInfo(Triple, Opts);
	}

	case llvm::Triple::mips64el:
	switch (os) {
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::RTEMS:
	return new RTEMSTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	case llvm::Triple::OpenBSD:
	return new OpenBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	default:
	return new MipsTargetInfo(Triple, Opts);
	}

	case llvm::Triple::le32:
	switch (os) {
	case llvm::Triple::NaCl:
	return new NaClTargetInfo<PNaClTargetInfo>(Triple, Opts);
	default:
	return nullptr;
	}

	case llvm::Triple::le64:
	return new Le64TargetInfo(Triple, Opts);

	case llvm::Triple::ppc:
	if (Triple.isOSDarwin())
	return new DarwinPPC32TargetInfo(Triple, Opts);
	switch (os) {
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<PPC32TargetInfo>(Triple, Opts);
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<PPC32TargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<PPC32TargetInfo>(Triple, Opts);
	case llvm::Triple::OpenBSD:
	return new OpenBSDTargetInfo<PPC32TargetInfo>(Triple, Opts);
	case llvm::Triple::RTEMS:
	return new RTEMSTargetInfo<PPC32TargetInfo>(Triple, Opts);
	case llvm::Triple::AIX:
	return new AIXPPC32TargetInfo(Triple, Opts);
	default:
	return new PPC32TargetInfo(Triple, Opts);
	}

	case llvm::Triple::ppc64:
	if (Triple.isOSDarwin())
	return new DarwinPPC64TargetInfo(Triple, Opts);
	switch (os) {
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<PPC64TargetInfo>(Triple, Opts);
	case llvm::Triple::Lv2:
	return new PS3PPUTargetInfo<PPC64TargetInfo>(Triple, Opts);
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<PPC64TargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<PPC64TargetInfo>(Triple, Opts);
	+ case llvm::Triple::OpenBSD:
	+ return new OpenBSDTargetInfo<PPC64TargetInfo>(Triple, Opts);
	case llvm::Triple::AIX:
	return new AIXPPC64TargetInfo(Triple, Opts);
	default:
	return new PPC64TargetInfo(Triple, Opts);
	}

	case llvm::Triple::ppc64le:
	switch (os) {
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<PPC64TargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<PPC64TargetInfo>(Triple, Opts);
	+ case llvm::Triple::OpenBSD:
	+ return new OpenBSDTargetInfo<PPC64TargetInfo>(Triple, Opts);
	default:
	return new PPC64TargetInfo(Triple, Opts);
	}

	case llvm::Triple::nvptx:
	return new NVPTXTargetInfo(Triple, Opts, /*TargetPointerWidth=*/32);
	case llvm::Triple::nvptx64:
	return new NVPTXTargetInfo(Triple, Opts, /*TargetPointerWidth=*/64);

	case llvm::Triple::amdgcn:
	case llvm::Triple::r600:
	return new AMDGPUTargetInfo(Triple, Opts);

	case llvm::Triple::riscv32:
	// TODO: add cases for NetBSD, RTEMS once tested.
	switch (os) {
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<RISCV32TargetInfo>(Triple, Opts);
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<RISCV32TargetInfo>(Triple, Opts);
	default:
	return new RISCV32TargetInfo(Triple, Opts);
	}

	case llvm::Triple::riscv64:
	// TODO: add cases for NetBSD, RTEMS once tested.
	switch (os) {
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<RISCV64TargetInfo>(Triple, Opts);
	+ case llvm::Triple::OpenBSD:
	+ return new OpenBSDTargetInfo<RISCV64TargetInfo>(Triple, Opts);
	case llvm::Triple::Fuchsia:
	return new FuchsiaTargetInfo<RISCV64TargetInfo>(Triple, Opts);
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<RISCV64TargetInfo>(Triple, Opts);
	default:
	return new RISCV64TargetInfo(Triple, Opts);
	}

	case llvm::Triple::sparc:
	switch (os) {
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<SparcV8TargetInfo>(Triple, Opts);
	case llvm::Triple::Solaris:
	return new SolarisTargetInfo<SparcV8TargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<SparcV8TargetInfo>(Triple, Opts);
	case llvm::Triple::OpenBSD:
	return new OpenBSDTargetInfo<SparcV8TargetInfo>(Triple, Opts);
	case llvm::Triple::RTEMS:
	return new RTEMSTargetInfo<SparcV8TargetInfo>(Triple, Opts);
	default:
	return new SparcV8TargetInfo(Triple, Opts);
	}

	// The 'sparcel' architecture copies all the above cases except for Solaris.
	case llvm::Triple::sparcel:
	switch (os) {
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<SparcV8elTargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<SparcV8elTargetInfo>(Triple, Opts);
	case llvm::Triple::OpenBSD:
	return new OpenBSDTargetInfo<SparcV8elTargetInfo>(Triple, Opts);
	case llvm::Triple::RTEMS:
	return new RTEMSTargetInfo<SparcV8elTargetInfo>(Triple, Opts);
	default:
	return new SparcV8elTargetInfo(Triple, Opts);
	}

	case llvm::Triple::sparcv9:
	switch (os) {
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<SparcV9TargetInfo>(Triple, Opts);
	case llvm::Triple::Solaris:
	return new SolarisTargetInfo<SparcV9TargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<SparcV9TargetInfo>(Triple, Opts);
	case llvm::Triple::OpenBSD:
	return new OpenBSDTargetInfo<SparcV9TargetInfo>(Triple, Opts);
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<SparcV9TargetInfo>(Triple, Opts);
	default:
	return new SparcV9TargetInfo(Triple, Opts);
	}

	case llvm::Triple::systemz:
	switch (os) {
	case llvm::Triple::Linux:
	return new LinuxTargetInfo<SystemZTargetInfo>(Triple, Opts);
	default:
	return new SystemZTargetInfo(Triple, Opts);
	}

	case llvm::Triple::tce:
	return new TCETargetInfo(Triple, Opts);

	case llvm::Triple::tcele:
	return new TCELETargetInfo(Triple, Opts);

	case llvm::Triple::x86:
	if (Triple.isOSDarwin())
	return new DarwinI386TargetInfo(Triple, Opts);

	switch (os) {
	case llvm::Triple::Ananas:
	return new AnanasTargetInfo<X86_32TargetInfo>(Triple, Opts);
	case llvm::Triple::CloudABI:
	return new CloudABITargetInfo<X86_32TargetInfo>(Triple, Opts);
	case llvm::Triple::Linux: {
	switch (Triple.getEnvironment()) {
	default:
	return new LinuxTargetInfo<X86_32TargetInfo>(Triple, Opts);
	case llvm::Triple::Android:
	return new AndroidX86_32TargetInfo(Triple, Opts);
	}
	}
	case llvm::Triple::DragonFly:
	return new DragonFlyBSDTargetInfo<X86_32TargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDI386TargetInfo(Triple, Opts);
	case llvm::Triple::OpenBSD:
	return new OpenBSDI386TargetInfo(Triple, Opts);
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<X86_32TargetInfo>(Triple, Opts);
	case llvm::Triple::Fuchsia:
	return new FuchsiaTargetInfo<X86_32TargetInfo>(Triple, Opts);
	case llvm::Triple::KFreeBSD:
	return new KFreeBSDTargetInfo<X86_32TargetInfo>(Triple, Opts);
	case llvm::Triple::Minix:
	return new MinixTargetInfo<X86_32TargetInfo>(Triple, Opts);
	case llvm::Triple::Solaris:
	return new SolarisTargetInfo<X86_32TargetInfo>(Triple, Opts);
	case llvm::Triple::Win32: {
	switch (Triple.getEnvironment()) {
	case llvm::Triple::Cygnus:
	return new CygwinX86_32TargetInfo(Triple, Opts);
	case llvm::Triple::GNU:
	return new MinGWX86_32TargetInfo(Triple, Opts);
	case llvm::Triple::Itanium:
	case llvm::Triple::MSVC:
	default: // Assume MSVC for unknown environments
	return new MicrosoftX86_32TargetInfo(Triple, Opts);
	}
	}
	case llvm::Triple::Haiku:
	return new HaikuX86_32TargetInfo(Triple, Opts);
	case llvm::Triple::RTEMS:
	return new RTEMSX86_32TargetInfo(Triple, Opts);
	case llvm::Triple::NaCl:
	return new NaClTargetInfo<X86_32TargetInfo>(Triple, Opts);
	case llvm::Triple::ELFIAMCU:
	return new MCUX86_32TargetInfo(Triple, Opts);
	case llvm::Triple::Hurd:
	return new HurdTargetInfo<X86_32TargetInfo>(Triple, Opts);
	default:
	return new X86_32TargetInfo(Triple, Opts);
	}

	case llvm::Triple::x86_64:
	if (Triple.isOSDarwin() || Triple.isOSBinFormatMachO())
	return new DarwinX86_64TargetInfo(Triple, Opts);

	switch (os) {
	case llvm::Triple::Ananas:
	return new AnanasTargetInfo<X86_64TargetInfo>(Triple, Opts);
	case llvm::Triple::CloudABI:
	return new CloudABITargetInfo<X86_64TargetInfo>(Triple, Opts);
	case llvm::Triple::Linux: {
	switch (Triple.getEnvironment()) {
	default:
	return new LinuxTargetInfo<X86_64TargetInfo>(Triple, Opts);
	case llvm::Triple::Android:
	return new AndroidX86_64TargetInfo(Triple, Opts);
	}
	}
	case llvm::Triple::DragonFly:
	return new DragonFlyBSDTargetInfo<X86_64TargetInfo>(Triple, Opts);
	case llvm::Triple::NetBSD:
	return new NetBSDTargetInfo<X86_64TargetInfo>(Triple, Opts);
	case llvm::Triple::OpenBSD:
	return new OpenBSDX86_64TargetInfo(Triple, Opts);
	case llvm::Triple::FreeBSD:
	return new FreeBSDTargetInfo<X86_64TargetInfo>(Triple, Opts);
	case llvm::Triple::Fuchsia:
	return new FuchsiaTargetInfo<X86_64TargetInfo>(Triple, Opts);
	case llvm::Triple::KFreeBSD:
	return new KFreeBSDTargetInfo<X86_64TargetInfo>(Triple, Opts);
	case llvm::Triple::Solaris:
	return new SolarisTargetInfo<X86_64TargetInfo>(Triple, Opts);
	case llvm::Triple::Win32: {
	switch (Triple.getEnvironment()) {
	case llvm::Triple::Cygnus:
	return new CygwinX86_64TargetInfo(Triple, Opts);
	case llvm::Triple::GNU:
	return new MinGWX86_64TargetInfo(Triple, Opts);
	case llvm::Triple::MSVC:
	default: // Assume MSVC for unknown environments
	return new MicrosoftX86_64TargetInfo(Triple, Opts);
	}
	}
	case llvm::Triple::Haiku:
	return new HaikuTargetInfo<X86_64TargetInfo>(Triple, Opts);
	case llvm::Triple::NaCl:
	return new NaClTargetInfo<X86_64TargetInfo>(Triple, Opts);
	case llvm::Triple::PS4:
	return new PS4OSTargetInfo<X86_64TargetInfo>(Triple, Opts);
	default:
	return new X86_64TargetInfo(Triple, Opts);
	}

	case llvm::Triple::spir: {
	// SPIR is only accepted with no OS and no environment in the triple.
	if (Triple.getOS() != llvm::Triple::UnknownOS ||
	Triple.getEnvironment() != llvm::Triple::UnknownEnvironment)
	return nullptr;
	return new SPIR32TargetInfo(Triple, Opts);
	}
	case llvm::Triple::spir64: {
	if (Triple.getOS() != llvm::Triple::UnknownOS ||
	Triple.getEnvironment() != llvm::Triple::UnknownEnvironment)
	return nullptr;
	return new SPIR64TargetInfo(Triple, Opts);
	}
	case llvm::Triple::wasm32:
	// WebAssembly requires no sub-arch, an unknown vendor and the wasm object
	// format; anything else is rejected.
	if (Triple.getSubArch() != llvm::Triple::NoSubArch ||
	Triple.getVendor() != llvm::Triple::UnknownVendor ||
	!Triple.isOSBinFormatWasm())
	return nullptr;
	switch (Triple.getOS()) {
	case llvm::Triple::WASI:
	return new WASITargetInfo<WebAssembly32TargetInfo>(Triple, Opts);
	case llvm::Triple::Emscripten:
	return new EmscriptenTargetInfo<WebAssembly32TargetInfo>(Triple, Opts);
	case llvm::Triple::UnknownOS:
	return new WebAssemblyOSTargetInfo<WebAssembly32TargetInfo>(Triple, Opts);
	default:
	return nullptr;
	}
	case llvm::Triple::wasm64:
	if (Triple.getSubArch() != llvm::Triple::NoSubArch ||
	Triple.getVendor() != llvm::Triple::UnknownVendor ||
	!Triple.isOSBinFormatWasm())
	return nullptr;
	switch (Triple.getOS()) {
	case llvm::Triple::WASI:
	return new WASITargetInfo<WebAssembly64TargetInfo>(Triple, Opts);
	case llvm::Triple::Emscripten:
	return new EmscriptenTargetInfo<WebAssembly64TargetInfo>(Triple, Opts);
	case llvm::Triple::UnknownOS:
	return new WebAssemblyOSTargetInfo<WebAssembly64TargetInfo>(Triple, Opts);
	default:
	return nullptr;
	}

	case llvm::Triple::renderscript32:
	return new LinuxTargetInfo<RenderScript32TargetInfo>(Triple, Opts);
	case llvm::Triple::renderscript64:
	return new LinuxTargetInfo<RenderScript64TargetInfo>(Triple, Opts);

	case llvm::Triple::ve:
	return new LinuxTargetInfo<VETargetInfo>(Triple, Opts);
	}
	}
	} // namespace targets
	} // namespace clang

	using namespace clang::targets;
	/// CreateTargetInfo - Return the target info object for the specified target
	/// options.
	///
	/// Allocates a target for Opts->Triple via AllocateTarget, applies the CPU,
	/// ABI and FP-math selections from \p Opts, computes the target features and
	/// validates the result. Returns nullptr as soon as any step fails; an
	/// unknown triple, CPU, ABI or fpmath value is reported through \p Diags
	/// here. On success the caller receives ownership of the returned object.
	TargetInfo *
	TargetInfo::CreateTargetInfo(DiagnosticsEngine &Diags,
	const std::shared_ptr<TargetOptions> &Opts) {
	llvm::Triple Triple(Opts->Triple);

	// Construct the target
	// The unique_ptr frees the target on every early-return path below.
	std::unique_ptr<TargetInfo> Target(AllocateTarget(Triple, *Opts));
	if (!Target) {
	Diags.Report(diag::err_target_unknown_triple) << Triple.str();
	return nullptr;
	}
	// The target keeps its own reference to the shared options object.
	Target->TargetOpts = Opts;

	// Set the target CPU if specified.
	if (!Opts->CPU.empty() && !Target->setCPU(Opts->CPU)) {
	Diags.Report(diag::err_target_unknown_cpu) << Opts->CPU;
	// Follow the error with a note listing the CPU names the target accepts,
	// when it provides any.
	SmallVector<StringRef, 32> ValidList;
	Target->fillValidCPUList(ValidList);
	if (!ValidList.empty())
	Diags.Report(diag::note_valid_options) << llvm::join(ValidList, ", ");
	return nullptr;
	}

	// Set the target ABI if specified.
	if (!Opts->ABI.empty() && !Target->setABI(Opts->ABI)) {
	Diags.Report(diag::err_target_unknown_abi) << Opts->ABI;
	return nullptr;
	}

	// Set the fp math unit.
	if (!Opts->FPMath.empty() && !Target->setFPMath(Opts->FPMath)) {
	Diags.Report(diag::err_target_unknown_fpmath) << Opts->FPMath;
	return nullptr;
	}

	// Compute the default target features, we need the target to handle this
	// because features may have dependencies on one another.
	- llvm::StringMap<bool> Features;
	- if (!Target->initFeatureMap(Features, Diags, Opts->CPU,
	+ if (!Target->initFeatureMap(Opts->FeatureMap, Diags, Opts->CPU,
	Opts->FeaturesAsWritten))
	return nullptr;

	// Add the features to the compile options.
	// Each map entry becomes "+name" when enabled and "-name" when disabled.
	Opts->Features.clear();
	- for (const auto &F : Features)
	+ for (const auto &F : Opts->FeatureMap)
	Opts->Features.push_back((F.getValue() ? "+" : "-") + F.getKey().str());
	// Sort here, so we handle the features in a predictable order. (This matters
	// when we're dealing with features that overlap.)
	llvm::sort(Opts->Features);

	if (!Target->handleTargetFeatures(Opts->Features, Diags))
	return nullptr;

	Target->setSupportedOpenCLOpts();
	Target->setOpenCLExtensionOpts();
	Target->setMaxAtomicWidth();

	if (!Target->validateTarget(Diags))
	return nullptr;

	Target->CheckFixedPointBits();

	// Every step succeeded; give up ownership and hand the raw pointer back.
	return Target.release();
	}
	diff --git a/contrib/llvm-project/clang/lib/Basic/Targets/OSTargets.h b/contrib/llvm-project/clang/lib/Basic/Targets/OSTargets.h
	index cfa362bef1b1..2a9e4f91d478 100644
	--- a/contrib/llvm-project/clang/lib/Basic/Targets/OSTargets.h
	+++ b/contrib/llvm-project/clang/lib/Basic/Targets/OSTargets.h
	@@ -1,877 +1,882 @@
	//===--- OSTargets.h - Declare OS target feature support --------*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file declares OS specific TargetInfo types.
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CLANG_LIB_BASIC_TARGETS_OSTARGETS_H
	#define LLVM_CLANG_LIB_BASIC_TARGETS_OSTARGETS_H

	#include "Targets.h"
	#include "llvm/MC/MCSectionMachO.h"

	namespace clang {
	namespace targets {

	/// Layer that adds operating-system macros on top of an architecture
	/// TargetInfo. TgtInfo is the architecture class being extended; concrete OS
	/// targets derive from this template and implement getOSDefines.
	template <typename TgtInfo>
	class LLVM_LIBRARY_VISIBILITY OSTargetInfo : public TgtInfo {
	protected:
	/// Hook through which a concrete OS target contributes its predefined
	/// macros to \p Builder.
	virtual void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const = 0;

	public:
	OSTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: TgtInfo(Triple, Opts) {}

	/// Emit the architecture's macros first, then the OS-specific ones.
	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	this->TgtInfo::getTargetDefines(Opts, Builder);
	const llvm::Triple &OwnTriple = this->TgtInfo::getTriple();
	this->getOSDefines(Opts, OwnTriple, Builder);
	}
	};

	// CloudABI Target
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY CloudABITargetInfo : public OSTargetInfo<Target> {
	public:
	CloudABITargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {}

	protected:
	/// Predefine the CloudABI platform macros.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	Builder.defineMacro("__CloudABI__");
	Builder.defineMacro("__ELF__");

	// wchar_t, char16_t and char32_t follow ISO/IEC 10646:2012 on CloudABI.
	Builder.defineMacro("__STDC_ISO_10646__", "201206L");
	Builder.defineMacro("__STDC_UTF_16__");
	Builder.defineMacro("__STDC_UTF_32__");
	}
	};

	// Ananas target
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY AnanasTargetInfo : public OSTargetInfo<Target> {
	public:
	AnanasTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {}

	protected:
	/// Predefine the Ananas platform macros.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	Builder.defineMacro("__Ananas__");
	Builder.defineMacro("__ELF__");
	}
	};

	/// Darwin macro setup shared by the Darwin targets; only declared here, the
	/// definition is not part of this header. DarwinTargetInfo::getOSDefines
	/// forwards to it, passing its PlatformName and PlatformMinVersion members,
	/// which this function receives by non-const reference.
	void getDarwinDefines(MacroBuilder &Builder, const LangOptions &Opts,
	const llvm::Triple &Triple, StringRef &PlatformName,
	VersionTuple &PlatformMinVersion);

	/// OS layer for Darwin platforms: delegates macro setup to getDarwinDefines
	/// and supplies the Darwin answers for TLS support, section specifiers,
	/// visibility, exception object alignment and least-width integer types.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY DarwinTargetInfo : public OSTargetInfo<Target> {
	protected:
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	getDarwinDefines(Builder, Opts, Triple, this->PlatformName,
	this->PlatformMinVersion);
	}

	public:
	DarwinTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {
	// TLS is off unless the architecture/OS combination is one of the
	// permitted ones listed below.
	bool HasTLS = false;

	if (Triple.isMacOSX()) {
	HasTLS = !Triple.isMacOSXVersionLT(10, 7);
	} else if (Triple.isiOS()) {
	// 64-bit iOS supported it from 8 onwards, 32-bit device from 9 onwards,
	// 32-bit simulator from 10 onwards.
	if (Triple.isArch64Bit())
	HasTLS = !Triple.isOSVersionLT(8);
	else if (Triple.isArch32Bit())
	HasTLS = !Triple.isOSVersionLT(Triple.isSimulatorEnvironment() ? 10 : 9);
	} else if (Triple.isWatchOS()) {
	// Devices from version 2 onwards, the simulator from version 3 onwards.
	HasTLS = !Triple.isOSVersionLT(Triple.isSimulatorEnvironment() ? 3 : 2);
	}

	this->TLSSupported = HasTLS;
	this->MCountName = "\01mcount";
	}

	std::string isValidSectionSpecifier(StringRef SR) const override {
	// MCSectionMachO does the validation; the parsed pieces are discarded and
	// only its result is returned.
	StringRef SegmentName, SectionName;
	unsigned TypeAndAttrs, StubBytes;
	bool TypeAndAttrsGiven;
	return llvm::MCSectionMachO::ParseSectionSpecifier(
	SR, SegmentName, SectionName, TypeAndAttrs, TypeAndAttrsGiven, StubBytes);
	}

	const char *getStaticInitSectionSpecifier() const override {
	// FIXME: We should return 0 when building kexts.
	return "__TEXT,__StaticInit,regular,pure_instructions";
	}

	/// Darwin does not support protected visibility. Darwin's "default"
	/// is very similar to ELF's "protected"; Darwin requires a "weak"
	/// attribute on declarations that can be dynamically replaced.
	bool hasProtectedVisibility() const override { return false; }

	unsigned getExnObjectAlignment() const override {
	// Older versions of libc++abi guarantee an alignment of only 8-bytes for
	// exception objects because of a bug in __cxa_exception that was
	// eventually fixed in r319123.
	const llvm::Triple &T = this->getTriple();

	// Work out the first OS release that carries the libc++abi fix.
	llvm::VersionTuple FirstFixed;
	switch (T.getOS()) {
	case llvm::Triple::Darwin:
	case llvm::Triple::MacOSX: // Earliest supporting version is 10.14.
	FirstFixed = llvm::VersionTuple(10U, 14U);
	break;
	case llvm::Triple::IOS:
	case llvm::Triple::TvOS: // Earliest supporting version is 12.0.0.
	FirstFixed = llvm::VersionTuple(12U);
	break;
	case llvm::Triple::WatchOS: // Earliest supporting version is 5.0.0.
	FirstFixed = llvm::VersionTuple(5U);
	break;
	default:
	llvm_unreachable("Unexpected OS");
	}

	unsigned Major, Minor, Micro;
	T.getOSVersion(Major, Minor, Micro);
	const llvm::VersionTuple Deployed(Major, Minor, Micro);
	// Releases older than the fix get the fixed value 64; everything else
	// uses the base class answer.
	if (Deployed < FirstFixed)
	return 64;
	return OSTargetInfo<Target>::getExnObjectAlignment();
	}

	TargetInfo::IntType getLeastIntTypeByWidth(unsigned BitWidth,
	bool IsSigned) const final {
	// Only the 64-bit width is special; other widths use the TargetInfo answer.
	if (BitWidth != 64)
	return TargetInfo::getLeastIntTypeByWidth(BitWidth, IsSigned);
	// Darwin uses `long long` for `int_least64_t` and `int_fast64_t`.
	if (IsSigned)
	return TargetInfo::SignedLongLong;
	return TargetInfo::UnsignedLongLong;
	}
	};

	// DragonFlyBSD Target
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY DragonFlyBSDTargetInfo
	: public OSTargetInfo<Target> {
	protected:
	/// Predefine the DragonFly macros; the list is based off of gcc output.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	Builder.defineMacro("__DragonFly__");
	Builder.defineMacro("__DragonFly_cc_version", "100001");
	Builder.defineMacro("__ELF__");
	Builder.defineMacro("__KPRINTF_ATTRIBUTE__");
	Builder.defineMacro("__tune_i386__");
	DefineStd(Builder, "unix", Opts);
	}

	public:
	DragonFlyBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {
	// x86, x86_64 and every other architecture all use the same mcount symbol.
	this->MCountName = ".mcount";
	}
	};

	// FREEBSD_CC_VERSION may be predefined by the build; 0U means "not set" and
	// makes FreeBSDTargetInfo derive the value from the OS release instead.
	#ifndef FREEBSD_CC_VERSION
	#define FREEBSD_CC_VERSION 0U
	#endif

	// FreeBSD Target
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY FreeBSDTargetInfo : public OSTargetInfo<Target> {
	protected:
	/// Predefine the FreeBSD macros; the list is based off of gcc output.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	// A triple without a major OS version is treated as release 8.
	const unsigned TripleMajor = Triple.getOSMajorVersion();
	const unsigned Release = TripleMajor != 0U ? TripleMajor : 8U;

	// Use the configured compiler version when there is one; otherwise
	// synthesize it from the release number.
	const unsigned Configured = FREEBSD_CC_VERSION;
	const unsigned CCVersion =
	Configured != 0U ? Configured : Release * 100000U + 1U;

	Builder.defineMacro("__FreeBSD__", Twine(Release));
	Builder.defineMacro("__FreeBSD_cc_version", Twine(CCVersion));
	Builder.defineMacro("__KPRINTF_ATTRIBUTE__");
	DefineStd(Builder, "unix", Opts);
	Builder.defineMacro("__ELF__");

	// On FreeBSD, wchar_t contains the number of the code point as
	// used by the character set of the locale. These character sets are
	// not necessarily a superset of ASCII.
	//
	// FIXME: This is wrong; the macro refers to the numerical values
	// of wchar_t literals, which are not locale-dependent. However,
	// FreeBSD systems apparently depend on us getting this wrong, and
	// setting this to 1 is conforming even if all the basic source
	// character literals have the same encoding as char and wchar_t.
	Builder.defineMacro("__STDC_MB_MIGHT_NEQ_WC__", "1");
	}

	public:
	FreeBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {
	// Pick the profiling hook name for the architecture.
	switch (Triple.getArch()) {
	case llvm::Triple::arm:
	this->MCountName = "__mcount";
	break;
	case llvm::Triple::mips:
	case llvm::Triple::mipsel:
	case llvm::Triple::ppc:
	case llvm::Triple::ppc64:
	case llvm::Triple::ppc64le:
	this->MCountName = "_mcount";
	break;
	case llvm::Triple::x86:
	case llvm::Triple::x86_64:
	default:
	this->MCountName = ".mcount";
	break;
	}
	}
	};

	// GNU/kFreeBSD Target
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY KFreeBSDTargetInfo : public OSTargetInfo<Target> {
	public:
	KFreeBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {}

	protected:
	/// Predefine the GNU/kFreeBSD macros; the list is based off of gcc output.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	DefineStd(Builder, "unix", Opts);
	Builder.defineMacro("__FreeBSD_kernel__");
	Builder.defineMacro("__GLIBC__");
	Builder.defineMacro("__ELF__");
	// _REENTRANT is tied to POSIX threads, _GNU_SOURCE to C++ mode.
	if (Opts.POSIXThreads) {
	Builder.defineMacro("_REENTRANT");
	}
	if (Opts.CPlusPlus) {
	Builder.defineMacro("_GNU_SOURCE");
	}
	}
	};

	// Haiku Target
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY HaikuTargetInfo : public OSTargetInfo<Target> {
	protected:
	/// Predefine the Haiku macros; the list is based off of gcc output.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	Builder.defineMacro("__HAIKU__");
	Builder.defineMacro("__ELF__");
	DefineStd(Builder, "unix", Opts);
	// Advertise __float128 only when the constructor enabled it.
	if (!this->HasFloat128)
	return;
	Builder.defineMacro("__FLOAT128__");
	}

	public:
	HaikuTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {
	this->SizeType = TargetInfo::UnsignedLong;
	this->IntPtrType = TargetInfo::SignedLong;
	this->PtrDiffType = TargetInfo::SignedLong;
	this->ProcessIDType = TargetInfo::SignedLong;
	this->TLSSupported = false;

	// __float128 is switched on for the x86 family only; other architectures
	// keep whatever the base target set.
	const llvm::Triple::ArchType Arch = Triple.getArch();
	if (Arch == llvm::Triple::x86 || Arch == llvm::Triple::x86_64)
	this->HasFloat128 = true;
	}
	};

	// Hurd target
	// OS layer that adds the GNU/Hurd predefined macros on top of the
	// architecture-specific TargetInfo supplied as `Target`.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY HurdTargetInfo : public OSTargetInfo<Target> {
	protected:
	// Emits the OS-level predefined macros into Builder. Triple is not
	// consulted by this implementation.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	// Hurd defines; list based off of gcc output.
	DefineStd(Builder, "unix", Opts);
	Builder.defineMacro("__GNU__");
	Builder.defineMacro("__gnu_hurd__");
	Builder.defineMacro("__MACH__");
	Builder.defineMacro("__GLIBC__");
	Builder.defineMacro("__ELF__");
	// _REENTRANT is only defined when POSIX threads are enabled.
	if (Opts.POSIXThreads)
	Builder.defineMacro("_REENTRANT");
	// _GNU_SOURCE is only predefined when compiling C++.
	if (Opts.CPlusPlus)
	Builder.defineMacro("_GNU_SOURCE");
	}
	public:
	// Forwards to OSTargetInfo; no target properties are changed here.
	HurdTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {}
	};

	// Minix Target
	// OS layer that adds the Minix predefined macros on top of the
	// architecture-specific TargetInfo supplied as `Target`.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY MinixTargetInfo : public OSTargetInfo<Target> {
	protected:
	// Emits the OS-level predefined macros into Builder. All macros are
	// unconditional; Triple is not consulted.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	// Minix defines

	Builder.defineMacro("__minix", "3");
	// NOTE(review): the _EM_*SIZE values look like byte sizes of basic types
	// (word, pointer, short, long, float, double) -- confirm against the
	// Minix headers.
	Builder.defineMacro("_EM_WSIZE", "4");
	Builder.defineMacro("_EM_PSIZE", "4");
	Builder.defineMacro("_EM_SSIZE", "2");
	Builder.defineMacro("_EM_LSIZE", "4");
	Builder.defineMacro("_EM_FSIZE", "4");
	Builder.defineMacro("_EM_DSIZE", "8");
	Builder.defineMacro("__ELF__");
	DefineStd(Builder, "unix", Opts);
	}

	public:
	// Forwards to OSTargetInfo; no target properties are changed here.
	MinixTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {}
	};

	// Linux target
	// OS layer for Linux (including Android), stacked on the architecture
	// TargetInfo `Target`.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY LinuxTargetInfo : public OSTargetInfo<Target> {
	protected:
	  // Emits the Linux predefined macros (list based off of gcc output). For
	  // Android triples it also records the platform name and minimum version
	  // taken from the triple's environment version.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    DefineStd(Builder, "unix", Opts);
	    DefineStd(Builder, "linux", Opts);
	    Builder.defineMacro("__ELF__");

	    if (!Triple.isAndroid()) {
	      Builder.defineMacro("__gnu_linux__");
	    } else {
	      Builder.defineMacro("__ANDROID__", "1");
	      unsigned ApiLevel;
	      unsigned MinorVer;
	      unsigned PatchVer;
	      Triple.getEnvironmentVersion(ApiLevel, MinorVer, PatchVer);
	      this->PlatformName = "android";
	      this->PlatformMinVersion = VersionTuple(ApiLevel, MinorVer, PatchVer);
	      // __ANDROID_API__ is only emitted for a non-zero major version.
	      if (ApiLevel != 0) {
	        Builder.defineMacro("__ANDROID_API__", Twine(ApiLevel));
	      }
	    }

	    if (Opts.POSIXThreads) {
	      Builder.defineMacro("_REENTRANT");
	    }
	    if (Opts.CPlusPlus) {
	      Builder.defineMacro("_GNU_SOURCE");
	    }
	    if (this->HasFloat128) {
	      Builder.defineMacro("__FLOAT128__");
	    }
	  }

	public:
	  LinuxTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    this->WIntType = TargetInfo::UnsignedInt;

	    const auto Arch = Triple.getArch();
	    const bool IsMipsOrPPC =
	        Arch == llvm::Triple::mips || Arch == llvm::Triple::mipsel ||
	        Arch == llvm::Triple::mips64 || Arch == llvm::Triple::mips64el ||
	        Arch == llvm::Triple::ppc || Arch == llvm::Triple::ppc64 ||
	        Arch == llvm::Triple::ppc64le;
	    const bool IsX86Family =
	        Arch == llvm::Triple::x86 || Arch == llvm::Triple::x86_64;

	    if (IsMipsOrPPC) {
	      // MIPS and PowerPC use "_mcount" as the profiling hook name.
	      this->MCountName = "_mcount";
	    } else if (IsX86Family) {
	      // __float128 is enabled for the x86 family only.
	      this->HasFloat128 = true;
	    }
	  }

	  // Section name returned for static initializers.
	  const char *getStaticInitSectionSpecifier() const override {
	    return ".text.startup";
	  }
	};

	// NetBSD Target
	// OS layer that adds the NetBSD predefined macros on top of the
	// architecture-specific TargetInfo supplied as `Target`.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY NetBSDTargetInfo : public OSTargetInfo<Target> {
	protected:
	// Emits the OS-level predefined macros into Builder. Triple is not
	// consulted by this implementation.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	// NetBSD defines; list based off of gcc output
	Builder.defineMacro("__NetBSD__");
	// Only __unix__ is defined here, directly, rather than through DefineStd
	// as the sibling OS classes do.
	Builder.defineMacro("__unix__");
	Builder.defineMacro("__ELF__");
	// _REENTRANT is only defined when POSIX threads are enabled.
	if (Opts.POSIXThreads)
	Builder.defineMacro("_REENTRANT");
	}

	public:
	// Sets the profiling hook name to "__mcount" for every architecture.
	NetBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {
	this->MCountName = "__mcount";
	}
	};

	// OpenBSD Target
	// OS layer that adds the OpenBSD predefined macros and type choices on top
	// of the architecture-specific TargetInfo supplied as `Target`.
	// Lines prefixed with '+' are additions made by this patch.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY OpenBSDTargetInfo : public OSTargetInfo<Target> {
	protected:
	// Emits the OS-level predefined macros into Builder. Triple is not
	// consulted by this implementation.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	// OpenBSD defines; list based off of gcc output

	Builder.defineMacro("__OpenBSD__");
	DefineStd(Builder, "unix", Opts);
	Builder.defineMacro("__ELF__");
	// _REENTRANT is only defined when POSIX threads are enabled.
	if (Opts.POSIXThreads)
	Builder.defineMacro("_REENTRANT");
	// __FLOAT128__ follows HasFloat128, which the constructor sets for x86.
	if (this->HasFloat128)
	Builder.defineMacro("__FLOAT128__");
	}

	public:
	OpenBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {
	// Added by the patch: wchar_t and wint_t are signed int, and intmax_t and
	// the 64-bit integer type are signed long long.
	+ this->WCharType = this->WIntType = this->SignedInt;
	+ this->IntMaxType = TargetInfo::SignedLongLong;
	+ this->Int64Type = TargetInfo::SignedLongLong;
	// Select the profiling hook name per architecture. The x86 cases enable
	// __float128 and then deliberately fall through into the default, so they
	// also get "__mcount"; the architectures listed after it get "_mcount".
	switch (Triple.getArch()) {
	case llvm::Triple::x86:
	case llvm::Triple::x86_64:
	this->HasFloat128 = true;
	LLVM_FALLTHROUGH;
	default:
	this->MCountName = "__mcount";
	break;
	case llvm::Triple::mips64:
	case llvm::Triple::mips64el:
	case llvm::Triple::ppc:
	+ case llvm::Triple::ppc64:
	+ case llvm::Triple::ppc64le:
	case llvm::Triple::sparcv9:
	this->MCountName = "_mcount";
	break;
	}
	}
	};

	// PSP Target
	// OS layer that adds the PSP predefined macros on top of the
	// architecture-specific TargetInfo supplied as `Target`.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY PSPTargetInfo : public OSTargetInfo<Target> {
	protected:
	// Emits the OS-level predefined macros into Builder. All macros are
	// unconditional; neither Opts nor Triple is consulted.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	// PSP defines; list based on the output of the pspdev gcc toolchain.
	Builder.defineMacro("PSP");
	Builder.defineMacro("_PSP");
	Builder.defineMacro("__psp__");
	Builder.defineMacro("__ELF__");
	}

	public:
	// NOTE(review): unlike every other OS target in this file, this constructor
	// takes only a Triple and forwards a single argument to OSTargetInfo. If
	// OSTargetInfo has no one-argument constructor this only compiles because
	// the template is never instantiated -- confirm and align the signature.
	PSPTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {}
	};

	// PS3 PPU Target
	// OS layer for the PS3 PPU: adds the predefined macros and narrows long and
	// pointers to 32 bits on top of the architecture TargetInfo `Target`.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY PS3PPUTargetInfo : public OSTargetInfo<Target> {
	protected:
	// Emits the OS-level predefined macros into Builder. All macros are
	// unconditional; neither Opts nor Triple is consulted.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	// PS3 PPU defines.
	Builder.defineMacro("__PPC__");
	Builder.defineMacro("__PPU__");
	Builder.defineMacro("__CELLOS_LV2__");
	Builder.defineMacro("__ELF__");
	Builder.defineMacro("__LP32__");
	Builder.defineMacro("_ARCH_PPC64");
	Builder.defineMacro("__powerpc64__");
	}

	public:
	PS3PPUTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {
	// long and pointers are 32 bits wide and 32-bit aligned.
	this->LongWidth = this->LongAlign = 32;
	this->PointerWidth = this->PointerAlign = 32;
	// With a 32-bit long, the 64-bit types are spelled long long.
	this->IntMaxType = TargetInfo::SignedLongLong;
	this->Int64Type = TargetInfo::SignedLongLong;
	this->SizeType = TargetInfo::UnsignedInt;
	// Replace the base layout with one that declares 32-bit pointers.
	this->resetDataLayout("E-m:e-p:32:32-i64:64-n32:64");
	}
	};

	// PS4 target
	// OS layer for PS4: adds the predefined macros and PS4-specific target
	// properties on top of the architecture TargetInfo `Target`.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY PS4OSTargetInfo : public OSTargetInfo<Target> {
	protected:
	// Emits the OS-level predefined macros into Builder, including the
	// FreeBSD 9 identification macros. Triple is not consulted.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	Builder.defineMacro("__FreeBSD__", "9");
	Builder.defineMacro("__FreeBSD_cc_version", "900001");
	Builder.defineMacro("__KPRINTF_ATTRIBUTE__");
	DefineStd(Builder, "unix", Opts);
	Builder.defineMacro("__ELF__");
	Builder.defineMacro("__SCE__");
	Builder.defineMacro("__ORBIS__");
	}

	public:
	PS4OSTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {
	// wchar_t is unsigned short on this target.
	this->WCharType = TargetInfo::UnsignedShort;

	// On PS4, TLS variable cannot be aligned to more than 32 bytes (256 bits).
	this->MaxTLSAlign = 256;

	// On PS4, do not honor explicit bit field alignment,
	// as in "__attribute__((aligned(2))) int b : 1;".
	this->UseExplicitBitFieldAlignment = false;

	// Every architecture takes the same path: the default label shares the
	// x86_64 body, so these two settings are applied unconditionally.
	switch (Triple.getArch()) {
	default:
	case llvm::Triple::x86_64:
	this->MCountName = ".mcount";
	this->NewAlign = 256;
	break;
	}
	}
	// Accepts only the default C calling convention; any other convention is
	// reported as an error.
	TargetInfo::CallingConvCheckResult
	checkCallingConvention(CallingConv CC) const override {
	return (CC == CC_C) ? TargetInfo::CCCR_OK : TargetInfo::CCCR_Error;
	}
	};

	// RTEMS Target
	// OS layer that adds the RTEMS predefined macros on top of the
	// architecture-specific TargetInfo supplied as `Target`.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY RTEMSTargetInfo : public OSTargetInfo<Target> {
	protected:
	// Emits the OS-level predefined macros into Builder. Triple is not
	// consulted by this implementation.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	// RTEMS defines; list based off of gcc output

	Builder.defineMacro("__rtems__");
	Builder.defineMacro("__ELF__");
	// _GNU_SOURCE is only predefined when compiling C++.
	if (Opts.CPlusPlus)
	Builder.defineMacro("_GNU_SOURCE");
	}

	public:
	RTEMSTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {
	// This switch currently has no effect: every MCountName assignment is
	// commented out, so each case only breaks. The commented lines record the
	// per-architecture names that would apply if profiling hooks were enabled.
	switch (Triple.getArch()) {
	default:
	case llvm::Triple::x86:
	// this->MCountName = ".mcount";
	break;
	case llvm::Triple::mips:
	case llvm::Triple::mipsel:
	case llvm::Triple::ppc:
	case llvm::Triple::ppc64:
	case llvm::Triple::ppc64le:
	// this->MCountName = "_mcount";
	break;
	case llvm::Triple::arm:
	// this->MCountName = "__mcount";
	break;
	}
	}
	};

	// Solaris target
	// OS layer for Solaris, stacked on the architecture TargetInfo `Target`.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY SolarisTargetInfo : public OSTargetInfo<Target> {
	protected:
	  // Emits the Solaris predefined macros into Builder.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    DefineStd(Builder, "sun", Opts);
	    DefineStd(Builder, "unix", Opts);
	    Builder.defineMacro("__ELF__");
	    Builder.defineMacro("__svr4__");
	    Builder.defineMacro("__SVR4");

	    // The Solaris headers want _XOPEN_SOURCE=600 for C99 and later and 500
	    // otherwise; feature_test.h rejects a C99/old-X/Open or C89/new-X/Open
	    // mismatch.
	    Builder.defineMacro("_XOPEN_SOURCE", Opts.C99 ? "600" : "500");

	    if (Opts.CPlusPlus) {
	      Builder.defineMacro("__C99FEATURES__");
	      Builder.defineMacro("_FILE_OFFSET_BITS", "64");
	    }

	    // GCC restricts the next two to C++.
	    Builder.defineMacro("_LARGEFILE_SOURCE");
	    Builder.defineMacro("_LARGEFILE64_SOURCE");
	    Builder.defineMacro("__EXTENSIONS__");

	    if (Opts.POSIXThreads) {
	      Builder.defineMacro("_REENTRANT");
	    }
	    if (this->HasFloat128) {
	      Builder.defineMacro("__FLOAT128__");
	    }
	  }

	public:
	  SolarisTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    // wchar_t and wint_t are int with 64-bit pointers and long otherwise.
	    this->WCharType = this->WIntType =
	        (this->PointerWidth == 64) ? this->SignedInt : this->SignedLong;

	    // __float128 is enabled for the x86 family only.
	    const auto Arch = Triple.getArch();
	    if (Arch == llvm::Triple::x86 || Arch == llvm::Triple::x86_64) {
	      this->HasFloat128 = true;
	    }
	  }
	};

	// AIX Target
	// OS layer for AIX, stacked on the architecture TargetInfo `Target`.
	template <typename Target>
	class AIXTargetInfo : public OSTargetInfo<Target> {
	protected:
	  // Emits the AIX predefined macros into Builder, including one _AIXnn
	  // macro for every listed release that is not newer than the OS version
	  // named in Triple.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    DefineStd(Builder, "unix", Opts);
	    Builder.defineMacro("_IBMR2");
	    Builder.defineMacro("_POWER");

	    Builder.defineMacro("_AIX");

	    unsigned Major, Minor, Micro;
	    Triple.getOSVersion(Major, Minor, Micro);

	    // AIX OS-version macros, oldest release first. The legacy entries are
	    // kept for completeness; there is no specific intent to support them.
	    struct VersionMacro {
	      int MinMajor;
	      int MinMinor;
	      const char *Name;
	    };
	    static const VersionMacro VersionMacros[] = {
	        {3, 2, "_AIX32"}, {4, 1, "_AIX41"}, {4, 3, "_AIX43"},
	        {5, 0, "_AIX50"}, {5, 1, "_AIX51"}, {5, 2, "_AIX52"},
	        {5, 3, "_AIX53"}, {6, 1, "_AIX61"}, {7, 1, "_AIX71"},
	        {7, 2, "_AIX72"},
	    };
	    std::pair<int, int> OsVersion = {Major, Minor};
	    for (const VersionMacro &Entry : VersionMacros) {
	      if (OsVersion >= std::make_pair(Entry.MinMajor, Entry.MinMinor))
	        Builder.defineMacro(Entry.Name);
	    }

	    // FIXME: Do not define _LONG_LONG when -fno-long-long is specified.
	    Builder.defineMacro("_LONG_LONG");

	    if (Opts.POSIXThreads)
	      Builder.defineMacro("_THREAD_SAFE");

	    if (this->PointerWidth == 64)
	      Builder.defineMacro("__64BIT__");

	    // _WCHAR_T is defined when wchar_t is a fundamental type
	    // (i.e., for C++ without -fno-wchar).
	    if (Opts.CPlusPlus && Opts.WChar)
	      Builder.defineMacro("_WCHAR_T");
	  }

	public:
	  AIXTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    this->TheCXXABI.set(TargetCXXABI::XL);

	    // wchar_t is unsigned int with 64-bit pointers, unsigned short otherwise.
	    this->WCharType = (this->PointerWidth == 64) ? this->UnsignedInt
	                                                 : this->UnsignedShort;
	    this->UseZeroLengthBitfieldAlignment = true;
	  }

	  // AIX sets FLT_EVAL_METHOD to be 1.
	  unsigned getFloatEvalMethod() const override { return 1; }
	  // The 128-bit integer type is reported as unavailable.
	  bool hasInt128Type() const override { return false; }
	};

	void addWindowsDefines(const llvm::Triple &Triple, const LangOptions &Opts,
	MacroBuilder &Builder);

	// Windows target
	// OS layer for Windows, stacked on the architecture TargetInfo `Target`.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY WindowsTargetInfo : public OSTargetInfo<Target> {
	protected:
	  // Delegates all predefined-macro emission to addWindowsDefines.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    addWindowsDefines(Triple, Opts, Builder);
	  }

	public:
	  WindowsTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    // Both wchar_t and wint_t are unsigned short.
	    this->WIntType = this->WCharType = TargetInfo::UnsignedShort;
	  }
	};

	// OS layer for Native Client, stacked on the architecture TargetInfo
	// `Target`.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY NaClTargetInfo : public OSTargetInfo<Target> {
	protected:
	  // Emits the Native Client predefined macros into Builder.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    if (Opts.POSIXThreads) {
	      Builder.defineMacro("_REENTRANT");
	    }
	    if (Opts.CPlusPlus) {
	      Builder.defineMacro("_GNU_SOURCE");
	    }

	    DefineStd(Builder, "unix", Opts);
	    Builder.defineMacro("__ELF__");
	    Builder.defineMacro("__native_client__");
	  }

	public:
	  NaClTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    // long and pointers are 32 bits wide and 32-bit aligned.
	    this->LongAlign = 32;
	    this->LongWidth = 32;
	    this->PointerAlign = 32;
	    this->PointerWidth = 32;
	    // The 64-bit types are spelled long long.
	    this->IntMaxType = TargetInfo::SignedLongLong;
	    this->Int64Type = TargetInfo::SignedLongLong;
	    // double, long double and long long are all 64 bits.
	    this->DoubleAlign = 64;
	    this->LongDoubleWidth = 64;
	    this->LongDoubleAlign = 64;
	    this->LongLongWidth = 64;
	    this->LongLongAlign = 64;
	    this->SizeType = TargetInfo::UnsignedInt;
	    this->PtrDiffType = TargetInfo::SignedInt;
	    this->IntPtrType = TargetInfo::SignedInt;
	    // RegParmMax is inherited from the underlying architecture.
	    this->LongDoubleFormat = &llvm::APFloat::IEEEdouble();

	    // Pick the data layout per architecture.
	    switch (Triple.getArch()) {
	    case llvm::Triple::arm:
	      // Handled in ARM's setABI().
	      break;
	    case llvm::Triple::x86:
	      this->resetDataLayout("e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-"
	                            "i64:64-n8:16:32-S128");
	      break;
	    case llvm::Triple::x86_64:
	      this->resetDataLayout("e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-"
	                            "i64:64-n8:16:32:64-S128");
	      break;
	    case llvm::Triple::mipsel:
	      // Handled on mips' setDataLayout.
	      break;
	    default:
	      // le32 is the only remaining architecture expected here.
	      assert(Triple.getArch() == llvm::Triple::le32);
	      this->resetDataLayout("e-p:32:32-i64:64");
	      break;
	    }
	  }
	};

	// Fuchsia Target
	// OS layer that adds the Fuchsia predefined macros and C++ ABI choice on
	// top of the architecture-specific TargetInfo supplied as `Target`.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY FuchsiaTargetInfo : public OSTargetInfo<Target> {
	protected:
	// Emits the OS-level predefined macros into Builder. Triple is not
	// consulted by this implementation.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	Builder.defineMacro("__Fuchsia__");
	Builder.defineMacro("__ELF__");
	// _REENTRANT is only defined when POSIX threads are enabled.
	if (Opts.POSIXThreads)
	Builder.defineMacro("_REENTRANT");
	// Required by the libc++ locale support.
	if (Opts.CPlusPlus)
	Builder.defineMacro("_GNU_SOURCE");
	}

	public:
	// Sets the profiling hook name and selects the Fuchsia C++ ABI.
	FuchsiaTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {
	this->MCountName = "__mcount";
	this->TheCXXABI.set(TargetCXXABI::Fuchsia);
	}
	};

	// WebAssembly target
	// Common OS layer for WebAssembly targets; WASITargetInfo and
	// EmscriptenTargetInfo in this file derive from it.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY WebAssemblyOSTargetInfo
	: public OSTargetInfo<Target> {
	protected:
	// Emits the macros shared by all WebAssembly OS flavors into Builder.
	// Triple is not consulted by this implementation.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	// A common platform macro.
	if (Opts.POSIXThreads)
	Builder.defineMacro("_REENTRANT");
	// Follow g++ convention and predefine _GNU_SOURCE for C++.
	if (Opts.CPlusPlus)
	Builder.defineMacro("_GNU_SOURCE");
	// Indicate that we have __float128.
	// Defined unconditionally here; the constructor always sets HasFloat128.
	Builder.defineMacro("__FLOAT128__");
	}

	public:
	// Sets the profiling hook name, selects the WebAssembly C++ ABI and
	// enables __float128.
	explicit WebAssemblyOSTargetInfo(const llvm::Triple &Triple,
	const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {
	this->MCountName = "__mcount";
	this->TheCXXABI.set(TargetCXXABI::WebAssembly);
	this->HasFloat128 = true;
	}
	};

	// WASI target
	// Extends the common WebAssembly OS layer with the WASI identification
	// macro.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY WASITargetInfo
	: public WebAssemblyOSTargetInfo<Target> {
	// Private (no access specifier precedes it) and final: emits the base
	// class macros first, then __wasi__.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const final {
	WebAssemblyOSTargetInfo<Target>::getOSDefines(Opts, Triple, Builder);
	Builder.defineMacro("__wasi__");
	}

	public:
	// Forwards to WebAssemblyOSTargetInfo; adds no further target properties.
	explicit WASITargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: WebAssemblyOSTargetInfo<Target>(Triple, Opts) {}
	};

	// Emscripten target
	// Extends the common WebAssembly OS layer with the Emscripten
	// identification macro.
	template <typename Target>
	class LLVM_LIBRARY_VISIBILITY EmscriptenTargetInfo
	: public WebAssemblyOSTargetInfo<Target> {
	// Private (no access specifier precedes it) and final: emits the base
	// class macros first, then __EMSCRIPTEN__.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const final {
	WebAssemblyOSTargetInfo<Target>::getOSDefines(Opts, Triple, Builder);
	Builder.defineMacro("__EMSCRIPTEN__");
	}

	public:
	// Forwards to WebAssemblyOSTargetInfo; adds no further target properties.
	explicit EmscriptenTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: WebAssemblyOSTargetInfo<Target>(Triple, Opts) {}
	};

	} // namespace targets
	} // namespace clang
	#endif // LLVM_CLANG_LIB_BASIC_TARGETS_OSTARGETS_H
	diff --git a/contrib/llvm-project/clang/lib/Basic/Targets/PPC.h b/contrib/llvm-project/clang/lib/Basic/Targets/PPC.h
	index 858059bacb86..bda6cb7dc89f 100644
	--- a/contrib/llvm-project/clang/lib/Basic/Targets/PPC.h
	+++ b/contrib/llvm-project/clang/lib/Basic/Targets/PPC.h
	@@ -1,494 +1,494 @@
	//===--- PPC.h - Declare PPC target feature support -------------- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file declares PPC TargetInfo objects.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CLANG_LIB_BASIC_TARGETS_PPC_H
	#define LLVM_CLANG_LIB_BASIC_TARGETS_PPC_H

	#include "OSTargets.h"
	#include "clang/Basic/TargetInfo.h"
	#include "clang/Basic/TargetOptions.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/Support/Compiler.h"

	namespace clang {
	namespace targets {

	// PPC abstract base class
	class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {

	/// Flags for architecture specific defines.
	/// Each enumerator occupies its own bit, so several can be OR-ed into one
	/// value; setCPU builds such combinations and stores them in ArchDefs.
	typedef enum {
	ArchDefineNone = 0,
	ArchDefineName = 1 << 0, // <name> is substituted for arch name.
	ArchDefinePpcgr = 1 << 1,
	ArchDefinePpcsq = 1 << 2,
	ArchDefine440 = 1 << 3,
	ArchDefine603 = 1 << 4,
	ArchDefine604 = 1 << 5,
	ArchDefinePwr4 = 1 << 6,
	ArchDefinePwr5 = 1 << 7,
	ArchDefinePwr5x = 1 << 8,
	ArchDefinePwr6 = 1 << 9,
	ArchDefinePwr6x = 1 << 10,
	ArchDefinePwr7 = 1 << 11,
	ArchDefinePwr8 = 1 << 12,
	ArchDefinePwr9 = 1 << 13,
	ArchDefinePwr10 = 1 << 14,
	ArchDefineFuture = 1 << 15,
	ArchDefineA2 = 1 << 16,
	ArchDefineA2q = 1 << 17,
	ArchDefineE500 = 1 << 18
	} ArchDefineTypes;

	ArchDefineTypes ArchDefs = ArchDefineNone;
	static const Builtin::Info BuiltinInfo[];
	static const char *const GCCRegNames[];
	static const TargetInfo::GCCRegAlias GCCRegAliases[];
	std::string CPU;
	enum PPCFloatABI { HardFloat, SoftFloat } FloatABI;

	// Target cpu features.
	bool HasAltivec = false;
	bool HasVSX = false;
	bool HasP8Vector = false;
	bool HasP8Crypto = false;
	bool HasDirectMove = false;
	bool HasQPX = false;
	bool HasHTM = false;
	bool HasBPERMD = false;
	bool HasExtDiv = false;
	bool HasP9Vector = false;
	bool HasSPE = false;
	bool HasP10Vector = false;
	bool HasPCRelativeMemops = false;

	protected:
	std::string ABI;

	public:
	// Sets the defaults shared by all PPC targets: 128-bit suitable and SIMD
	// alignment, and a 128-bit long double in the PPC double-double format.
	// The TargetOptions argument is unnamed and unused here.
	PPCTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	: TargetInfo(Triple) {
	SuitableAlign = 128;
	SimdDefaultAlign = 128;
	LongDoubleWidth = LongDoubleAlign = 128;
	LongDoubleFormat = &llvm::APFloat::PPCDoubleDouble();
	}

	// Set the language option for altivec based on our value.
	void adjust(LangOptions &Opts) override;

	// Note: GCC recognizes the following additional cpus:
	// 401, 403, 405, 405fp, 440fp, 464, 464fp, 476, 476fp, 505, 740, 801,
	// 821, 823, 8540, e300c2, e300c3, e500mc64, e6500, 860, cell, titan, rs64.
	bool isValidCPUName(StringRef Name) const override;
	void fillValidCPUList(SmallVectorImpl<StringRef> &Values) const override;

	// Selects the target CPU. If isValidCPUName accepts Name, stores it in CPU
	// and derives the matching ArchDefs flag set from the table below. Returns
	// whether Name was accepted; on rejection CPU and ArchDefs are unchanged.
	bool setCPU(const std::string &Name) override {
	bool CPUKnown = isValidCPUName(Name);
	if (CPUKnown) {
	CPU = Name;

	// CPU identification.
	// The StringSwitch yields an int bit mask that is cast back to the enum.
	// Each POWER generation also carries the flags of its predecessors. Valid
	// CPU names without an entry here map to ArchDefineNone.
	ArchDefs =
	(ArchDefineTypes)llvm::StringSwitch<int>(CPU)
	.Case("440", ArchDefineName)
	.Case("450", ArchDefineName \| ArchDefine440)
	.Case("601", ArchDefineName)
	.Case("602", ArchDefineName \| ArchDefinePpcgr)
	.Case("603", ArchDefineName \| ArchDefinePpcgr)
	.Case("603e", ArchDefineName \| ArchDefine603 \| ArchDefinePpcgr)
	.Case("603ev", ArchDefineName \| ArchDefine603 \| ArchDefinePpcgr)
	.Case("604", ArchDefineName \| ArchDefinePpcgr)
	.Case("604e", ArchDefineName \| ArchDefine604 \| ArchDefinePpcgr)
	.Case("620", ArchDefineName \| ArchDefinePpcgr)
	.Case("630", ArchDefineName \| ArchDefinePpcgr)
	.Case("7400", ArchDefineName \| ArchDefinePpcgr)
	.Case("7450", ArchDefineName \| ArchDefinePpcgr)
	.Case("750", ArchDefineName \| ArchDefinePpcgr)
	.Case("970", ArchDefineName \| ArchDefinePwr4 \| ArchDefinePpcgr \|
	ArchDefinePpcsq)
	.Case("a2", ArchDefineA2)
	.Case("a2q", ArchDefineName \| ArchDefineA2 \| ArchDefineA2q)
	.Cases("power3", "pwr3", ArchDefinePpcgr)
	.Cases("power4", "pwr4",
	ArchDefinePwr4 \| ArchDefinePpcgr \| ArchDefinePpcsq)
	.Cases("power5", "pwr5",
	ArchDefinePwr5 \| ArchDefinePwr4 \| ArchDefinePpcgr \|
	ArchDefinePpcsq)
	.Cases("power5x", "pwr5x",
	ArchDefinePwr5x \| ArchDefinePwr5 \| ArchDefinePwr4 \|
	ArchDefinePpcgr \| ArchDefinePpcsq)
	.Cases("power6", "pwr6",
	ArchDefinePwr6 \| ArchDefinePwr5x \| ArchDefinePwr5 \|
	ArchDefinePwr4 \| ArchDefinePpcgr \| ArchDefinePpcsq)
	.Cases("power6x", "pwr6x",
	ArchDefinePwr6x \| ArchDefinePwr6 \| ArchDefinePwr5x \|
	ArchDefinePwr5 \| ArchDefinePwr4 \| ArchDefinePpcgr \|
	ArchDefinePpcsq)
	.Cases("power7", "pwr7",
	ArchDefinePwr7 \| ArchDefinePwr6 \| ArchDefinePwr5x \|
	ArchDefinePwr5 \| ArchDefinePwr4 \| ArchDefinePpcgr \|
	ArchDefinePpcsq)
	// powerpc64le automatically defaults to at least power8.
	.Cases("power8", "pwr8", "ppc64le",
	ArchDefinePwr8 \| ArchDefinePwr7 \| ArchDefinePwr6 \|
	ArchDefinePwr5x \| ArchDefinePwr5 \| ArchDefinePwr4 \|
	ArchDefinePpcgr \| ArchDefinePpcsq)
	.Cases("power9", "pwr9",
	ArchDefinePwr9 \| ArchDefinePwr8 \| ArchDefinePwr7 \|
	ArchDefinePwr6 \| ArchDefinePwr5x \| ArchDefinePwr5 \|
	ArchDefinePwr4 \| ArchDefinePpcgr \| ArchDefinePpcsq)
	.Cases("power10", "pwr10",
	ArchDefinePwr10 \| ArchDefinePwr9 \| ArchDefinePwr8 \|
	ArchDefinePwr7 \| ArchDefinePwr6 \| ArchDefinePwr5x \|
	ArchDefinePwr5 \| ArchDefinePwr4 \| ArchDefinePpcgr \|
	ArchDefinePpcsq)
	.Case("future",
	ArchDefineFuture \| ArchDefinePwr10 \| ArchDefinePwr9 \|
	ArchDefinePwr8 \| ArchDefinePwr7 \| ArchDefinePwr6 \|
	ArchDefinePwr5x \| ArchDefinePwr5 \| ArchDefinePwr4 \|
	ArchDefinePpcgr \| ArchDefinePpcsq)
	.Cases("8548", "e500", ArchDefineE500)
	.Default(ArchDefineNone);
	}
	return CPUKnown;
	}

	// Returns the ABI name currently stored in the ABI member.
	StringRef getABI() const override { return ABI; }

	ArrayRef<Builtin::Info> getTargetBuiltins() const override;

	// Always reports false for PPC.
	// NOTE(review): presumably this tells clang that count-leading-zeros of 0
	// is well defined on this target -- confirm against TargetInfo.
	bool isCLZForZeroUndef() const override { return false; }

	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override;

	bool
	initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
	StringRef CPU,
	const std::vector<std::string> &FeaturesVec) const override;

	void addP10SpecificFeatures(llvm::StringMap<bool> &Features) const;
	void addFutureSpecificFeatures(llvm::StringMap<bool> &Features) const;

	bool handleTargetFeatures(std::vector<std::string> &Features,
	DiagnosticsEngine &Diags) override;

	bool hasFeature(StringRef Feature) const override;

	void setFeatureEnabled(llvm::StringMap<bool> &Features, StringRef Name,
	bool Enabled) const override;

	ArrayRef<const char *> getGCCRegNames() const override;

	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override;

	ArrayRef<TargetInfo::AddlRegName> getGCCAddlRegNames() const override;

	// Validates the PPC-specific inline-asm constraint that starts at *Name.
	// Returns false when the constraint is not recognized, or when it names a
	// floating point/vector register class while FloatABI is SoftFloat.
	// Otherwise records in Info whether the operand may live in a register
	// and/or memory and returns true. For the two-character constraints ("w?"
	// and "es") Name is advanced by one so it points at the second character.
	bool validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &Info) const override {
	switch (*Name) {
	default:
	return false;
	case 'O': // Zero
	break;
	case 'f': // Floating point register
	// Don't use floating point registers on soft float ABI.
	if (FloatABI == SoftFloat)
	return false;
	LLVM_FALLTHROUGH;
	case 'b': // Base register
	Info.setAllowsRegister();
	break;
	// FIXME: The following are added to allow parsing.
	// I just took a guess at what the actions should be.
	// Also, is more specific checking needed? I.e. specific registers?
	case 'd': // Floating point register (containing 64-bit value)
	case 'v': // Altivec vector register
	// Don't use floating point and altivec vector registers
	// on soft float ABI
	if (FloatABI == SoftFloat)
	return false;
	Info.setAllowsRegister();
	break;
	case 'w':
	// Two-character constraint: the second character selects the class.
	switch (Name[1]) {
	case 'd': // VSX vector register to hold vector double data
	case 'f': // VSX vector register to hold vector float data
	case 's': // VSX vector register to hold scalar double data
	case 'w': // VSX vector register to hold scalar double data
	case 'a': // Any VSX register
	case 'c': // An individual CR bit
	case 'i': // FP or VSX register to hold 64-bit integers data
	break;
	default:
	return false;
	}
	Info.setAllowsRegister();
	Name++; // Skip over 'w'.
	break;
	case 'h': // `MQ', `CTR', or `LINK' register
	case 'q': // `MQ' register
	case 'c': // `CTR' register
	case 'l': // `LINK' register
	case 'x': // `CR' register (condition register) number 0
	case 'y': // `CR' register (condition register)
	case 'z': // `XER[CA]' carry bit (part of the XER register)
	Info.setAllowsRegister();
	break;
	// The constant constraints below are accepted without changing Info.
	case 'I': // Signed 16-bit constant
	case 'J': // Unsigned 16-bit constant shifted left 16 bits
	// (use `L' instead for SImode constants)
	case 'K': // Unsigned 16-bit constant
	case 'L': // Signed 16-bit constant shifted left 16 bits
	case 'M': // Constant larger than 31
	case 'N': // Exact power of 2
	case 'P': // Constant whose negation is a signed 16-bit constant
	case 'G': // Floating point constant that can be loaded into a
	// register with one instruction per word
	case 'H': // Integer/Floating point constant that can be loaded
	// into a register using three instructions
	break;
	// NOTE(review): the 'm' label has no body of its own and shares the 'e'
	// code below, so an 'm' reaching this function is rejected unless the next
	// character is 's'. Presumably plain 'm' is handled before this hook is
	// called -- confirm against the caller.
	case 'm': // Memory operand. Note that on PowerPC targets, m can
	// include addresses that update the base register. It
	// is therefore only safe to use `m' in an asm statement
	// if that asm statement accesses the operand exactly once.
	// The asm statement must also use `%U<opno>' as a
	// placeholder for the "update" flag in the corresponding
	// load or store instruction. For example:
	// asm ("st%U0 %1,%0" : "=m" (mem) : "r" (val));
	// is correct but:
	// asm ("st %1,%0" : "=m" (mem) : "r" (val));
	// is not. Use es rather than m if you don't want the base
	// register to be updated.
	case 'e':
	if (Name[1] != 's')
	return false;
	// es: A "stable" memory operand; that is, one which does not
	// include any automodification of the base register. Unlike
	// `m', this constraint can be used in asm statements that
	// might access the operand several times, or that might not
	// access it at all.
	Info.setAllowsMemory();
	Name++; // Skip over 'e'.
	break;
	case 'Q': // Memory operand that is an offset from a register (it is
	// usually better to use `m' or `es' in asm statements)
	Info.setAllowsRegister();
	LLVM_FALLTHROUGH;
	case 'Z': // Memory operand that is an indexed or indirect from a
	// register (it is usually better to use `m' or `es' in
	// asm statements)
	Info.setAllowsMemory();
	break;
	// The remaining constraints are accepted without changing Info.
	case 'R': // AIX TOC entry
	case 'a': // Address operand that is an indexed or indirect from a
	// register (`p' is preferable for asm statements)
	case 'S': // Constant suitable as a 64-bit mask operand
	case 'T': // Constant suitable as a 32-bit mask operand
	case 'U': // System V Release 4 small data area reference
	case 't': // AND masks that can be performed by two rldic{l, r}
	// instructions
	case 'W': // Vector constant that does not require memory
	case 'j': // Vector constant that is all zeros.
	break;
	// End FIXME.
	}
	return true;
	}

	// Converts one asm constraint to its internal spelling. The two-character
	// constraints starting with 'e' or 'w' are returned prefixed with a "^"
	// hint for later parsing and Constraint is advanced past the first
	// character; everything else is delegated to TargetInfo.
	std::string convertConstraint(const char *&Constraint) const override {
	  const char Lead = *Constraint;
	  if (Lead != 'e' && Lead != 'w')
	    return TargetInfo::convertConstraint(Constraint);

	  std::string Hinted("^");
	  Hinted.append(Constraint, 2);
	  ++Constraint;
	  return Hinted;
	}

	// Returns an empty clobber string.
	const char *getClobbers() const override { return ""; }
	// Maps exception-handling data register index 0 to register 3 and index 1
	// to register 4; any other index yields -1.
	int getEHDataRegisterNumber(unsigned RegNo) const override {
	  switch (RegNo) {
	  case 0:
	    return 3;
	  case 1:
	    return 4;
	  default:
	    return -1;
	  }
	}

	// Always reports true for PPC.
	bool hasSjLjLowering() const override { return true; }

	// Returns the mangling for long double: "e" when it is 64 bits wide, "g"
	// when it uses the PPC double-double format, and "u9__ieee128" otherwise.
	const char *getLongDoubleMangling() const override {
	  if (LongDoubleWidth == 64)
	    return "e";
	  if (LongDoubleFormat == &llvm::APFloat::PPCDoubleDouble())
	    return "g";
	  return "u9__ieee128";
	}
	// Returns "u9__ieee128", the same string getLongDoubleMangling falls back to.
	const char *getFloat128Mangling() const override { return "u9__ieee128"; }

	// Always reports true for PPC.
	bool hasExtIntType() const override { return true; }

	// Returns true when RegName is exactly "r1" or "x1", the two names this
	// target treats as the stack pointer register.
	bool isSPRegName(StringRef RegName) const override {
	return RegName.equals("r1") \|\| RegName.equals("x1");
	}
	};

	// 32-bit PowerPC target: sets the data layout, the pointer-sized integer
	// types per OS, the long double format and the atomic width limits.
	class LLVM_LIBRARY_VISIBILITY PPC32TargetInfo : public PPCTargetInfo {
	public:
	PPC32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: PPCTargetInfo(Triple, Opts) {
	// The two layouts differ only in the mangling component: "m:a" for AIX,
	// "m:e" for everything else.
	if (Triple.isOSAIX())
	resetDataLayout("E-m:a-p:32:32-i64:64-n32");
	else
	resetDataLayout("E-m:e-p:32:32-i64:64-n32");

	// size_t, ptrdiff_t and intptr_t are int-based on Linux, FreeBSD and
	// NetBSD and long-based on AIX; other OSes keep the inherited defaults.
	switch (getTriple().getOS()) {
	case llvm::Triple::Linux:
	case llvm::Triple::FreeBSD:
	case llvm::Triple::NetBSD:
	SizeType = UnsignedInt;
	PtrDiffType = SignedInt;
	IntPtrType = SignedInt;
	break;
	case llvm::Triple::AIX:
	SizeType = UnsignedLong;
	PtrDiffType = SignedLong;
	IntPtrType = SignedLong;
	SuitableAlign = 64;
	break;
	default:
	break;
	}

	// These OSes and musl use a 64-bit IEEE double as long double, replacing
	// the 128-bit double-double default set by PPCTargetInfo.
	if (Triple.isOSFreeBSD() \|\| Triple.isOSNetBSD() \|\| Triple.isOSOpenBSD() \|\|
	Triple.getOS() == llvm::Triple::AIX \|\| Triple.isMusl()) {
	LongDoubleWidth = LongDoubleAlign = 64;
	LongDoubleFormat = &llvm::APFloat::IEEEdouble();
	}

	// PPC32 supports atomics up to 4 bytes.
	MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 32;
	}

	// Returns the va_list representation used by this target.
	BuiltinVaListKind getBuiltinVaListKind() const override {
	// This is the ELF definition, and is overridden by the Darwin sub-target
	return TargetInfo::PowerABIBuiltinVaList;
	}
	};

	// Note: ABI differences may eventually require us to have a separate
	// TargetInfo for little endian.
	// 64-bit PowerPC target: sets 64-bit long and pointers, the data layout and
	// default ABI, the long double format and the atomic width limits.
	// Lines prefixed with '-' and '+' are the removal and addition made by
	// this patch.
	class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo {
	public:
	PPC64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: PPCTargetInfo(Triple, Opts) {
	LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
	IntMaxType = SignedLong;
	Int64Type = SignedLong;

	// AIX gets its own layout and leaves ABI unset; ppc64le defaults to
	// "elfv2" with a little-endian layout; everything else defaults to "elfv1"
	// with a big-endian layout.
	if (Triple.isOSAIX()) {
	// TODO: Set appropriate ABI for AIX platform.
	resetDataLayout("E-m:a-i64:64-n32:64");
	SuitableAlign = 64;
	} else if ((Triple.getArch() == llvm::Triple::ppc64le)) {
	resetDataLayout("e-m:e-i64:64-n32:64");
	ABI = "elfv2";
	} else {
	resetDataLayout("E-m:e-i64:64-n32:64");
	ABI = "elfv1";
	}

	// These OSes and musl use a 64-bit IEEE double as long double, replacing
	// the 128-bit double-double default set by PPCTargetInfo. The patch adds
	// OpenBSD to the list.
	- if (Triple.isOSFreeBSD() \|\| Triple.getOS() == llvm::Triple::AIX \|\|
	- Triple.isMusl()) {
	+ if (Triple.isOSFreeBSD() \|\| Triple.isOSOpenBSD() \|\|
	+ Triple.getOS() == llvm::Triple::AIX \|\| Triple.isMusl()) {
	LongDoubleWidth = LongDoubleAlign = 64;
	LongDoubleFormat = &llvm::APFloat::IEEEdouble();
	}

	// PPC64 supports atomics up to 8 bytes.
	MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
	}

	// Returns the va_list representation used by this target.
	BuiltinVaListKind getBuiltinVaListKind() const override {
	return TargetInfo::CharPtrBuiltinVaList;
	}

	// PPC64 Linux-specific ABI options.
	// Accepts exactly "elfv1", "elfv1-qpx" and "elfv2"; stores the name in ABI
	// and returns true. Any other name is rejected and ABI is left unchanged.
	bool setABI(const std::string &Name) override {
	if (Name == "elfv1" \|\| Name == "elfv1-qpx" \|\| Name == "elfv2") {
	ABI = Name;
	return true;
	}
	return false;
	}

	// Accepts the Swift calling convention; every other convention yields a
	// warning result.
	CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	switch (CC) {
	case CC_Swift:
	return CCCR_OK;
	default:
	return CCCR_Warning;
	}
	}
	};

	/// Darwin flavour of the 32-bit PowerPC target: adjusts a handful of type
	/// properties and the data layout on top of DarwinTargetInfo<PPC32TargetInfo>.
	class LLVM_LIBRARY_VISIBILITY DarwinPPC32TargetInfo
	    : public DarwinTargetInfo<PPC32TargetInfo> {
	public:
	  DarwinPPC32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : DarwinTargetInfo<PPC32TargetInfo>(Triple, Opts) {
	    HasAlignMac68kSupport = true;

	    // XXX support -mone-byte-bool?
	    BoolAlign = 32;
	    BoolWidth = 32;

	    // for http://llvm.org/bugs/show_bug.cgi?id=15726
	    PtrDiffType = SignedInt;

	    LongLongAlign = 32;
	    resetDataLayout("E-m:o-p:32:32-f64:32:64-n32");
	  }

	  /// This sub-target uses the char*-style va_list.
	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::CharPtrBuiltinVaList;
	  }
	};

	/// Darwin flavour of the 64-bit PowerPC target: enables mac68k alignment
	/// support and installs the "m:o" data layout.
	class LLVM_LIBRARY_VISIBILITY DarwinPPC64TargetInfo
	    : public DarwinTargetInfo<PPC64TargetInfo> {
	public:
	  DarwinPPC64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : DarwinTargetInfo<PPC64TargetInfo>(Triple, Opts) {
	    HasAlignMac68kSupport = true;
	    resetDataLayout("E-m:o-i64:64-n32:64");
	  }
	};

	/// AIX flavour of the 32-bit PowerPC target. Constructors are inherited from
	/// AIXTargetInfo; the only override switches va_list to the char* kind.
	class LLVM_LIBRARY_VISIBILITY AIXPPC32TargetInfo
	    : public AIXTargetInfo<PPC32TargetInfo> {
	public:
	  using AIXTargetInfo::AIXTargetInfo;

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::CharPtrBuiltinVaList;
	  }
	};

	/// AIX flavour of the 64-bit PowerPC target; it adds nothing beyond the
	/// constructors inherited from AIXTargetInfo.
	class LLVM_LIBRARY_VISIBILITY AIXPPC64TargetInfo
	    : public AIXTargetInfo<PPC64TargetInfo> {
	public:
	  using AIXTargetInfo::AIXTargetInfo;
	};

	} // namespace targets
	} // namespace clang
	#endif // LLVM_CLANG_LIB_BASIC_TARGETS_PPC_H
	diff --git a/contrib/llvm-project/clang/lib/Basic/Targets/Sparc.cpp b/contrib/llvm-project/clang/lib/Basic/Targets/Sparc.cpp
	index 13aa964d4716..48f36c5ba1c6 100644
	--- a/contrib/llvm-project/clang/lib/Basic/Targets/Sparc.cpp
	+++ b/contrib/llvm-project/clang/lib/Basic/Targets/Sparc.cpp
	@@ -1,250 +1,255 @@
	//===--- Sparc.cpp - Implement Sparc target feature support ---------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements Sparc TargetInfo objects.
	//
	//===----------------------------------------------------------------------===//

	#include "Sparc.h"
	#include "Targets.h"
	#include "clang/Basic/MacroBuilder.h"
	#include "llvm/ADT/StringSwitch.h"

	using namespace clang;
	using namespace clang::targets;

	const char *const SparcTargetInfo::GCCRegNames[] = {
	    // Integer registers r0-r31.
	    "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
	    "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
	    "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
	    "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31",

	    // Floating-point registers f0-f31.
	    "f0",  "f1",  "f2",  "f3",  "f4",  "f5",  "f6",  "f7",
	    "f8",  "f9",  "f10", "f11", "f12", "f13", "f14", "f15",
	    "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
	    "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31",

	    // Floating-point registers f32-f62; only even numbers are listed.
	    "f32", "f34", "f36", "f38", "f40", "f42", "f44", "f46",
	    "f48", "f50", "f52", "f54", "f56", "f58", "f60", "f62",
	};

	/// Exposes the GCCRegNames table as an ArrayRef.
	ArrayRef<const char *> SparcTargetInfo::getGCCRegNames() const {
	  ArrayRef<const char *> Names = llvm::makeArrayRef(GCCRegNames);
	  return Names;
	}

	// Alternative spellings accepted for the integer registers; each entry maps
	// one or more aliases onto the rN name used in GCCRegNames.
	const TargetInfo::GCCRegAlias SparcTargetInfo::GCCRegAliases[] = {
	    // g0-g7 -> r0-r7
	    {{"g0"}, "r0"},        {{"g1"}, "r1"},
	    {{"g2"}, "r2"},        {{"g3"}, "r3"},
	    {{"g4"}, "r4"},        {{"g5"}, "r5"},
	    {{"g6"}, "r6"},        {{"g7"}, "r7"},
	    // o0-o7 -> r8-r15 (o6 is also spelled "sp")
	    {{"o0"}, "r8"},        {{"o1"}, "r9"},
	    {{"o2"}, "r10"},       {{"o3"}, "r11"},
	    {{"o4"}, "r12"},       {{"o5"}, "r13"},
	    {{"o6", "sp"}, "r14"}, {{"o7"}, "r15"},
	    // l0-l7 -> r16-r23
	    {{"l0"}, "r16"},       {{"l1"}, "r17"},
	    {{"l2"}, "r18"},       {{"l3"}, "r19"},
	    {{"l4"}, "r20"},       {{"l5"}, "r21"},
	    {{"l6"}, "r22"},       {{"l7"}, "r23"},
	    // i0-i7 -> r24-r31 (i6 is also spelled "fp")
	    {{"i0"}, "r24"},       {{"i1"}, "r25"},
	    {{"i2"}, "r26"},       {{"i3"}, "r27"},
	    {{"i4"}, "r28"},       {{"i5"}, "r29"},
	    {{"i6", "fp"}, "r30"}, {{"i7"}, "r31"},
	};

	/// Exposes the GCCRegAliases table as an ArrayRef.
	ArrayRef<TargetInfo::GCCRegAlias> SparcTargetInfo::getGCCRegAliases() const {
	  ArrayRef<TargetInfo::GCCRegAlias> Aliases =
	      llvm::makeArrayRef(GCCRegAliases);
	  return Aliases;
	}

	/// Feature query: "softfloat" mirrors the SoftFloat flag, "sparc" is always
	/// present, and every other name is reported as absent.
	bool SparcTargetInfo::hasFeature(StringRef Feature) const {
	  if (Feature == "softfloat")
	    return SoftFloat;
	  if (Feature == "sparc")
	    return true;
	  return false;
	}

	// One row of the CPUInfo table: ties a CPU name string to its CPUKind and
	// to the CPUGeneration that kind belongs to. Field order matters because
	// the table rows are written as positional aggregate initialisers.
	struct SparcCPUInfo {
	// CPU name; getCPUKind() compares it against the requested name.
	llvm::StringLiteral Name;
	// Kind that getCPUKind() returns for Name.
	SparcTargetInfo::CPUKind Kind;
	// Generation that getCPUGeneration() returns for Kind.
	SparcTargetInfo::CPUGeneration Generation;
	};

	// Table of known Sparc CPUs: {name, kind, generation}. getCPUKind() and
	// getCPUGeneration() scan it front to back and use the first match, so when
	// several names share a kind (e.g. "ma2x5x", "myriad2", "myriad2.2") the
	// earliest row decides the generation lookup.
	static constexpr SparcCPUInfo CPUInfo[] = {
	{{"v8"}, SparcTargetInfo::CK_V8, SparcTargetInfo::CG_V8},
	{{"supersparc"}, SparcTargetInfo::CK_SUPERSPARC, SparcTargetInfo::CG_V8},
	{{"sparclite"}, SparcTargetInfo::CK_SPARCLITE, SparcTargetInfo::CG_V8},
	{{"f934"}, SparcTargetInfo::CK_F934, SparcTargetInfo::CG_V8},
	{{"hypersparc"}, SparcTargetInfo::CK_HYPERSPARC, SparcTargetInfo::CG_V8},
	{{"sparclite86x"},
	SparcTargetInfo::CK_SPARCLITE86X,
	SparcTargetInfo::CG_V8},
	{{"sparclet"}, SparcTargetInfo::CK_SPARCLET, SparcTargetInfo::CG_V8},
	{{"tsc701"}, SparcTargetInfo::CK_TSC701, SparcTargetInfo::CG_V8},
	{{"v9"}, SparcTargetInfo::CK_V9, SparcTargetInfo::CG_V9},
	{{"ultrasparc"}, SparcTargetInfo::CK_ULTRASPARC, SparcTargetInfo::CG_V9},
	{{"ultrasparc3"}, SparcTargetInfo::CK_ULTRASPARC3, SparcTargetInfo::CG_V9},
	{{"niagara"}, SparcTargetInfo::CK_NIAGARA, SparcTargetInfo::CG_V9},
	{{"niagara2"}, SparcTargetInfo::CK_NIAGARA2, SparcTargetInfo::CG_V9},
	{{"niagara3"}, SparcTargetInfo::CK_NIAGARA3, SparcTargetInfo::CG_V9},
	{{"niagara4"}, SparcTargetInfo::CK_NIAGARA4, SparcTargetInfo::CG_V9},
	{{"ma2100"}, SparcTargetInfo::CK_MYRIAD2100, SparcTargetInfo::CG_V8},
	{{"ma2150"}, SparcTargetInfo::CK_MYRIAD2150, SparcTargetInfo::CG_V8},
	{{"ma2155"}, SparcTargetInfo::CK_MYRIAD2155, SparcTargetInfo::CG_V8},
	{{"ma2450"}, SparcTargetInfo::CK_MYRIAD2450, SparcTargetInfo::CG_V8},
	{{"ma2455"}, SparcTargetInfo::CK_MYRIAD2455, SparcTargetInfo::CG_V8},
	{{"ma2x5x"}, SparcTargetInfo::CK_MYRIAD2x5x, SparcTargetInfo::CG_V8},
	{{"ma2080"}, SparcTargetInfo::CK_MYRIAD2080, SparcTargetInfo::CG_V8},
	{{"ma2085"}, SparcTargetInfo::CK_MYRIAD2085, SparcTargetInfo::CG_V8},
	{{"ma2480"}, SparcTargetInfo::CK_MYRIAD2480, SparcTargetInfo::CG_V8},
	{{"ma2485"}, SparcTargetInfo::CK_MYRIAD2485, SparcTargetInfo::CG_V8},
	{{"ma2x8x"}, SparcTargetInfo::CK_MYRIAD2x8x, SparcTargetInfo::CG_V8},
	// FIXME: the myriad2[.n] spellings are obsolete,
	// but a grace period is needed to allow updating dependent builds.
	{{"myriad2"}, SparcTargetInfo::CK_MYRIAD2x5x, SparcTargetInfo::CG_V8},
	{{"myriad2.1"}, SparcTargetInfo::CK_MYRIAD2100, SparcTargetInfo::CG_V8},
	{{"myriad2.2"}, SparcTargetInfo::CK_MYRIAD2x5x, SparcTargetInfo::CG_V8},
	{{"myriad2.3"}, SparcTargetInfo::CK_MYRIAD2x8x, SparcTargetInfo::CG_V8},
	{{"leon2"}, SparcTargetInfo::CK_LEON2, SparcTargetInfo::CG_V8},
	{{"at697e"}, SparcTargetInfo::CK_LEON2_AT697E, SparcTargetInfo::CG_V8},
	{{"at697f"}, SparcTargetInfo::CK_LEON2_AT697F, SparcTargetInfo::CG_V8},
	{{"leon3"}, SparcTargetInfo::CK_LEON3, SparcTargetInfo::CG_V8},
	{{"ut699"}, SparcTargetInfo::CK_LEON3_UT699, SparcTargetInfo::CG_V8},
	{{"gr712rc"}, SparcTargetInfo::CK_LEON3_GR712RC, SparcTargetInfo::CG_V8},
	{{"leon4"}, SparcTargetInfo::CK_LEON4, SparcTargetInfo::CG_V8},
	{{"gr740"}, SparcTargetInfo::CK_LEON4_GR740, SparcTargetInfo::CG_V8},
	};

	/// Maps a CPU kind to its generation. CK_GENERIC is treated as V8; any other
	/// kind takes the generation of the first CPUInfo row with that kind. A kind
	/// missing from the table is a programming error (llvm_unreachable).
	SparcTargetInfo::CPUGeneration
	SparcTargetInfo::getCPUGeneration(CPUKind Kind) const {
	  if (Kind == CK_GENERIC)
	    return CG_V8;

	  for (const SparcCPUInfo &Entry : CPUInfo)
	    if (Entry.Kind == Kind)
	      return Entry.Generation;

	  llvm_unreachable("Unexpected CPU kind");
	}

	/// Looks \p Name up in CPUInfo and returns the kind of the first row whose
	/// name matches; unknown names yield CK_GENERIC.
	SparcTargetInfo::CPUKind SparcTargetInfo::getCPUKind(StringRef Name) const {
	  for (const SparcCPUInfo &Entry : CPUInfo)
	    if (Entry.Name == Name)
	      return Entry.Kind;

	  return CK_GENERIC;
	}

	/// Appends the name of every CPUInfo row to \p Values, in table order.
	void SparcTargetInfo::fillValidCPUList(
	    SmallVectorImpl<StringRef> &Values) const {
	  for (const SparcCPUInfo *Row = std::begin(CPUInfo),
	                          *Last = std::end(CPUInfo);
	       Row != Last; ++Row)
	    Values.push_back(Row->Name);
	}

	/// Emits the macros common to all Sparc targets: the standard "sparc" set,
	/// an empty __REGISTER_PREFIX__, and SOFT_FLOAT when SoftFloat is set.
	void SparcTargetInfo::getTargetDefines(const LangOptions &Opts,
	                                       MacroBuilder &Builder) const {
	  DefineStd(Builder, "sparc", Opts);
	  Builder.defineMacro("__REGISTER_PREFIX__", "");

	  if (!SoftFloat)
	    return;
	  Builder.defineMacro("SOFT_FLOAT", "1");
	}

	/// Emits the macros for the 32-bit Sparc target: the common Sparc set, the
	/// generation macros for the selected CPU and, for the Myriad vendor, the
	/// Myriad/LEON macros.
	void SparcV8TargetInfo::getTargetDefines(const LangOptions &Opts,
	                                         MacroBuilder &Builder) const {
	  SparcTargetInfo::getTargetDefines(Opts, Builder);

	  // The trailing-underscore spellings are skipped on Solaris.
	  const bool IsSolaris = getTriple().getOS() == llvm::Triple::Solaris;
	  switch (getCPUGeneration(CPU)) {
	  case CG_V8:
	    Builder.defineMacro("__sparcv8");
	    if (!IsSolaris)
	      Builder.defineMacro("__sparcv8__");
	    break;
	  case CG_V9:
	    Builder.defineMacro("__sparcv9");
	    if (!IsSolaris) {
	      Builder.defineMacro("__sparcv9__");
	      Builder.defineMacro("__sparc_v9__");
	    }
	    break;
	  }

	  if (getTriple().getVendor() != llvm::Triple::Myriad)
	    return;

	  Builder.defineMacro("__sparc_v8__");
	  Builder.defineMacro("__leon__");

	  // Chip-specific macro stem. The two family-only kinds define none; every
	  // kind not listed here (including CK_MYRIAD2100) uses "__ma2100".
	  std::string ChipMacro;
	  switch (CPU) {
	  case CK_MYRIAD2150:
	    ChipMacro = "__ma2150";
	    break;
	  case CK_MYRIAD2155:
	    ChipMacro = "__ma2155";
	    break;
	  case CK_MYRIAD2450:
	    ChipMacro = "__ma2450";
	    break;
	  case CK_MYRIAD2455:
	    ChipMacro = "__ma2455";
	    break;
	  case CK_MYRIAD2080:
	    ChipMacro = "__ma2080";
	    break;
	  case CK_MYRIAD2085:
	    ChipMacro = "__ma2085";
	    break;
	  case CK_MYRIAD2480:
	    ChipMacro = "__ma2480";
	    break;
	  case CK_MYRIAD2485:
	    ChipMacro = "__ma2485";
	    break;
	  case CK_MYRIAD2x5x:
	  case CK_MYRIAD2x8x:
	    break;
	  default:
	    ChipMacro = "__ma2100";
	    break;
	  }

	  // Value published through __myriad2 / __myriad2__.
	  std::string FamilyValue;
	  switch (CPU) {
	  case CK_MYRIAD2150:
	  case CK_MYRIAD2155:
	  case CK_MYRIAD2450:
	  case CK_MYRIAD2455:
	  case CK_MYRIAD2x5x:
	    FamilyValue = "2";
	    break;
	  case CK_MYRIAD2080:
	  case CK_MYRIAD2085:
	  case CK_MYRIAD2480:
	  case CK_MYRIAD2485:
	  case CK_MYRIAD2x8x:
	    FamilyValue = "3";
	    break;
	  default:
	    FamilyValue = "1";
	    break;
	  }

	  if (!ChipMacro.empty()) {
	    Builder.defineMacro(ChipMacro, "1");
	    Builder.defineMacro(ChipMacro + "__", "1");
	  }

	  if (FamilyValue == "2") {
	    Builder.defineMacro("__ma2x5x", "1");
	    Builder.defineMacro("__ma2x5x__", "1");
	  } else if (FamilyValue == "3") {
	    Builder.defineMacro("__ma2x8x", "1");
	    Builder.defineMacro("__ma2x8x__", "1");
	  }

	  Builder.defineMacro("__myriad2__", FamilyValue);
	  Builder.defineMacro("__myriad2", FamilyValue);
	}

	// Emits the macros for the 64-bit Sparc target: the common Sparc set,
	// __sparcv9 and __arch64__, plus the extra spellings used outside Solaris.
	// The lines marked '+' are added by this change and define the
	// __GCC_HAVE_SYNC_COMPARE_AND_SWAP_{1,2,4,8} macros unconditionally.
	void SparcV9TargetInfo::getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const {
	SparcTargetInfo::getTargetDefines(Opts, Builder);
	Builder.defineMacro("__sparcv9");
	Builder.defineMacro("__arch64__");
	// Solaris doesn't need these variants, but the BSDs do.
	if (getTriple().getOS() != llvm::Triple::Solaris) {
	Builder.defineMacro("__sparc64__");
	Builder.defineMacro("__sparc_v9__");
	Builder.defineMacro("__sparcv9__");
	}
	+
	+ Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
	+ Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
	+ Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4");
	+ Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8");
	}

	/// Appends to \p Values the names of the CPUInfo rows whose generation is
	/// CG_V9, in table order.
	void SparcV9TargetInfo::fillValidCPUList(
	    SmallVectorImpl<StringRef> &Values) const {
	  for (const SparcCPUInfo &Entry : CPUInfo) {
	    if (Entry.Generation != CG_V9)
	      continue;
	    Values.push_back(Entry.Name);
	  }
	}
	diff --git a/contrib/llvm-project/clang/lib/Basic/Targets/WebAssembly.cpp b/contrib/llvm-project/clang/lib/Basic/Targets/WebAssembly.cpp
	index 6746768090f5..dcb3d8fd7790 100644
	--- a/contrib/llvm-project/clang/lib/Basic/Targets/WebAssembly.cpp
	+++ b/contrib/llvm-project/clang/lib/Basic/Targets/WebAssembly.cpp
	@@ -1,264 +1,266 @@
	//===--- WebAssembly.cpp - Implement WebAssembly target feature support ---===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements WebAssembly TargetInfo objects.
	//
	//===----------------------------------------------------------------------===//

	#include "WebAssembly.h"
	#include "Targets.h"
	#include "clang/Basic/Builtins.h"
	#include "clang/Basic/Diagnostic.h"
	#include "clang/Basic/TargetBuiltins.h"
	#include "llvm/ADT/StringSwitch.h"

	using namespace clang;
	using namespace clang::targets;

	// Builtin table generated from BuiltinsWebAssembly.def: the three macros
	// below turn each BUILTIN / TARGET_BUILTIN / LIBBUILTIN entry of the .def
	// file into one Builtin::Info initialiser. Only TARGET_BUILTIN fills the
	// last (feature) slot and only LIBBUILTIN fills the header slot.
	const Builtin::Info WebAssemblyTargetInfo::BuiltinInfo[] = {
	#define BUILTIN(ID, TYPE, ATTRS) \
	{#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr},
	#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \
	{#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE},
	#define LIBBUILTIN(ID, TYPE, ATTRS, HEADER) \
	{#ID, TYPE, ATTRS, HEADER, ALL_LANGUAGES, nullptr},
	#include "clang/Basic/BuiltinsWebAssembly.def"
	};

	// CPU names accepted by isValidCPUName() and reported by fillValidCPUList().
	static constexpr llvm::StringLiteral ValidCPUNames[] = {
	    {"mvp"},
	    {"bleeding-edge"},
	    {"generic"},
	};

	/// Returns the ABI name most recently accepted by setABI().
	StringRef WebAssemblyTargetInfo::getABI() const {
	  return ABI;
	}

	/// Stores \p Name as the ABI when it is "mvp" or "experimental-mv" and
	/// returns true; any other name is rejected and leaves the ABI unchanged.
	bool WebAssemblyTargetInfo::setABI(const std::string &Name) {
	  if (Name == "mvp" || Name == "experimental-mv") {
	    ABI = Name;
	    return true;
	  }
	  return false;
	}

	/// Reports whether the named feature is currently enabled on this target.
	/// The two SIMD names are answered from SIMDLevel; the rest map directly to
	/// their Has* flag. Unknown names are reported as disabled.
	bool WebAssemblyTargetInfo::hasFeature(StringRef Feature) const {
	  if (Feature == "simd128")
	    return SIMDLevel >= SIMD128;
	  if (Feature == "unimplemented-simd128")
	    return SIMDLevel >= UnimplementedSIMD128;
	  if (Feature == "nontrapping-fptoint")
	    return HasNontrappingFPToInt;
	  if (Feature == "sign-ext")
	    return HasSignExt;
	  if (Feature == "exception-handling")
	    return HasExceptionHandling;
	  if (Feature == "bulk-memory")
	    return HasBulkMemory;
	  if (Feature == "atomics")
	    return HasAtomics;
	  if (Feature == "mutable-globals")
	    return HasMutableGlobals;
	  if (Feature == "multivalue")
	    return HasMultivalue;
	  if (Feature == "tail-call")
	    return HasTailCall;
	  if (Feature == "reference-types")
	    return HasReferenceTypes;
	  return false;
	}

	/// Returns true when \p Name matches one of the entries of ValidCPUNames.
	bool WebAssemblyTargetInfo::isValidCPUName(StringRef Name) const {
	  for (const llvm::StringLiteral &Candidate : ValidCPUNames)
	    if (Candidate == Name)
	      return true;
	  return false;
	}

	/// Appends every entry of ValidCPUNames to \p Values, in table order.
	void WebAssemblyTargetInfo::fillValidCPUList(
	    SmallVectorImpl<StringRef> &Values) const {
	  for (const llvm::StringLiteral &CPUName : ValidCPUNames)
	    Values.push_back(CPUName);
	}

	/// Emits the macros shared by all WebAssembly targets: the "wasm" CPU macros
	/// followed by one __wasm_<feature>__ macro for each enabled feature.
	void WebAssemblyTargetInfo::getTargetDefines(const LangOptions &Opts,
	                                             MacroBuilder &Builder) const {
	  defineCPUMacros(Builder, "wasm", /*Tuning=*/false);
	  if (SIMDLevel >= SIMD128)
	    Builder.defineMacro("__wasm_simd128__");
	  if (SIMDLevel >= UnimplementedSIMD128)
	    Builder.defineMacro("__wasm_unimplemented_simd128__");
	  if (HasNontrappingFPToInt)
	    Builder.defineMacro("__wasm_nontrapping_fptoint__");
	  if (HasSignExt)
	    Builder.defineMacro("__wasm_sign_ext__");
	  if (HasExceptionHandling)
	    Builder.defineMacro("__wasm_exception_handling__");
	  if (HasBulkMemory)
	    Builder.defineMacro("__wasm_bulk_memory__");
	  if (HasAtomics)
	    Builder.defineMacro("__wasm_atomics__");
	  if (HasMutableGlobals)
	    Builder.defineMacro("__wasm_mutable_globals__");
	  if (HasMultivalue)
	    Builder.defineMacro("__wasm_multivalue__");
	  if (HasTailCall)
	    Builder.defineMacro("__wasm_tail_call__");
	  if (HasReferenceTypes)
	    Builder.defineMacro("__wasm_reference_types__");
	}

	// Updates the "simd128" / "unimplemented-simd128" entries of Features for a
	// SIMD level. Lines marked '-' are the old one-directional version; lines
	// marked '+' add the Enabled parameter.
	//
	// New behaviour when Enabled is true: enabling UnimplementedSIMD128 also
	// enables "simd128"; enabling SIMD128 sets only "simd128"; NoSIMD changes
	// nothing. When Enabled is false: NoSIMD or SIMD128 clears both entries,
	// while UnimplementedSIMD128 clears only "unimplemented-simd128".
	void WebAssemblyTargetInfo::setSIMDLevel(llvm::StringMap<bool> &Features,
	- SIMDEnum Level) {
	+ SIMDEnum Level, bool Enabled) {
	+ if (Enabled) {
	+ switch (Level) {
	+ case UnimplementedSIMD128:
	+ Features["unimplemented-simd128"] = true;
	+ LLVM_FALLTHROUGH;
	+ case SIMD128:
	+ Features["simd128"] = true;
	+ LLVM_FALLTHROUGH;
	+ case NoSIMD:
	+ break;
	+ }
	+ return;
	+ }
	+
	switch (Level) {
	- case UnimplementedSIMD128:
	- Features["unimplemented-simd128"] = true;
	- LLVM_FALLTHROUGH;
	+ case NoSIMD:
	case SIMD128:
	- Features["simd128"] = true;
	+ Features["simd128"] = false;
	LLVM_FALLTHROUGH;
	- case NoSIMD:
	+ case UnimplementedSIMD128:
	+ Features["unimplemented-simd128"] = false;
	break;
	}
	}

	// Added by this change. Sets one named feature in Features: the two SIMD
	// names are routed through setSIMDLevel(); every other name is written to
	// the map directly.
	+void WebAssemblyTargetInfo::setFeatureEnabled(llvm::StringMap<bool> &Features,
	+ StringRef Name,
	+ bool Enabled) const {
	+ if (Name == "simd128")
	+ setSIMDLevel(Features, SIMD128, Enabled);
	+ else if (Name == "unimplemented-simd128")
	+ setSIMDLevel(Features, UnimplementedSIMD128, Enabled);
	+ else
	+ Features[Name] = Enabled;
	+}
	+
	// Seeds Features for the given CPU and then defers to
	// TargetInfo::initFeatureMap. For "bleeding-edge" a fixed set of features
	// plus SIMD128 is switched on. The lines marked '-' (removed by this change)
	// used to copy the target's current SIMDLevel and Has* flags into the map as
	// well.
	bool WebAssemblyTargetInfo::initFeatureMap(
	llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
	const std::vector<std::string> &FeaturesVec) const {
	if (CPU == "bleeding-edge") {
	Features["nontrapping-fptoint"] = true;
	Features["sign-ext"] = true;
	Features["bulk-memory"] = true;
	Features["atomics"] = true;
	Features["mutable-globals"] = true;
	Features["tail-call"] = true;
	- setSIMDLevel(Features, SIMD128);
	+ setSIMDLevel(Features, SIMD128, true);
	}
	- // Other targets do not consider user-configured features here, but while we
	- // are actively developing new features it is useful to let user-configured
	- // features control availability of builtins
	- setSIMDLevel(Features, SIMDLevel);
	- if (HasNontrappingFPToInt)
	- Features["nontrapping-fptoint"] = true;
	- if (HasSignExt)
	- Features["sign-ext"] = true;
	- if (HasExceptionHandling)
	- Features["exception-handling"] = true;
	- if (HasBulkMemory)
	- Features["bulk-memory"] = true;
	- if (HasAtomics)
	- Features["atomics"] = true;
	- if (HasMutableGlobals)
	- Features["mutable-globals"] = true;
	- if (HasMultivalue)
	- Features["multivalue"] = true;
	- if (HasTailCall)
	- Features["tail-call"] = true;
	- if (HasReferenceTypes)
	- Features["reference-types"] = true;

	return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
	}

	bool WebAssemblyTargetInfo::handleTargetFeatures(
	std::vector<std::string> &Features, DiagnosticsEngine &Diags) {
	for (const auto &Feature : Features) {
	if (Feature == "+simd128") {
	SIMDLevel = std::max(SIMDLevel, SIMD128);
	continue;
	}
	if (Feature == "-simd128") {
	SIMDLevel = std::min(SIMDLevel, SIMDEnum(SIMD128 - 1));
	continue;
	}
	if (Feature == "+unimplemented-simd128") {
	SIMDLevel = std::max(SIMDLevel, SIMDEnum(UnimplementedSIMD128));
	continue;
	}
	if (Feature == "-unimplemented-simd128") {
	SIMDLevel = std::min(SIMDLevel, SIMDEnum(UnimplementedSIMD128 - 1));
	continue;
	}
	if (Feature == "+nontrapping-fptoint") {
	HasNontrappingFPToInt = true;
	continue;
	}
	if (Feature == "-nontrapping-fptoint") {
	HasNontrappingFPToInt = false;
	continue;
	}
	if (Feature == "+sign-ext") {
	HasSignExt = true;
	continue;
	}
	if (Feature == "-sign-ext") {
	HasSignExt = false;
	continue;
	}
	if (Feature == "+exception-handling") {
	HasExceptionHandling = true;
	continue;
	}
	if (Feature == "-exception-handling") {
	HasExceptionHandling = false;
	continue;
	}
	if (Feature == "+bulk-memory") {
	HasBulkMemory = true;
	continue;
	}
	if (Feature == "-bulk-memory") {
	HasBulkMemory = false;
	continue;
	}
	if (Feature == "+atomics") {
	HasAtomics = true;
	continue;
	}
	if (Feature == "-atomics") {
	HasAtomics = false;
	continue;
	}
	if (Feature == "+mutable-globals") {
	HasMutableGlobals = true;
	continue;
	}
	if (Feature == "-mutable-globals") {
	HasMutableGlobals = false;
	continue;
	}
	if (Feature == "+multivalue") {
	HasMultivalue = true;
	continue;
	}
	if (Feature == "-multivalue") {
	HasMultivalue = false;
	continue;
	}
	if (Feature == "+tail-call") {
	HasTailCall = true;
	continue;
	}
	if (Feature == "-tail-call") {
	HasTailCall = false;
	continue;
	}
	if (Feature == "+reference-types") {
	HasReferenceTypes = true;
	continue;
	}
	if (Feature == "-reference-types") {
	HasReferenceTypes = false;
	continue;
	}

	Diags.Report(diag::err_opt_not_valid_with_opt)
	<< Feature << "-target-feature";
	return false;
	}
	return true;
	}

	/// Returns the BuiltinInfo table as an ArrayRef whose length is the number
	/// of builtin IDs between Builtin::FirstTSBuiltin and
	/// clang::WebAssembly::LastTSBuiltin.
	ArrayRef<Builtin::Info> WebAssemblyTargetInfo::getTargetBuiltins() const {
	  const auto NumBuiltins =
	      clang::WebAssembly::LastTSBuiltin - Builtin::FirstTSBuiltin;
	  return llvm::makeArrayRef(BuiltinInfo, NumBuiltins);
	}

	/// Emits the common WebAssembly macros followed by the "wasm32" CPU macros.
	void WebAssembly32TargetInfo::getTargetDefines(const LangOptions &Opts,
	                                               MacroBuilder &Builder) const {
	  WebAssemblyTargetInfo::getTargetDefines(Opts, Builder);
	  defineCPUMacros(Builder, "wasm32", /*Tuning=*/false);
	}

	/// Emits the common WebAssembly macros followed by the "wasm64" CPU macros.
	void WebAssembly64TargetInfo::getTargetDefines(const LangOptions &Opts,
	                                               MacroBuilder &Builder) const {
	  WebAssemblyTargetInfo::getTargetDefines(Opts, Builder);
	  defineCPUMacros(Builder, "wasm64", /*Tuning=*/false);
	}
	diff --git a/contrib/llvm-project/clang/lib/Basic/Targets/WebAssembly.h b/contrib/llvm-project/clang/lib/Basic/Targets/WebAssembly.h
	index 77a2fe9ae117..0068ccb5d71f 100644
	--- a/contrib/llvm-project/clang/lib/Basic/Targets/WebAssembly.h
	+++ b/contrib/llvm-project/clang/lib/Basic/Targets/WebAssembly.h
	@@ -1,173 +1,177 @@
	//=== WebAssembly.h - Declare WebAssembly target feature support - C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file declares WebAssembly TargetInfo objects.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CLANG_LIB_BASIC_TARGETS_WEBASSEMBLY_H
	#define LLVM_CLANG_LIB_BASIC_TARGETS_WEBASSEMBLY_H

	#include "clang/Basic/TargetInfo.h"
	#include "clang/Basic/TargetOptions.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/Support/Compiler.h"

	namespace clang {
	namespace targets {

	// Common TargetInfo base for the WebAssembly targets; WebAssembly32TargetInfo
	// and WebAssembly64TargetInfo derive from it. Lines marked '-'/'+' are the
	// removed/added sides of the change recorded in this diff.
	class LLVM_LIBRARY_VISIBILITY WebAssemblyTargetInfo : public TargetInfo {
	static const Builtin::Info BuiltinInfo[];

	// SIMD support level. The enumerators are ordered: other code compares
	// SIMDLevel with >= and computes "one level below" as (Level - 1).
	enum SIMDEnum {
	NoSIMD,
	SIMD128,
	UnimplementedSIMD128,
	} SIMDLevel = NoSIMD;

	// Per-feature flags, all off by default.
	bool HasNontrappingFPToInt = false;
	bool HasSignExt = false;
	bool HasExceptionHandling = false;
	bool HasBulkMemory = false;
	bool HasAtomics = false;
	bool HasMutableGlobals = false;
	bool HasMultivalue = false;
	bool HasTailCall = false;
	bool HasReferenceTypes = false;

	// ABI name as accepted by setABI(); empty until one is set.
	std::string ABI;

	public:
	explicit WebAssemblyTargetInfo(const llvm::Triple &T, const TargetOptions &)
	: TargetInfo(T) {
	NoAsmVariants = true;
	SuitableAlign = 128;
	LargeArrayMinWidth = 128;
	LargeArrayAlign = 128;
	SimdDefaultAlign = 128;
	SigAtomicType = SignedLong;
	LongDoubleWidth = LongDoubleAlign = 128;
	LongDoubleFormat = &llvm::APFloat::IEEEquad();
	MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
	// size_t being unsigned long for both wasm32 and wasm64 makes mangled names
	// more consistent between the two.
	SizeType = UnsignedLong;
	PtrDiffType = SignedLong;
	IntPtrType = SignedLong;
	}

	StringRef getABI() const override;
	bool setABI(const std::string &Name) override;

	protected:
	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override;

	private:
	// The '+' side adds an Enabled parameter so a level can be switched off as
	// well as on.
	- static void setSIMDLevel(llvm::StringMap<bool> &Features, SIMDEnum Level);
	+ static void setSIMDLevel(llvm::StringMap<bool> &Features, SIMDEnum Level,
	+ bool Enabled);

	bool
	initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
	StringRef CPU,
	const std::vector<std::string> &FeaturesVec) const override;
	bool hasFeature(StringRef Feature) const final;

	+ void setFeatureEnabled(llvm::StringMap<bool> &Features, StringRef Name,
	+ bool Enabled) const final;
	+
	bool handleTargetFeatures(std::vector<std::string> &Features,
	DiagnosticsEngine &Diags) final;

	bool isValidCPUName(StringRef Name) const final;
	void fillValidCPUList(SmallVectorImpl<StringRef> &Values) const final;

	// Setting a CPU stores nothing here; it only validates the name.
	bool setCPU(const std::string &Name) final { return isValidCPUName(Name); }

	ArrayRef<Builtin::Info> getTargetBuiltins() const final;

	BuiltinVaListKind getBuiltinVaListKind() const final {
	return VoidPtrBuiltinVaList;
	}

	// No GCC register names or aliases are exposed for this target.
	ArrayRef<const char *> getGCCRegNames() const final { return None; }

	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const final {
	return None;
	}

	// Every inline-asm constraint is rejected.
	bool validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &Info) const final {
	return false;
	}

	const char *getClobbers() const final { return ""; }

	bool isCLZForZeroUndef() const final { return false; }

	bool hasInt128Type() const final { return true; }

	IntType getIntTypeByWidth(unsigned BitWidth, bool IsSigned) const final {
	// WebAssembly prefers long long for explicitly 64-bit integers.
	return BitWidth == 64 ? (IsSigned ? SignedLongLong : UnsignedLongLong)
	: TargetInfo::getIntTypeByWidth(BitWidth, IsSigned);
	}

	IntType getLeastIntTypeByWidth(unsigned BitWidth, bool IsSigned) const final {
	// WebAssembly uses long long for int_least64_t and int_fast64_t.
	return BitWidth == 64
	? (IsSigned ? SignedLongLong : UnsignedLongLong)
	: TargetInfo::getLeastIntTypeByWidth(BitWidth, IsSigned);
	}

	// CC_C and CC_Swift are reported as OK; everything else gets a warning.
	CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	switch (CC) {
	case CC_C:
	case CC_Swift:
	return CCCR_OK;
	default:
	return CCCR_Warning;
	}
	}

	bool hasExtIntType() const override { return true; }

	bool hasProtectedVisibility() const override { return false; }
	};

	/// 32-bit WebAssembly target: the shared base plus a data layout with
	/// 32-bit pointers.
	class LLVM_LIBRARY_VISIBILITY WebAssembly32TargetInfo
	    : public WebAssemblyTargetInfo {
	public:
	  explicit WebAssembly32TargetInfo(const llvm::Triple &T,
	                                   const TargetOptions &Opts)
	      : WebAssemblyTargetInfo(T, Opts) {
	    resetDataLayout("e-m:e-p:32:32-i64:64-n32:64-S128");
	  }

	protected:
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override;
	};

	/// 64-bit WebAssembly target: widens long and pointers to 64 bits and
	/// installs a data layout with 64-bit pointers.
	class LLVM_LIBRARY_VISIBILITY WebAssembly64TargetInfo
	    : public WebAssemblyTargetInfo {
	public:
	  explicit WebAssembly64TargetInfo(const llvm::Triple &T,
	                                   const TargetOptions &Opts)
	      : WebAssemblyTargetInfo(T, Opts) {
	    LongWidth = 64;
	    LongAlign = 64;
	    PointerWidth = 64;
	    PointerAlign = 64;

	    SizeType = UnsignedLong;
	    PtrDiffType = SignedLong;
	    IntPtrType = SignedLong;

	    resetDataLayout("e-m:e-p:64:64-i64:64-n32:64-S128");
	  }

	protected:
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override;
	};
	} // namespace targets
	} // namespace clang
	#endif // LLVM_CLANG_LIB_BASIC_TARGETS_WEBASSEMBLY_H
	diff --git a/contrib/llvm-project/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/contrib/llvm-project/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
	index ac6ec742335c..1f79b33772f3 100644
	--- a/contrib/llvm-project/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
	+++ b/contrib/llvm-project/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
	@@ -1,5236 +1,5232 @@
	//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This provides a class for OpenMP runtime code generation specialized to NVPTX
	// targets.
	//
	//===----------------------------------------------------------------------===//

	#include "CGOpenMPRuntimeNVPTX.h"
	#include "CodeGenFunction.h"
	#include "clang/AST/Attr.h"
	#include "clang/AST/DeclOpenMP.h"
	#include "clang/AST/StmtOpenMP.h"
	#include "clang/AST/StmtVisitor.h"
	#include "clang/Basic/Cuda.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/IR/IntrinsicsNVPTX.h"

	using namespace clang;
	using namespace CodeGen;
	using namespace llvm::omp;

	namespace {
	/// Identifiers for the runtime entry points named in the comments below;
	/// each enumerator's comment gives the C prototype it stands for.
	enum OpenMPRTLFunctionNVPTX {
	/// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
	/// int16_t RequiresOMPRuntime);
	OMPRTL_NVPTX__kmpc_kernel_init,
	/// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
	OMPRTL_NVPTX__kmpc_kernel_deinit,
	/// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
	/// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
	OMPRTL_NVPTX__kmpc_spmd_kernel_init,
	/// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
	OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
	/// Call to void __kmpc_kernel_prepare_parallel(void
	/// *outlined_function);
	OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
	/// Call to bool __kmpc_kernel_parallel(void **outlined_function);
	OMPRTL_NVPTX__kmpc_kernel_parallel,
	/// Call to void __kmpc_kernel_end_parallel();
	OMPRTL_NVPTX__kmpc_kernel_end_parallel,
	/// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
	/// global_tid);
	OMPRTL_NVPTX__kmpc_serialized_parallel,
	/// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
	/// global_tid);
	OMPRTL_NVPTX__kmpc_end_serialized_parallel,
	/// Call to int32_t __kmpc_shuffle_int32(int32_t element,
	/// int16_t lane_offset, int16_t warp_size);
	OMPRTL_NVPTX__kmpc_shuffle_int32,
	/// Call to int64_t __kmpc_shuffle_int64(int64_t element,
	/// int16_t lane_offset, int16_t warp_size);
	OMPRTL_NVPTX__kmpc_shuffle_int64,
	/// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32
	/// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
	/// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
	/// lane_offset, int16_t shortCircuit),
	/// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
	OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2,
	/// Call to __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
	/// global_tid, void *global_buffer, int32_t num_of_records, void*
	/// reduce_data,
	/// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
	/// lane_offset, int16_t shortCircuit),
	/// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
	/// (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
	/// void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
	/// void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
	/// int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
	/// *buffer, int idx, void *reduce_data));
	OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2,
	/// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
	OMPRTL_NVPTX__kmpc_end_reduce_nowait,
	/// Call to void __kmpc_data_sharing_init_stack();
	OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
	/// Call to void __kmpc_data_sharing_init_stack_spmd();
	OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
	/// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
	/// int16_t UseSharedMemory);
	OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
	/// Call to void* __kmpc_data_sharing_push_stack(size_t size, int16_t
	/// UseSharedMemory);
	OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
	/// Call to void __kmpc_data_sharing_pop_stack(void *a);
	OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
	/// Call to void __kmpc_begin_sharing_variables(void ***args,
	/// size_t n_args);
	OMPRTL_NVPTX__kmpc_begin_sharing_variables,
	/// Call to void __kmpc_end_sharing_variables();
	OMPRTL_NVPTX__kmpc_end_sharing_variables,
	/// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
	OMPRTL_NVPTX__kmpc_get_shared_variables,
	/// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
	/// global_tid);
	OMPRTL_NVPTX__kmpc_parallel_level,
	/// Call to int8_t __kmpc_is_spmd_exec_mode();
	OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
	/// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
	/// const void *buf, size_t size, int16_t is_shared, const void **res);
	OMPRTL_NVPTX__kmpc_get_team_static_memory,
	/// Call to void __kmpc_restore_team_static_memory(int16_t
	/// isSPMDExecutionMode, int16_t is_shared);
	OMPRTL_NVPTX__kmpc_restore_team_static_memory,
	/// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
	OMPRTL__kmpc_barrier,
	/// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
	/// global_tid);
	OMPRTL__kmpc_barrier_simple_spmd,
	/// Call to int32_t __kmpc_warp_active_thread_mask(void);
	OMPRTL_NVPTX__kmpc_warp_active_thread_mask,
	/// Call to void __kmpc_syncwarp(int32_t Mask);
	OMPRTL_NVPTX__kmpc_syncwarp,
	};

	/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
	class NVPTXActionTy final : public PrePostActionTy {
	llvm::FunctionCallee EnterCallee = nullptr;
	ArrayRef<llvm::Value *> EnterArgs;
	llvm::FunctionCallee ExitCallee = nullptr;
	ArrayRef<llvm::Value *> ExitArgs;
	bool Conditional = false;
	llvm::BasicBlock *ContBlock = nullptr;

	public:
	NVPTXActionTy(llvm::FunctionCallee EnterCallee,
	ArrayRef<llvm::Value *> EnterArgs,
	llvm::FunctionCallee ExitCallee,
	ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
	: EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
	ExitArgs(ExitArgs), Conditional(Conditional) {}
	void Enter(CodeGenFunction &CGF) override {
	llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
	if (Conditional) {
	llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
	auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
	ContBlock = CGF.createBasicBlock("omp_if.end");
	// Generate the branch (If-stmt)
	CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
	CGF.EmitBlock(ThenBlock);
	}
	}
	void Done(CodeGenFunction &CGF) {
	// Emit the rest of blocks/branches
	CGF.EmitBranch(ContBlock);
	CGF.EmitBlock(ContBlock, true);
	}
	void Exit(CodeGenFunction &CGF) override {
	CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
	}
	};

	/// A class to track the execution mode when codegening directives within
	/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
	/// to the target region and used by containing directives such as 'parallel'
	/// to emit optimized code.
	///
	/// The constructor overwrites the referenced mode (and, for SPMD, the
	/// referenced runtime-mode flag); the destructor writes the previous values
	/// back. The object is non-copyable: a copy would write the saved values back
	/// a second time when it is destroyed.
	class ExecutionRuntimeModesRAII {
	private:
	  /// Execution mode that was active before this object was created.
	  CGOpenMPRuntimeNVPTX::ExecutionMode SavedExecMode =
	      CGOpenMPRuntimeNVPTX::EM_Unknown;
	  /// The mode variable being overridden for the lifetime of this object.
	  CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode;
	  /// Runtime-mode flag value that was active before this object was created.
	  bool SavedRuntimeMode = false;
	  /// The runtime-mode flag being overridden, or null for the Non-SPMD
	  /// constructor, which leaves the flag untouched.
	  bool *RuntimeMode = nullptr;

	public:
	  /// Constructor for Non-SPMD mode.
	  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode)
	      : ExecMode(ExecMode) {
	    SavedExecMode = ExecMode;
	    ExecMode = CGOpenMPRuntimeNVPTX::EM_NonSPMD;
	  }

	  /// Constructor for SPMD mode.
	  ///
	  /// \param ExecMode mode variable to switch to EM_SPMD.
	  /// \param RuntimeMode flag to overwrite with \p FullRuntimeMode.
	  /// \param FullRuntimeMode value stored into \p RuntimeMode.
	  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode,
	                            bool &RuntimeMode, bool FullRuntimeMode)
	      : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
	    // Inside this body the names ExecMode and RuntimeMode denote the
	    // reference parameters (they shadow the members), so the reads and writes
	    // below go straight to the caller's variables.
	    SavedExecMode = ExecMode;
	    SavedRuntimeMode = RuntimeMode;
	    ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD;
	    RuntimeMode = FullRuntimeMode;
	  }

	  ExecutionRuntimeModesRAII(const ExecutionRuntimeModesRAII &) = delete;
	  ExecutionRuntimeModesRAII &
	  operator=(const ExecutionRuntimeModesRAII &) = delete;

	  /// Restores the values captured by the constructor.
	  ~ExecutionRuntimeModesRAII() {
	    ExecMode = SavedExecMode;
	    if (RuntimeMode)
	      *RuntimeMode = SavedRuntimeMode;
	  }
	};

	/// GPU Configuration: This information can be derived from cuda registers,
	/// however, providing compile time constants helps generate more efficient
	/// code. For all practical purposes this is fine because the configuration
	/// is the same for all known NVPTX architectures.
	enum MachineConfiguration : unsigned {
	  /// Number of bits required to represent a lane identifier, i.e.
	  /// log_2(WarpSize).
	  LaneIDBits = 5,

	  /// Warp size, kept consistent with LaneIDBits by construction.
	  WarpSize = 1u << LaneIDBits,

	  /// Bit mask selecting the lane identifier bits.
	  LaneIDMask = WarpSize - 1,

	  /// Global memory alignment for performance.
	  GlobalMemoryAlignment = 128,

	  /// Maximal size of the shared memory buffer.
	  SharedMemorySize = 128,
	};

	/// Returns the canonical declaration referenced by \p RefExpr.
	///
	/// Parentheses, implicit casts, array subscripts and OpenMP array sections
	/// are stripped first, so that for an expression such as a subscript or a
	/// section the declaration of the underlying base is returned. The remaining
	/// expression must be either a DeclRefExpr or a MemberExpr (enforced by
	/// \c cast).
	static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
	  RefExpr = RefExpr->IgnoreParens();
	  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
	    // Peel nested subscripts down to the base expression.
	    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
	    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
	      Base = TempASE->getBase()->IgnoreParenImpCasts();
	    RefExpr = Base;
	  } else if (const auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
	    // Peel nested array sections first, then any subscripts beneath them.
	    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
	    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
	      Base = TempOASE->getBase()->IgnoreParenImpCasts();
	    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
	      Base = TempASE->getBase()->IgnoreParenImpCasts();
	    RefExpr = Base;
	  }
	  RefExpr = RefExpr->IgnoreParenImpCasts();
	  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
	    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
	  const auto *ME = cast<MemberExpr>(RefExpr);
	  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
	}


	/// Builds the implicit record "_globalized_locals_ty" that provides storage
	/// for the given escaped declarations.
	///
	/// \param C AST context used to create the record and its fields.
	/// \param EscapedDecls declarations that get an array field of \p BufSize
	/// elements, aligned to at least GlobalMemoryAlignment.
	/// \param EscapedDeclsForTeams declarations that get a single (non-array)
	/// field; their AlignedAttr attributes are copied to the field.
	/// \param MappedDeclsFields out-parameter; receives the declaration -> field
	/// mapping for every field that is created.
	/// \param BufSize number of elements of the array fields.
	/// \returns the completed record, or nullptr if both lists are empty.
	static RecordDecl *buildRecordForGlobalizedVars(
	    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
	    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
	    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
	        &MappedDeclsFields,
	    int BufSize) {
	  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
	  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
	    return nullptr;

	  // Collect (alignment, decl) pairs and order them by decreasing alignment.
	  // The sort is stable, so declarations with equal alignment keep their
	  // relative order.
	  SmallVector<VarsDataTy, 4> GlobalizedVars;
	  for (const ValueDecl *D : EscapedDecls)
	    GlobalizedVars.emplace_back(
	        CharUnits::fromQuantity(std::max(
	            C.getDeclAlign(D).getQuantity(),
	            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
	        D);
	  for (const ValueDecl *D : EscapedDeclsForTeams)
	    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
	  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
	    return L.first > R.first;
	  });

	  // Build struct _globalized_locals_ty {
	  //         /*  globalized vars  */[BufSize] align (max(decl_align,
	  //         GlobalMemoryAlignment))
	  //         /*  globalized vars  */ for EscapedDeclsForTeams
	  //       };
	  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
	  GlobalizedRD->startDefinition();
	  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
	      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
	  for (const auto &Pair : GlobalizedVars) {
	    const ValueDecl *VD = Pair.second;
	    // References are stored as pointers to the referenced type.
	    QualType Type = VD->getType();
	    if (Type->isLValueReferenceType())
	      Type = C.getPointerType(Type.getNonReferenceType());
	    else
	      Type = Type.getNonReferenceType();
	    SourceLocation Loc = VD->getLocation();

	    // Declarations from EscapedDeclsForTeams get one slot, all others get an
	    // array of BufSize slots.
	    const bool IsSingleEscaped = SingleEscaped.count(VD);
	    if (!IsSingleEscaped) {
	      llvm::APInt ArraySize(32, BufSize);
	      Type = C.getConstantArrayType(Type, ArraySize, nullptr,
	                                    ArrayType::Normal, 0);
	    }
	    FieldDecl *Field = FieldDecl::Create(
	        C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
	        C.getTrivialTypeSourceInfo(Type, SourceLocation()),
	        /*BW=*/nullptr, /*Mutable=*/false,
	        /*InitStyle=*/ICIS_NoInit);
	    Field->setAccess(AS_public);

	    if (IsSingleEscaped) {
	      // Keep the alignment requested on the original declaration.
	      if (VD->hasAttrs()) {
	        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
	             E(VD->getAttrs().end());
	             I != E; ++I)
	          Field->addAttr(*I);
	      }
	    } else {
	      // Align the array to max(decl alignment, GlobalMemoryAlignment).
	      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
	                                     static_cast<CharUnits::QuantityType>(
	                                         GlobalMemoryAlignment)));
	      Field->addAttr(AlignedAttr::CreateImplicit(
	          C, /*IsAlignmentExpr=*/true,
	          IntegerLiteral::Create(C, Align,
	                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
	                                 SourceLocation()),
	          {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
	    }
	    GlobalizedRD->addDecl(Field);
	    MappedDeclsFields.try_emplace(VD, Field);
	  }
	  GlobalizedRD->completeDefinition();
	  return GlobalizedRD;
	}

	/// Get the list of variables that can escape their declaration context.
	///
	/// The visitor walks a statement and records declarations that are captured
	/// by reference, have their address taken, decay from array to pointer, are
	/// passed as lvalue call arguments or are bound to lvalue references. The
	/// collected declarations can then be laid out in a record via
	/// getGlobalizedRecord().
	class CheckVarsEscapingDeclContext final
	    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
	  CodeGenFunction &CGF;
	  /// Escaped declarations without variably modified type.
	  llvm::SetVector<const ValueDecl *> EscapedDecls;
	  /// Escaped declarations with variably modified type.
	  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
	  /// Escaped declarations whose capture field is not of reference type.
	  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
	  /// Lazily built record holding the escaped declarations.
	  RecordDecl *GlobalizedRD = nullptr;
	  /// Maps an escaped declaration to its field in GlobalizedRD.
	  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
	  /// True while visiting a sub-expression in which every referenced
	  /// declaration must be treated as escaped.
	  bool AllEscaped = false;
	  /// True while handling a capture that is (first|last)private in a combined
	  /// parallel region.
	  bool IsForCombinedParallelRegion = false;

	  /// Records \p VD as escaped unless one of the exclusion rules below applies.
	  void markAsEscaped(const ValueDecl *VD) {
	    // Do not globalize declare target variables.
	    if (!isa<VarDecl>(VD) ||
	        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
	      return;
	    VD = cast<ValueDecl>(VD->getCanonicalDecl());
	    // Use user-specified allocation.
	    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
	      return;
	    // Variables captured by value must be globalized.
	    if (auto *CSI = CGF.CapturedStmtInfo) {
	      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
	        // Check if need to capture the variable that was already captured by
	        // value in the outer region.
	        if (!IsForCombinedParallelRegion) {
	          if (!FD->hasAttrs())
	            return;
	          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
	          if (!Attr)
	            return;
	          if (((Attr->getCaptureKind() != OMPC_map) &&
	               !isOpenMPPrivate(Attr->getCaptureKind())) ||
	              ((Attr->getCaptureKind() == OMPC_map) &&
	               !FD->getType()->isAnyPointerType()))
	            return;
	        }
	        if (!FD->getType()->isReferenceType()) {
	          assert(!VD->getType()->isVariablyModifiedType() &&
	                 "Parameter captured by value with variably modified type");
	          EscapedParameters.insert(VD);
	        } else if (!IsForCombinedParallelRegion) {
	          return;
	        }
	      }
	    }
	    // Do not globalize variables with reference type. This applies outside
	    // of any captured region and for combined parallel regions.
	    if ((!CGF.CapturedStmtInfo || IsForCombinedParallelRegion) &&
	        VD->getType()->isReferenceType())
	      return;
	    if (VD->getType()->isVariablyModifiedType())
	      EscapedVariableLengthDecls.insert(VD);
	    else
	      EscapedDecls.insert(VD);
	  }

	  /// Handles a declaration: lvalue references are marked as escaped, and the
	  /// initializer of a non-parameter variable is visited. While visiting the
	  /// initializer of an lvalue reference everything it refers to escapes.
	  void VisitValueDecl(const ValueDecl *VD) {
	    if (VD->getType()->isLValueReferenceType())
	      markAsEscaped(VD);
	    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
	      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
	        const bool SavedAllEscaped = AllEscaped;
	        AllEscaped = VD->getType()->isLValueReferenceType();
	        Visit(VarD->getInit());
	        AllEscaped = SavedAllEscaped;
	      }
	    }
	  }

	  /// Marks every variable that \p S captures other than by copy as escaped.
	  ///
	  /// \param Clauses clauses of the directive owning \p S.
	  /// \param IsCombinedParallelRegion if true, IsForCombinedParallelRegion is
	  /// set for a capture exactly when the variable appears in a firstprivate
	  /// or lastprivate clause.
	  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
	                               ArrayRef<OMPClause *> Clauses,
	                               bool IsCombinedParallelRegion) {
	    if (!S)
	      return;
	    for (const CapturedStmt::Capture &C : S->captures()) {
	      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
	        const ValueDecl *VD = C.getCapturedVar();
	        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
	        if (IsCombinedParallelRegion) {
	          // Check if the variable is privatized in the combined construct and
	          // those private copies must be shared in the inner parallel
	          // directive.
	          IsForCombinedParallelRegion = false;
	          for (const OMPClause *Clause : Clauses) {
	            if (!isOpenMPPrivate(Clause->getClauseKind()) ||
	                Clause->getClauseKind() == OMPC_reduction ||
	                Clause->getClauseKind() == OMPC_linear ||
	                Clause->getClauseKind() == OMPC_private)
	              continue;
	            ArrayRef<const Expr *> Vars;
	            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(Clause))
	              Vars = PC->getVarRefs();
	            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(Clause))
	              Vars = PC->getVarRefs();
	            else
	              llvm_unreachable("Unexpected clause.");
	            for (const auto *E : Vars) {
	              const Decl *D =
	                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
	              if (D == VD->getCanonicalDecl()) {
	                IsForCombinedParallelRegion = true;
	                break;
	              }
	            }
	            if (IsForCombinedParallelRegion)
	              break;
	          }
	        }
	        markAsEscaped(VD);
	        if (isa<OMPCapturedExprDecl>(VD))
	          VisitValueDecl(VD);
	        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
	      }
	    }
	  }

	  /// Builds GlobalizedRD from EscapedDecls. With \p IsInTTDRegion the
	  /// declarations are passed as the "for teams" list (single fields),
	  /// otherwise as the regular list (arrays of WarpSize elements).
	  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
	    assert(!GlobalizedRD &&
	           "Record for globalized variables is built already.");
	    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
	    if (IsInTTDRegion)
	      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
	    else
	      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
	    GlobalizedRD = ::buildRecordForGlobalizedVars(
	        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
	        MappedDeclsFields, WarpSize);
	  }

	public:
	  /// \param TeamsReductions declarations that are treated as escaped from
	  /// the start.
	  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
	                               ArrayRef<const ValueDecl *> TeamsReductions)
	      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
	  }
	  virtual ~CheckVarsEscapingDeclContext() = default;

	  void VisitDeclStmt(const DeclStmt *S) {
	    if (!S)
	      return;
	    for (const Decl *D : S->decls())
	      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
	        VisitValueDecl(VD);
	  }

	  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
	    if (!D)
	      return;
	    if (!D->hasAssociatedStmt())
	      return;
	    if (const auto *S =
	            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
	      // Do not analyze directives that do not actually require capturing,
	      // like `omp for` or `omp simd` directives.
	      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
	      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
	      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
	        VisitStmt(S->getCapturedStmt());
	        return;
	      }
	      VisitOpenMPCapturedStmt(
	          S, D->clauses(),
	          CaptureRegions.back() == OMPD_parallel &&
	              isOpenMPDistributeDirective(D->getDirectiveKind()));
	    }
	  }

	  void VisitCapturedStmt(const CapturedStmt *S) {
	    if (!S)
	      return;
	    for (const CapturedStmt::Capture &C : S->captures()) {
	      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
	        const ValueDecl *VD = C.getCapturedVar();
	        markAsEscaped(VD);
	        if (isa<OMPCapturedExprDecl>(VD))
	          VisitValueDecl(VD);
	      }
	    }
	  }

	  /// Variables captured by reference in a lambda escape.
	  void VisitLambdaExpr(const LambdaExpr *E) {
	    if (!E)
	      return;
	    for (const LambdaCapture &C : E->captures()) {
	      if (C.capturesVariable()) {
	        if (C.getCaptureKind() == LCK_ByRef) {
	          const ValueDecl *VD = C.getCapturedVar();
	          markAsEscaped(VD);
	          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
	            VisitValueDecl(VD);
	        }
	      }
	    }
	  }

	  /// Variables captured by reference in a block escape.
	  void VisitBlockExpr(const BlockExpr *E) {
	    if (!E)
	      return;
	    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
	      if (C.isByRef()) {
	        const VarDecl *VD = C.getVariable();
	        markAsEscaped(VD);
	        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
	          VisitValueDecl(VD);
	      }
	    }
	  }

	  /// Everything referenced by an lvalue call argument escapes.
	  void VisitCallExpr(const CallExpr *E) {
	    if (!E)
	      return;
	    for (const Expr *Arg : E->arguments()) {
	      if (!Arg)
	        continue;
	      if (Arg->isLValue()) {
	        const bool SavedAllEscaped = AllEscaped;
	        AllEscaped = true;
	        Visit(Arg);
	        AllEscaped = SavedAllEscaped;
	      } else {
	        Visit(Arg);
	      }
	    }
	    Visit(E->getCallee());
	  }

	  void VisitDeclRefExpr(const DeclRefExpr *E) {
	    if (!E)
	      return;
	    const ValueDecl *VD = E->getDecl();
	    if (AllEscaped)
	      markAsEscaped(VD);
	    if (isa<OMPCapturedExprDecl>(VD))
	      VisitValueDecl(VD);
	    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
	      if (VarD->isInitCapture())
	        VisitValueDecl(VD);
	  }

	  /// Everything referenced by the operand of an address-of operator escapes.
	  void VisitUnaryOperator(const UnaryOperator *E) {
	    if (!E)
	      return;
	    if (E->getOpcode() == UO_AddrOf) {
	      const bool SavedAllEscaped = AllEscaped;
	      AllEscaped = true;
	      Visit(E->getSubExpr());
	      AllEscaped = SavedAllEscaped;
	    } else {
	      Visit(E->getSubExpr());
	    }
	  }

	  /// Everything referenced by an array that decays to a pointer escapes.
	  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
	    if (!E)
	      return;
	    if (E->getCastKind() == CK_ArrayToPointerDecay) {
	      const bool SavedAllEscaped = AllEscaped;
	      AllEscaped = true;
	      Visit(E->getSubExpr());
	      AllEscaped = SavedAllEscaped;
	    } else {
	      Visit(E->getSubExpr());
	    }
	  }

	  /// Generic expression: the "all escaped" state is dropped for the children
	  /// of a non-lvalue expression and restored afterwards.
	  void VisitExpr(const Expr *E) {
	    if (!E)
	      return;
	    bool SavedAllEscaped = AllEscaped;
	    if (!E->isLValue())
	      AllEscaped = false;
	    for (const Stmt *Child : E->children())
	      if (Child)
	        Visit(Child);
	    AllEscaped = SavedAllEscaped;
	  }

	  void VisitStmt(const Stmt *S) {
	    if (!S)
	      return;
	    for (const Stmt *Child : S->children())
	      if (Child)
	        Visit(Child);
	  }

	  /// Returns the record that handles all the escaped local variables and used
	  /// instead of their original storage. The record is built on first use;
	  /// \p IsInTTDRegion only has an effect on that first call.
	  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
	    if (!GlobalizedRD)
	      buildRecordForGlobalizedVars(IsInTTDRegion);
	    return GlobalizedRD;
	  }

	  /// Returns the field in the globalized record for the escaped variable, or
	  /// nullptr if \p VD has no field. The record must have been built already.
	  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
	    assert(GlobalizedRD &&
	           "Record for globalized variables must be generated already.");
	    auto I = MappedDeclsFields.find(VD);
	    if (I == MappedDeclsFields.end())
	      return nullptr;
	    return I->getSecond();
	  }

	  /// Returns the list of the escaped local variables/parameters.
	  ArrayRef<const ValueDecl *> getEscapedDecls() const {
	    return EscapedDecls.getArrayRef();
	  }

	  /// Returns the set of escaped declarations whose capture field is not of
	  /// reference type, i.e. parameters passed by value.
	  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
	    return EscapedParameters;
	  }

	  /// Returns the list of the escaped variables with the variably modified
	  /// types.
	  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
	    return EscapedVariableLengthDecls.getArrayRef();
	  }
	};
	} // anonymous namespace

	/// Get the GPU warp size.
	/// Emits a call to the nvvm_read_ptx_sreg_warpsize intrinsic and returns its
	/// result, named "nvptx_warp_size".
	static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
	  llvm::Module &M = CGF.CGM.getModule();
	  llvm::Function *ReadWarpSize = llvm::Intrinsic::getDeclaration(
	      &M, llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize);
	  return CGF.EmitRuntimeCall(ReadWarpSize, "nvptx_warp_size");
	}

	/// Get the id of the current thread on the GPU.
	/// Emits a call to the nvvm_read_ptx_sreg_tid_x intrinsic and returns its
	/// result, named "nvptx_tid".
	static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
	  llvm::Module &M = CGF.CGM.getModule();
	  llvm::Function *ReadTidX = llvm::Intrinsic::getDeclaration(
	      &M, llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x);
	  return CGF.EmitRuntimeCall(ReadTidX, "nvptx_tid");
	}

	/// Get the id of the warp in the block.
	/// We assume that the warp size is 32, which is always the case
	/// on the NVPTX device, to generate more efficient code: the warp id is the
	/// thread id shifted right by LaneIDBits.
	static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
	  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
	  return CGF.Builder.CreateAShr(ThreadID, LaneIDBits, "nvptx_warp_id");
	}

	/// Get the id of the current lane in the Warp.
	/// We assume that the warp size is 32, which is always the case
	/// on the NVPTX device, to generate more efficient code: the lane id is the
	/// thread id masked with LaneIDMask.
	static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
	  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
	  llvm::Value *LaneMask = CGF.Builder.getInt32(LaneIDMask);
	  return CGF.Builder.CreateAnd(ThreadID, LaneMask, "nvptx_lane_id");
	}

	/// Get the maximum number of threads in a block of the GPU.
	/// Emits a call to the nvvm_read_ptx_sreg_ntid_x intrinsic and returns its
	/// result, named "nvptx_num_threads".
	static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
	  llvm::Module &M = CGF.CGM.getModule();
	  llvm::Function *ReadNTidX = llvm::Intrinsic::getDeclaration(
	      &M, llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
	  return CGF.EmitRuntimeCall(ReadNTidX, "nvptx_num_threads");
	}

	/// Get the value of the thread_limit clause in the teams directive.
	/// For the 'generic' execution mode, the runtime encodes thread_limit in
	/// the launch parameters, always starting thread_limit+warpSize threads per
	/// CTA. The threads in the last warp are reserved for master execution.
	/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
	///
	/// \returns the number of threads in the block in SPMD mode, otherwise the
	/// number of threads minus the warp size (named "thread_limit").
	static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
	                                   bool IsInSPMDExecutionMode = false) {
	  if (IsInSPMDExecutionMode)
	    return getNVPTXNumThreads(CGF);

	  // Both reads are deliberately kept as direct call arguments, exactly as
	  // before, so the relative order in which they are emitted is unchanged.
	  return CGF.Builder.CreateNUWSub(getNVPTXNumThreads(CGF),
	                                  getNVPTXWarpSize(CGF), "thread_limit");
	}

	/// Get the thread id of the OMP master thread.
	/// The master thread id is the first thread (lane) of the last warp in the
	/// GPU block. Warp size is assumed to be some power of 2.
	/// Thread id is 0 indexed.
	/// E.g: If NumThreads is 33, master id is 32.
	///      If NumThreads is 64, master id is 32.
	///      If NumThreads is 1024, master id is 992.
	///
	/// Computed as (NumThreads - 1) & ~(WarpSize - 1).
	static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
	  CGBuilderTy &Builder = CGF.Builder;
	  llvm::Value *One = Builder.getInt32(1);
	  llvm::Value *ThreadCount = getNVPTXNumThreads(CGF);

	  // We assume that the warp size is a power of 2, so WarpSize - 1 is a mask
	  // of the lane bits.
	  llvm::Value *LaneBits = Builder.CreateNUWSub(getNVPTXWarpSize(CGF), One);

	  // Clear the lane bits of the last thread id to get the first lane of the
	  // last warp.
	  return Builder.CreateAnd(Builder.CreateNUWSub(ThreadCount, One),
	                           Builder.CreateNot(LaneBits), "master_tid");
	}

	/// Sets up the state for a worker function: records \p Loc, arranges the
	/// function info of a nullary function and creates the worker function in
	/// the module of \p CGM.
	CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
	CodeGenModule &CGM, SourceLocation Loc)
	: WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
	Loc(Loc) {
	// createWorkerFunction() reads CGFI and assigns WorkerFn.
	createWorkerFunction(CGM);
	}

	/// Creates the worker function in the module of \p CGM and stores it in
	/// WorkerFn. The function takes no arguments, has internal linkage, gets the
	/// internal function attributes for CGFI and is marked as non-recursive.
	void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
	    CodeGenModule &CGM) {
	  // Create a worker function with no arguments.

	  WorkerFn = llvm::Function::Create(
	      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
	      /*placeholder=*/"_worker", &CGM.getModule());
	  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
	  WorkerFn->setDoesNotRecurse();
	}

	/// Returns the execution mode currently stored in CurrentExecutionMode.
	CGOpenMPRuntimeNVPTX::ExecutionMode
	CGOpenMPRuntimeNVPTX::getExecutionMode() const {
	return CurrentExecutionMode;
	}

	/// Selects the data sharing mode from the language options: CUDA when
	/// OpenMPCUDAMode is set, Generic otherwise.
	static CGOpenMPRuntimeNVPTX::DataSharingMode
	getDataSharingMode(CodeGenModule &CGM) {
	  if (CGM.getLangOpts().OpenMPCUDAMode)
	    return CGOpenMPRuntimeNVPTX::CUDA;
	  return CGOpenMPRuntimeNVPTX::Generic;
	}

	/// Check for inner (nested) SPMD construct, if any.
	///
	/// Looks at the single statement nested in the innermost captured statement
	/// of \p D. Returns true if
	///  - \p D is 'target' and the nested directive is a parallel directive, or
	///    is 'teams' whose own single nested directive is a parallel directive;
	///  - \p D is 'target teams' and the nested directive is a parallel
	///    directive.
	/// Returns false if there is no single nested OpenMP directive. When a nested
	/// directive exists, \p D must be 'target' or 'target teams'; any other
	/// directive kind is treated as a programming error.
	static bool hasNestedSPMDDirective(ASTContext &Ctx,
	                                   const OMPExecutableDirective &D) {
	  const auto *CS = D.getInnermostCapturedStmt();
	  const auto *Body =
	      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
	  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

	  const auto *NestedDir = dyn_cast_or_null<OMPExecutableDirective>(ChildStmt);
	  if (!NestedDir)
	    return false;

	  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
	  switch (D.getDirectiveKind()) {
	  case OMPD_target:
	    if (isOpenMPParallelDirective(DKind))
	      return true;
	    if (DKind == OMPD_teams) {
	      // target -> teams -> <parallel directive>
	      Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
	          /*IgnoreCaptured=*/true);
	      if (!Body)
	        return false;
	      ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
	      if (const auto *NND =
	              dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
	        DKind = NND->getDirectiveKind();
	        if (isOpenMPParallelDirective(DKind))
	          return true;
	      }
	    }
	    return false;
	  case OMPD_target_teams:
	    return isOpenMPParallelDirective(DKind);
	  default:
	    // Every other directive kind, including the combined target directives
	    // and all non-target directives, is unexpected here.
	    llvm_unreachable("Unexpected directive.");
	  }
	}

	/// Checks whether the target directive \p D can be emitted in SPMD mode.
	///
	/// 'target' and 'target teams' depend on their nested directive; the combined
	/// target directives containing 'parallel' or 'simd' always qualify;
	/// 'target teams distribute' never does. Any other directive kind is treated
	/// as a programming error.
	static bool supportsSPMDExecutionMode(ASTContext &Ctx,
	                                      const OMPExecutableDirective &D) {
	  switch (D.getDirectiveKind()) {
	  case OMPD_target:
	  case OMPD_target_teams:
	    return hasNestedSPMDDirective(Ctx, D);
	  case OMPD_target_parallel:
	  case OMPD_target_parallel_for:
	  case OMPD_target_parallel_for_simd:
	  case OMPD_target_teams_distribute_parallel_for:
	  case OMPD_target_teams_distribute_parallel_for_simd:
	  case OMPD_target_simd:
	  case OMPD_target_teams_distribute_simd:
	    return true;
	  case OMPD_target_teams_distribute:
	    return false;
	  default:
	    // All remaining directive kinds are unexpected here.
	    break;
	  }
	  llvm_unreachable(
	      "Unknown programming model for OpenMP directive on NVPTX target.");
	}

	/// Check if the loop-based worksharing directive \p D uses static scheduling:
	/// it must have no 'ordered' clause, and either no 'schedule' clause at all
	/// or at least one 'schedule' clause of kind 'static'.
	static bool hasStaticScheduling(const OMPExecutableDirective &D) {
	  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
	         isOpenMPLoopDirective(D.getDirectiveKind()) &&
	         "Expected loop-based directive.");
	  return !D.hasClausesOfKind<OMPOrderedClause>() &&
	         (!D.hasClausesOfKind<OMPScheduleClause>() ||
	          llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
	                       [](const OMPScheduleClause *C) {
	                         return C->getScheduleKind() == OMPC_SCHEDULE_static;
	                       }));
	}

	/// Check for inner (nested) lightweight runtime construct, if any.
	///
	/// \p D must support SPMD mode. The single directive nested in \p D (and, for
	/// 'parallel' / 'teams', the directives nested in turn) is inspected:
	///  - 'target': a parallel loop directive with static scheduling,
	///    'teams distribute simd', 'simd', 'parallel' containing a static loop,
	///    or 'teams' containing one of: a static parallel loop, or 'parallel'
	///    containing a static loop;
	///  - 'target teams': a static parallel loop, 'distribute simd', 'simd', or
	///    'parallel' containing a static loop;
	///  - 'target parallel': 'simd' or a static loop.
	/// Returns false if there is no single nested OpenMP directive. When one
	/// exists, any other kind of \p D is treated as a programming error.
	static bool hasNestedLightweightDirective(ASTContext &Ctx,
	                                          const OMPExecutableDirective &D) {
	  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");

	  // Returns the single OpenMP directive nested directly in Dir, or nullptr
	  // if there is none.
	  auto GetSingleNestedDirective =
	      [&Ctx](const OMPExecutableDirective &Dir)
	      -> const OMPExecutableDirective * {
	    const Stmt *NestedBody = Dir.getInnermostCapturedStmt()->IgnoreContainers(
	        /*IgnoreCaptured=*/true);
	    if (!NestedBody)
	      return nullptr;
	    return dyn_cast_or_null<OMPExecutableDirective>(
	        CGOpenMPRuntime::getSingleCompoundChild(Ctx, NestedBody));
	  };
	  // True for a loop-based worksharing directive with static scheduling.
	  auto IsStaticLoop = [](const OMPExecutableDirective &Dir) {
	    const OpenMPDirectiveKind Kind = Dir.getDirectiveKind();
	    return isOpenMPWorksharingDirective(Kind) && isOpenMPLoopDirective(Kind) &&
	           hasStaticScheduling(Dir);
	  };
	  // True for a static loop directive that is also a parallel directive.
	  auto IsParallelStaticLoop = [&IsStaticLoop](
	                                  const OMPExecutableDirective &Dir) {
	    return isOpenMPParallelDirective(Dir.getDirectiveKind()) &&
	           IsStaticLoop(Dir);
	  };

	  const auto *CS = D.getInnermostCapturedStmt();
	  const auto *Body =
	      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
	  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

	  const auto *NestedDir = dyn_cast_or_null<OMPExecutableDirective>(ChildStmt);
	  if (!NestedDir)
	    return false;

	  const OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
	  switch (D.getDirectiveKind()) {
	  case OMPD_target:
	    if (IsParallelStaticLoop(*NestedDir))
	      return true;
	    if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
	      return true;
	    if (DKind == OMPD_parallel) {
	      // target -> parallel -> <static loop>
	      if (const auto *NND = GetSingleNestedDirective(*NestedDir))
	        return IsStaticLoop(*NND);
	      return false;
	    }
	    if (DKind == OMPD_teams) {
	      const auto *NND = GetSingleNestedDirective(*NestedDir);
	      if (!NND)
	        return false;
	      // target -> teams -> <static parallel loop>
	      if (IsParallelStaticLoop(*NND))
	        return true;
	      // target -> teams -> parallel -> <static loop>
	      if (NND->getDirectiveKind() == OMPD_parallel)
	        if (const auto *NNND = GetSingleNestedDirective(*NND))
	          return IsStaticLoop(*NNND);
	    }
	    return false;
	  case OMPD_target_teams:
	    if (IsParallelStaticLoop(*NestedDir))
	      return true;
	    if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
	      return true;
	    if (DKind == OMPD_parallel) {
	      // target teams -> parallel -> <static loop>
	      if (const auto *NND = GetSingleNestedDirective(*NestedDir))
	        return IsStaticLoop(*NND);
	    }
	    return false;
	  case OMPD_target_parallel:
	    return DKind == OMPD_simd || IsStaticLoop(*NestedDir);
	  default:
	    // Every other directive kind is unexpected here.
	    llvm_unreachable("Unexpected directive.");
	  }
	}

	/// Checks if the construct supports lightweight runtime. It must be SPMD
	/// construct + inner loop-based construct with static scheduling.
	///
	/// Returns false right away if \p D does not support SPMD mode. Otherwise
	/// 'target', 'target teams' and 'target parallel' depend on their nested
	/// directives, the combined 'parallel for' forms depend on their own
	/// scheduling, and the simd-only combined forms always qualify.
	static bool supportsLightweightRuntime(ASTContext &Ctx,
	                                       const OMPExecutableDirective &D) {
	  if (!supportsSPMDExecutionMode(Ctx, D))
	    return false;

	  switch (D.getDirectiveKind()) {
	  case OMPD_target:
	  case OMPD_target_teams:
	  case OMPD_target_parallel:
	    return hasNestedLightweightDirective(Ctx, D);
	  case OMPD_target_parallel_for:
	  case OMPD_target_parallel_for_simd:
	  case OMPD_target_teams_distribute_parallel_for:
	  case OMPD_target_teams_distribute_parallel_for_simd:
	    // (Last|First)-privates must be shared in parallel region.
	    return hasStaticScheduling(D);
	  case OMPD_target_simd:
	  case OMPD_target_teams_distribute_simd:
	    return true;
	  case OMPD_target_teams_distribute:
	    return false;
	  default:
	    // All remaining directive kinds are unexpected here.
	    break;
	  }
	  llvm_unreachable(
	      "Unknown programming model for OpenMP directive on NVPTX target.");
	}

	/// Emits the kernel for target directive \p D in non-SPMD mode.
	///
	/// The outlined entry function is produced by
	/// emitTargetOutlinedFunctionHelper with a pre/post action that wraps the
	/// region in the non-SPMD entry header and footer. Afterwards the worker
	/// function is renamed to "<kernel name>_worker" and its body is emitted.
	///
	/// \param D Target directive being emitted.
	/// \param ParentName Forwarded to emitTargetOutlinedFunctionHelper.
	/// \param OutlinedFn [out] Outlined kernel function.
	/// \param OutlinedFnID [out] ID constant of the outlined function.
	/// \param IsOffloadEntry Forwarded to emitTargetOutlinedFunctionHelper.
	/// \param CodeGen Code generation sequence for the region body.
	void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
	                                             StringRef ParentName,
	                                             llvm::Function *&OutlinedFn,
	                                             llvm::Constant *&OutlinedFnID,
	                                             bool IsOffloadEntry,
	                                             const RegionCodeGenTy &CodeGen) {
	  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
	  EntryFunctionState EST;
	  WorkerFunctionState WST(CGM, D.getBeginLoc());
	  // Start with empty per-kernel bookkeeping.
	  Work.clear();
	  WrapperFunctionsMap.clear();

	  // Emit target region as a standalone region.
	  class NVPTXPrePostActionTy : public PrePostActionTy {
	    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
	    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

	  public:
	    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
	                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
	        : EST(EST), WST(WST) {}
	    void Enter(CodeGenFunction &CGF) override {
	      auto &RT =
	          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
	      RT.emitNonSPMDEntryHeader(CGF, EST, WST);
	      // Skip target region initialization.
	      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
	    }
	    void Exit(CodeGenFunction &CGF) override {
	      auto &RT =
	          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
	      RT.clearLocThreadIdInsertPt(CGF);
	      RT.emitNonSPMDEntryFooter(CGF, EST);
	    }
	  } Action(EST, WST);
	  CodeGen.setAction(Action);
	  IsInTTDRegion = true;
	  // Reserve place for the globalized memory.
	  GlobalizedRecords.emplace_back();
	  // Create the pointer global once; later kernels reuse it.
	  if (!KernelStaticGlobalized) {
	    KernelStaticGlobalized = new llvm::GlobalVariable(
	        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
	        llvm::GlobalValue::InternalLinkage,
	        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
	        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
	        llvm::GlobalValue::NotThreadLocal,
	        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
	  }
	  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
	                                   IsOffloadEntry, CodeGen);
	  IsInTTDRegion = false;

	  // Now change the name of the worker function to correspond to this target
	  // region's entry function.
	  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

	  // Create the worker function
	  emitWorkerFunction(WST);
	}

	/// Setup NVPTX threads for master-worker OpenMP scheme.
	///
	/// Emits the prologue of a non-SPMD kernel entry function. Threads whose ID
	/// is below the thread limit call the worker function and branch to the exit
	/// block; the thread whose ID equals the master thread ID continues into the
	/// ".master" block, where the runtime and the data-sharing stack are
	/// initialized; all other threads branch straight to the exit block.
	///
	/// \param CGF Function being emitted.
	/// \param EST Entry state; its ExitBB is created here.
	/// \param WST Worker state providing the worker function and location.
	void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
	                                                  EntryFunctionState &EST,
	                                                  WorkerFunctionState &WST) {
	  CGBuilderTy &Bld = CGF.Builder;

	  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
	  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
	  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
	  EST.ExitBB = CGF.createBasicBlock(".exit");

	  llvm::Value *IsWorker =
	      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
	  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

	  // Workers run the worker function and then leave the kernel.
	  CGF.EmitBlock(WorkerBB);
	  emitCall(CGF, WST.Loc, WST.WorkerFn);
	  CGF.EmitBranch(EST.ExitBB);

	  // Of the remaining threads only the master continues; the rest exit.
	  CGF.EmitBlock(MasterCheckBB);
	  llvm::Value *IsMaster =
	      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
	  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

	  CGF.EmitBlock(MasterBB);
	  IsInTargetMasterThreadRegion = true;
	  // SEQUENTIAL (MASTER) REGION START
	  // First action in sequential region:
	  // Initialize the state of the OpenMP runtime library on the GPU.
	  // TODO: Optimize runtime initialization and pass in correct value.
	  llvm::Value *Args[] = {getThreadLimit(CGF),
	                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
	  CGF.EmitRuntimeCall(
	      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

	  // For data sharing, we need to initialize the stack.
	  CGF.EmitRuntimeCall(
	      createNVPTXRuntimeFunction(
	          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

	  emitGenericVarsProlog(CGF, WST.Loc);
	}

	/// Emits the epilogue of a non-SPMD kernel entry function.
	///
	/// Clears the "in target master thread region" flag and, if there is still an
	/// insertion point, emits the generic-variables epilog, the
	/// ".termination.notifier" block (runtime deinit call followed by a CTA
	/// barrier) and finally the exit block.
	///
	/// \param CGF Function being emitted.
	/// \param EST Entry state; its ExitBB is emitted and then reset to null.
	void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
	                                                  EntryFunctionState &EST) {
	  IsInTargetMasterThreadRegion = false;
	  if (!CGF.HaveInsertPoint())
	    return;

	  emitGenericVarsEpilog(CGF);

	  // Create the exit block if it does not exist yet.
	  if (!EST.ExitBB)
	    EST.ExitBB = CGF.createBasicBlock(".exit");

	  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
	  CGF.EmitBranch(TerminateBB);

	  CGF.EmitBlock(TerminateBB);
	  // Signal termination condition.
	  // TODO: Optimize runtime initialization and pass in correct value.
	  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
	  CGF.EmitRuntimeCall(
	      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
	  // Barrier to terminate worker threads.
	  syncCTAThreads(CGF);
	  // Master thread jumps to exit point.
	  CGF.EmitBranch(EST.ExitBB);

	  CGF.EmitBlock(EST.ExitBB);
	  EST.ExitBB = nullptr;
	}

	/// Emits the kernel for target directive \p D in SPMD mode.
	///
	/// The full runtime is requested when the OpenMPCUDAForceFullRuntime language
	/// option is set or when \p D does not support the lightweight runtime. The
	/// outlined entry function is produced by emitTargetOutlinedFunctionHelper
	/// with a pre/post action that wraps the region in the SPMD entry header and
	/// footer.
	///
	/// \param D Target directive being emitted.
	/// \param ParentName Forwarded to emitTargetOutlinedFunctionHelper.
	/// \param OutlinedFn [out] Outlined kernel function.
	/// \param OutlinedFnID [out] ID constant of the outlined function.
	/// \param IsOffloadEntry Forwarded to emitTargetOutlinedFunctionHelper.
	/// \param CodeGen Code generation sequence for the region body.
	void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
	                                          StringRef ParentName,
	                                          llvm::Function *&OutlinedFn,
	                                          llvm::Constant *&OutlinedFnID,
	                                          bool IsOffloadEntry,
	                                          const RegionCodeGenTy &CodeGen) {
	  ExecutionRuntimeModesRAII ModeRAII(
	      CurrentExecutionMode, RequiresFullRuntime,
	      CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
	          !supportsLightweightRuntime(CGM.getContext(), D));
	  EntryFunctionState EST;

	  // Emit target region as a standalone region.
	  class NVPTXPrePostActionTy : public PrePostActionTy {
	    CGOpenMPRuntimeNVPTX &RT;
	    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
	    const OMPExecutableDirective &D;

	  public:
	    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
	                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
	                         const OMPExecutableDirective &D)
	        : RT(RT), EST(EST), D(D) {}
	    void Enter(CodeGenFunction &CGF) override {
	      RT.emitSPMDEntryHeader(CGF, EST, D);
	      // Skip target region initialization.
	      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
	    }
	    void Exit(CodeGenFunction &CGF) override {
	      RT.clearLocThreadIdInsertPt(CGF);
	      RT.emitSPMDEntryFooter(CGF, EST);
	    }
	  } Action(*this, EST, D);
	  CodeGen.setAction(Action);
	  IsInTTDRegion = true;
	  // Reserve place for the globalized memory.
	  GlobalizedRecords.emplace_back();
	  // Create the pointer global once; later kernels reuse it.
	  if (!KernelStaticGlobalized) {
	    KernelStaticGlobalized = new llvm::GlobalVariable(
	        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
	        llvm::GlobalValue::InternalLinkage,
	        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
	        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
	        llvm::GlobalValue::NotThreadLocal,
	        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
	  }
	  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
	                                   IsOffloadEntry, CodeGen);
	  IsInTTDRegion = false;
	}

	/// Emits the prologue of an SPMD kernel entry function.
	///
	/// Calls the SPMD kernel-init runtime entry, initializes the data-sharing
	/// stack when the full runtime is required, and opens the ".execute" block in
	/// which the region body is emitted.
	///
	/// \param CGF Function being emitted.
	/// \param EST Entry state; its ExitBB is created here.
	/// \param D Target directive (not read in this function).
	void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
	    CodeGenFunction &CGF, EntryFunctionState &EST,
	    const OMPExecutableDirective &D) {
	  CGBuilderTy &Bld = CGF.Builder;

	  // Setup BBs in entry function.
	  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
	  EST.ExitBB = CGF.createBasicBlock(".exit");

	  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
	                         /*RequiresOMPRuntime=*/
	                         Bld.getInt16(RequiresFullRuntime ? 1 : 0),
	                         /*RequiresDataSharing=*/Bld.getInt16(0)};
	  CGF.EmitRuntimeCall(
	      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);

	  if (RequiresFullRuntime) {
	    // For data sharing, we need to initialize the stack.
	    CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
	        OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
	  }

	  CGF.EmitBranch(ExecuteBB);

	  CGF.EmitBlock(ExecuteBB);

	  IsInTargetMasterThreadRegion = true;
	}

	/// Emits the epilogue of an SPMD kernel entry function.
	///
	/// Clears the "in target master thread region" flag and, if there is still an
	/// insertion point, emits the ".omp.deinit" block with the SPMD kernel-deinit
	/// runtime call followed by the exit block.
	///
	/// \param CGF Function being emitted.
	/// \param EST Entry state; its ExitBB is emitted and then reset to null.
	void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
	                                               EntryFunctionState &EST) {
	  IsInTargetMasterThreadRegion = false;
	  if (!CGF.HaveInsertPoint())
	    return;

	  // Create the exit block if it does not exist yet.
	  if (!EST.ExitBB)
	    EST.ExitBB = CGF.createBasicBlock(".exit");

	  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
	  CGF.EmitBranch(OMPDeInitBB);

	  CGF.EmitBlock(OMPDeInitBB);
	  // DeInitialize the OMP state in the runtime; called by all active threads.
	  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
	                         CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
	  CGF.EmitRuntimeCall(
	      createNVPTXRuntimeFunction(
	          OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args);
	  CGF.EmitBranch(EST.ExitBB);

	  CGF.EmitBlock(EST.ExitBB);
	  EST.ExitBB = nullptr;
	}

	// Create a unique global variable to indicate the execution mode of this target
	// region. The execution mode is either 'generic', or 'spmd' depending on the
	// target directive. This variable is picked up by the offload library to setup
	// the device appropriately before kernel launch. If the execution mode is
	// 'generic', the runtime reserves one warp for the master, otherwise, all
	// warps participate in parallel work.
	//
	// The variable is a constant, weak i8 named "<Name>_exec_mode" that holds 0
	// when Mode is true and 1 otherwise; it is added to the compiler-used list so
	// that it is not dropped.
	static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
	                                     bool Mode) {
	  auto *GVMode =
	      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
	                               llvm::GlobalValue::WeakAnyLinkage,
	                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
	                               Twine(Name, "_exec_mode"));
	  CGM.addCompilerUsedGlobal(GVMode);
	}

	/// Emits the body of the worker function described by \p WST: starts the
	/// function (void return type, no arguments), emits the worker loop into it
	/// and finishes the function.
	void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
	  ASTContext &Ctx = CGM.getContext();

	  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
	  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
	                    WST.Loc, WST.Loc);
	  emitWorkerLoop(CGF, WST);
	  CGF.FinishFunction();
	}

	/// Emits the loop executed by worker threads in a non-SPMD kernel.
	///
	/// Block structure: ".await.work" (barrier, query the runtime for work) ->
	/// ".select.workers" (is this thread active?) -> ".execute.parallel" (match
	/// the work function against the known outlined functions, falling back to an
	/// indirect call) -> ".terminate.parallel" -> ".barrier.parallel" -> back to
	/// ".await.work". A null work function leaves the loop through ".exit".
	///
	/// \param CGF Worker function being emitted.
	/// \param WST Worker state providing the source location for emitted calls.
	void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
	                                          WorkerFunctionState &WST) {
	  //
	  // The workers enter this loop and wait for parallel work from the master.
	  // When the master encounters a parallel region it sets up the work + variable
	  // arguments, and wakes up the workers.  The workers first check to see if
	  // they are required for the parallel region, i.e., within the # of requested
	  // parallel threads.  The activated workers load the variable arguments and
	  // execute the parallel work.
	  //

	  CGBuilderTy &Bld = CGF.Builder;

	  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
	  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
	  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
	  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
	  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
	  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

	  CGF.EmitBranch(AwaitBB);

	  // Workers wait for work from master.
	  CGF.EmitBlock(AwaitBB);
	  // Wait for parallel work
	  syncCTAThreads(CGF);

	  Address WorkFn =
	      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
	  Address ExecStatus =
	      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
	  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
	  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

	  // TODO: Optimize runtime initialization and pass in correct value.
	  llvm::Value *Args[] = {WorkFn.getPointer()};
	  llvm::Value *Ret = CGF.EmitRuntimeCall(
	      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
	  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

	  // On termination condition (workid == 0), exit loop.
	  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
	  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
	  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

	  // Activate requested workers.
	  CGF.EmitBlock(SelectWorkersBB);
	  llvm::Value *IsActive =
	      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
	  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

	  // Signal start of parallel region.
	  CGF.EmitBlock(ExecuteBB);
	  // Skip initialization.
	  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);

	  // Process work items: outlined parallel functions.
	  for (llvm::Function *W : Work) {
	    // Try to match this outlined function.
	    llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

	    llvm::Value *WorkFnMatch =
	        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

	    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
	    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
	    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

	    // Execute this outlined function.
	    CGF.EmitBlock(ExecuteFNBB);

	    // Insert call to work function via shared wrapper. The shared
	    // wrapper takes two arguments:
	    //   - the parallelism level;
	    //   - the thread ID;
	    emitCall(CGF, WST.Loc, W,
	             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});

	    // Go to end of parallel region.
	    CGF.EmitBranch(TerminateBB);

	    CGF.EmitBlock(CheckNextBB);
	  }
	  // Default case: call to outlined function through pointer if the target
	  // region makes a declare target call that may contain an orphaned parallel
	  // directive.
	  auto *ParallelFnTy =
	      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
	                              /*isVarArg=*/false);
	  llvm::Value *WorkFnCast =
	      Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
	  // Insert call to work function via shared wrapper. The shared
	  // wrapper takes two arguments:
	  //   - the parallelism level;
	  //   - the thread ID;
	  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
	           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
	  // Go to end of parallel region.
	  CGF.EmitBranch(TerminateBB);

	  // Signal end of parallel region.
	  CGF.EmitBlock(TerminateBB);
	  CGF.EmitRuntimeCall(
	      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
	      llvm::None);
	  CGF.EmitBranch(BarrierBB);

	  // All active and inactive workers wait at a barrier after parallel region.
	  CGF.EmitBlock(BarrierBB);
	  // Barrier after parallel region.
	  syncCTAThreads(CGF);
	  CGF.EmitBranch(AwaitBB);

	  // Exit target region.
	  CGF.EmitBlock(ExitBB);
	  // Skip initialization.
	  clearLocThreadIdInsertPt(CGF);
	}

	/// Returns specified OpenMP runtime function for the current OpenMP
	/// implementation.  Specialized for the NVPTX device.
	///
	/// Each case builds the LLVM function type of one runtime entry point and
	/// obtains the declaration from the module under the entry point's name. The
	/// barrier, warp-mask and syncwarp entries are created through
	/// CreateConvergentRuntimeFunction; all others through CreateRuntimeFunction.
	///
	/// \param Function OpenMP runtime function (an OpenMPRTLFunctionNVPTX value).
	/// \return Specified function.
	llvm::FunctionCallee
	CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
	  llvm::FunctionCallee RTLFn = nullptr;
	  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
	  case OMPRTL_NVPTX__kmpc_kernel_init: {
	    // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
	    // RequiresOMPRuntime);
	    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
	    // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
	    llvm::Type *TypeParams[] = {CGM.Int16Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
	    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
	    // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
	    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: {
	    // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
	    llvm::Type *TypeParams[] = {CGM.Int16Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
	    /// Build void __kmpc_kernel_prepare_parallel(
	    /// void *outlined_function);
	    llvm::Type *TypeParams[] = {CGM.Int8PtrTy};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
	    /// Build bool __kmpc_kernel_parallel(void **outlined_function);
	    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy};
	    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
	    auto *FnTy =
	        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
	    /// Build void __kmpc_kernel_end_parallel();
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
	    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
	    // global_tid);
	    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
	    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
	    // global_tid);
	    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
	    // Build int32_t __kmpc_shuffle_int32(int32_t element,
	    // int16_t lane_offset, int16_t warp_size);
	    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
	    // Build int64_t __kmpc_shuffle_int64(int64_t element,
	    // int16_t lane_offset, int16_t warp_size);
	    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2: {
	    // Build int32_t kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc,
	    // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void*
	    // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t
	    // lane_id, int16_t lane_offset, int16_t Algorithm Version), void
	    // (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
	    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
	                                             CGM.Int16Ty, CGM.Int16Ty};
	    auto *ShuffleReduceFnTy =
	        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
	                                /*isVarArg=*/false);
	    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
	    auto *InterWarpCopyFnTy =
	        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
	                                /*isVarArg=*/false);
	    llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
	                                CGM.Int32Ty,
	                                CGM.Int32Ty,
	                                CGM.SizeTy,
	                                CGM.VoidPtrTy,
	                                ShuffleReduceFnTy->getPointerTo(),
	                                InterWarpCopyFnTy->getPointerTo()};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
	    RTLFn = CGM.CreateRuntimeFunction(
	        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
	    // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
	    llvm::Type *TypeParams[] = {CGM.Int32Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
	    RTLFn = CGM.CreateRuntimeFunction(
	        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2: {
	    // Build int32_t __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
	    // global_tid, void *global_buffer, int32_t num_of_records, void*
	    // reduce_data,
	    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
	    // lane_offset, int16_t shortCircuit),
	    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
	    // (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
	    // void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
	    // void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
	    // int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
	    // *buffer, int idx, void *reduce_data));
	    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
	                                             CGM.Int16Ty, CGM.Int16Ty};
	    auto *ShuffleReduceFnTy =
	        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
	                                /*isVarArg=*/false);
	    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
	    auto *InterWarpCopyFnTy =
	        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
	                                /*isVarArg=*/false);
	    llvm::Type *GlobalListTypeParams[] = {CGM.VoidPtrTy, CGM.IntTy,
	                                          CGM.VoidPtrTy};
	    auto *GlobalListFnTy =
	        llvm::FunctionType::get(CGM.VoidTy, GlobalListTypeParams,
	                                /*isVarArg=*/false);
	    llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
	                                CGM.Int32Ty,
	                                CGM.VoidPtrTy,
	                                CGM.Int32Ty,
	                                CGM.VoidPtrTy,
	                                ShuffleReduceFnTy->getPointerTo(),
	                                InterWarpCopyFnTy->getPointerTo(),
	                                GlobalListFnTy->getPointerTo(),
	                                GlobalListFnTy->getPointerTo(),
	                                GlobalListFnTy->getPointerTo(),
	                                GlobalListFnTy->getPointerTo()};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
	    RTLFn = CGM.CreateRuntimeFunction(
	        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_v2");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
	    /// Build void __kmpc_data_sharing_init_stack();
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
	    /// Build void __kmpc_data_sharing_init_stack_spmd();
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
	    RTLFn =
	        CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: {
	    // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
	    // int16_t UseSharedMemory);
	    llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
	    RTLFn = CGM.CreateRuntimeFunction(
	        FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
	    // Build void *__kmpc_data_sharing_push_stack(size_t size, int16_t
	    // UseSharedMemory);
	    llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
	    RTLFn = CGM.CreateRuntimeFunction(
	        FnTy, /*Name=*/"__kmpc_data_sharing_push_stack");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
	    // Build void __kmpc_data_sharing_pop_stack(void *a);
	    llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy,
	                                      /*Name=*/"__kmpc_data_sharing_pop_stack");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
	    /// Build void __kmpc_begin_sharing_variables(void ***args,
	    /// size_t n_args);
	    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
	    /// Build void __kmpc_end_sharing_variables();
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
	    /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
	    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_parallel_level: {
	    // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
	    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
	    // Build int8_t __kmpc_is_spmd_exec_mode();
	    auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
	    // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
	    // const void *buf, size_t size, int16_t is_shared, const void **res);
	    llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy,
	                                CGM.Int16Ty, CGM.VoidPtrPtrTy};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
	    // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
	    // int16_t is_shared);
	    llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
	    RTLFn =
	        CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
	    break;
	  }
	  case OMPRTL__kmpc_barrier: {
	    // Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
	    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
	    RTLFn =
	        CGM.CreateConvergentRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier");
	    break;
	  }
	  case OMPRTL__kmpc_barrier_simple_spmd: {
	    // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
	    // global_tid);
	    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
	    RTLFn = CGM.CreateConvergentRuntimeFunction(
	        FnTy, /*Name*/ "__kmpc_barrier_simple_spmd");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_warp_active_thread_mask: {
	    // Build int32_t __kmpc_warp_active_thread_mask(void);
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.Int32Ty, llvm::None, /*isVarArg=*/false);
	    RTLFn = CGM.CreateConvergentRuntimeFunction(
	        FnTy, "__kmpc_warp_active_thread_mask");
	    break;
	  }
	  case OMPRTL_NVPTX__kmpc_syncwarp: {
	    // Build void __kmpc_syncwarp(kmp_int32 Mask);
	    auto *FnTy =
	        llvm::FunctionType::get(CGM.VoidTy, CGM.Int32Ty, /*isVarArg=*/false);
	    RTLFn = CGM.CreateConvergentRuntimeFunction(FnTy, "__kmpc_syncwarp");
	    break;
	  }
	  }
	  return RTLFn;
	}

	/// Records the offload entry \p Addr in the module's "nvvm.annotations" named
	/// metadata as {Addr, "kernel", i32 1}. Entries that are not functions are
	/// ignored; \p ID, \p Size and the two unnamed parameters are not used.
	void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
	                                              llvm::Constant *Addr,
	                                              uint64_t Size, int32_t,
	                                              llvm::GlobalValue::LinkageTypes) {
	  // TODO: Add support for global variables on the device after declare target
	  // support.
	  if (!isa<llvm::Function>(Addr))
	    return;

	  llvm::Module &TheModule = CGM.getModule();
	  llvm::LLVMContext &LLVMCtx = CGM.getLLVMContext();

	  // Fetch the annotation list, creating it on first use.
	  llvm::NamedMDNode *Annotations =
	      TheModule.getOrInsertNamedMetadata("nvvm.annotations");

	  // Build the three operands of the annotation in order.
	  llvm::Metadata *EntryMD = llvm::ConstantAsMetadata::get(Addr);
	  llvm::Metadata *KindMD = llvm::MDString::get(LLVMCtx, "kernel");
	  llvm::Metadata *FlagMD = llvm::ConstantAsMetadata::get(
	      llvm::ConstantInt::get(llvm::Type::getInt32Ty(LLVMCtx), 1));
	  llvm::Metadata *KernelAnnotation[] = {EntryMD, KindMD, FlagMD};

	  // Append the new node to nvvm.annotations.
	  Annotations->addOperand(llvm::MDNode::get(LLVMCtx, KernelAnnotation));
	}

	void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
	const OMPExecutableDirective &D, StringRef ParentName,
	llvm::Function &OutlinedFn, llvm::Constant &OutlinedFnID,
	bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
	if (!IsOffloadEntry) // Nothing to do.
	return;

	assert(!ParentName.empty() && "Invalid target region parent name!");

	bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
	if (Mode)
	emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
	CodeGen);
	else
	emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
	CodeGen);

	setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
	}

	namespace {
	LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
	/// Enum for accesseing the reserved_2 field of the ident_t struct.
	enum ModeFlagsTy : unsigned {
	/// Bit set to 1 when in SPMD mode.
	KMP_IDENT_SPMD_MODE = 0x01,
	/// Bit set to 1 when a simplified runtime is used.
	KMP_IDENT_SIMPLE_RT_MODE = 0x02,
	LLVM_MARK_AS_BITMASK_ENUM(/LargestValue=/KMP_IDENT_SIMPLE_RT_MODE)
	};

	/// Special mode Undefined. Is the combination of Non-SPMD mode + SimpleRuntime.
	static const ModeFlagsTy UndefinedMode =
	(~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
	} // anonymous namespace

	/// Returns the ModeFlagsTy bits for the current execution mode:
	///  - SPMD with full runtime: SPMD bit set, simple-runtime bit clear;
	///  - SPMD without full runtime: both bits set;
	///  - non-SPMD: both bits clear (full runtime is asserted);
	///  - unknown mode: UndefinedMode.
	unsigned CGOpenMPRuntimeNVPTX::getDefaultLocationReserved2Flags() const {
	  switch (getExecutionMode()) {
	  case EM_SPMD:
	    if (requiresFullRuntime())
	      return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
	    return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
	  case EM_NonSPMD:
	    assert(requiresFullRuntime() && "Expected full runtime.");
	    return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
	  case EM_Unknown:
	    return UndefinedMode;
	  }
	  llvm_unreachable("Unknown flags are requested.");
	}

	/// Constructs the NVPTX OpenMP runtime on top of the generic one, passing "_"
	/// and "$" as the two separator strings. Constructing it when not compiling
	/// OpenMP device code is treated as unreachable.
	CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
	    : CGOpenMPRuntime(CGM, "_", "$") {
	  const bool CompilingForDevice = CGM.getLangOpts().OpenMPIsDevice;
	  if (CompilingForDevice)
	    return;
	  llvm_unreachable("OpenMP NVPTX can only handle device code.");
	}

	/// Emits the proc_bind clause through the generic runtime implementation,
	/// except in SPMD execution mode, where nothing is emitted.
	void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
	                                              ProcBindKind ProcBind,
	                                              SourceLocation Loc) {
	  // Do nothing in case of SPMD mode and L0 parallel.
	  const bool IsSPMD = getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD;
	  if (!IsSPMD)
	    CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
	}

	/// Emits the num_threads clause through the generic runtime implementation,
	/// except in SPMD execution mode, where nothing is emitted.
	void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
	                                                llvm::Value *NumThreads,
	                                                SourceLocation Loc) {
	  // Do nothing in case of SPMD mode and L0 parallel.
	  const bool IsSPMD = getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD;
	  if (!IsSPMD)
	    CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
	}

	/// Intentionally empty: this runtime emits no code for the num_teams and
	/// thread_limit clauses here.
	void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
	                                              const Expr *NumTeams,
	                                              const Expr *ThreadLimit,
	                                              SourceLocation Loc) {
	  // Nothing to emit.
	}

	/// Emits the outlined function for a parallel region through the generic
	/// runtime, while tracking that code generation is inside a parallel region.
	///
	/// The TTD-region and target-master-thread-region flags are cleared for the
	/// duration of the emission and restored afterwards. When optimizing, the
	/// outlined function is marked always-inline. Outside SPMD mode, and when not
	/// nested in another parallel region, a data-sharing wrapper is created and
	/// registered for the outlined function.
	///
	/// \return The outlined parallel function.
	llvm::Function *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
	    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
	    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
	  // Emit target region as a standalone region.
	  class NVPTXPrePostActionTy : public PrePostActionTy {
	    bool &InParallelFlag;
	    bool SavedInParallelFlag;

	  public:
	    NVPTXPrePostActionTy(bool &Flag) : InParallelFlag(Flag) {}
	    // Remember the current value and mark the region as parallel.
	    void Enter(CodeGenFunction &CGF) override {
	      SavedInParallelFlag = InParallelFlag;
	      InParallelFlag = true;
	    }
	    // Put back the value seen on entry.
	    void Exit(CodeGenFunction &CGF) override {
	      InParallelFlag = SavedInParallelFlag;
	    }
	  } Action(IsInParallelRegion);
	  CodeGen.setAction(Action);

	  // Save both region flags and clear them while the body is emitted.
	  const bool SavedInTTDRegion = IsInTTDRegion;
	  const bool SavedInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
	  IsInTTDRegion = false;
	  IsInTargetMasterThreadRegion = false;

	  auto *Emitted = CGOpenMPRuntime::emitParallelOutlinedFunction(
	      D, ThreadIDVar, InnermostKind, CodeGen);
	  auto *OutlinedFun = cast<llvm::Function>(Emitted);
	  if (CGM.getLangOpts().Optimize) {
	    OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
	    OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
	    OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
	  }

	  IsInTargetMasterThreadRegion = SavedInTargetMasterThreadRegion;
	  IsInTTDRegion = SavedInTTDRegion;

	  const bool IsSPMD = getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD;
	  if (!IsSPMD && !IsInParallelRegion) {
	    // Register the data-sharing wrapper for this outlined function.
	    llvm::Function *Wrapper = createParallelDataSharingWrapper(OutlinedFun, D);
	    WrapperFunctionsMap[OutlinedFun] = Wrapper;
	  }

	  return OutlinedFun;
	}

	/// Get list of lastprivate variables from the teams distribute ... or
	/// teams {distribute ...} directives.
	///
	/// \param Ctx  AST context, forwarded to getSingleCompoundChild.
	/// \param D    A teams directive (asserted). If it is not itself a
	///             distribute directive, its single nested child statement is
	///             inspected instead.
	/// \param Vars Output: the private item of every variable reference found
	///             in the 'lastprivate' clauses is appended here.
	static void
	getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
	                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
	  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
	         "expected teams directive.");
	  const OMPExecutableDirective *Dir = &D;
	  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
	    // Not a combined 'teams distribute ...': look for a directly nested
	    // distribute directive. If a single child exists but is not a distribute
	    // directive, there is nothing to collect.
	    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
	            Ctx,
	            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
	                /*IgnoreCaptured=*/true))) {
	      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
	      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
	        Dir = nullptr;
	    }
	  }
	  if (!Dir)
	    return;
	  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
	    for (const Expr *E : C->getVarRefs())
	      Vars.push_back(getPrivateItem(E));
	  }
	}

	/// Collects the reduction variables of a teams directive.
	///
	/// For every 'reduction' clause attached to \p D, the private item of each
	/// expression in the clause's privates() list is appended to \p Vars.
	/// \p D must be a teams directive (asserted). \p Ctx is not used here.
	static void
	getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
	                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
	  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
	         "expected teams directive.");
	  for (const auto *ReductionClause : D.getClausesOfKind<OMPReductionClause>())
	    for (const Expr *PrivateRef : ReductionClause->privates())
	      Vars.push_back(getPrivateItem(PrivateRef));
	}

	/// Emits the outlined function for a teams region.
	///
	/// Before delegating to the base class this collects the variables that
	/// need globalization:
	///  - outside SPMD mode: the teams reduction variables, which are stashed in
	///    TeamAndReductions together with the teams captured declaration;
	///  - in SPMD mode: the lastprivate variables of the (nested) distribute
	///    directive, for which a globalized record is built immediately.
	/// A pre/post action then registers that record for the outlined function
	/// and brackets the region body with emitGenericVarsProlog/Epilog.
	///
	/// \returns the outlined function produced by the base implementation.
	llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
	    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
	    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
	  SourceLocation Loc = D.getBeginLoc();

	  const RecordDecl *GlobalizedRD = nullptr;
	  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
	  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
	  // Globalize team reductions variable unconditionally in all modes.
	  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
	    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
	  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
	    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
	    if (!LastPrivatesReductions.empty()) {
	      GlobalizedRD = ::buildRecordForGlobalizedVars(
	          CGM.getContext(), llvm::None, LastPrivatesReductions,
	          MappedDeclsFields, WarpSize);
	    }
	  } else if (!LastPrivatesReductions.empty()) {
	    // Non-SPMD: hand the reduction variables over to TeamAndReductions; the
	    // local vector is left holding whatever TeamAndReductions.second held.
	    assert(!TeamAndReductions.first &&
	           "Previous team declaration is not expected.");
	    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
	    std::swap(TeamAndReductions.second, LastPrivatesReductions);
	  }

	  // Emit target region as a standalone region.
	  class NVPTXPrePostActionTy : public PrePostActionTy {
	    SourceLocation &Loc;
	    const RecordDecl *GlobalizedRD;
	    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
	        &MappedDeclsFields;

	  public:
	    NVPTXPrePostActionTy(
	        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
	        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
	            &MappedDeclsFields)
	        : Loc(Loc), GlobalizedRD(GlobalizedRD),
	          MappedDeclsFields(MappedDeclsFields) {}
	    void Enter(CodeGenFunction &CGF) override {
	      auto &Rt =
	          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
	      if (GlobalizedRD) {
	        // Register the globalized record and the decl -> field mapping for
	        // the function currently being emitted.
	        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
	        I->getSecond().GlobalRecord = GlobalizedRD;
	        I->getSecond().MappedParams =
	            std::make_unique<CodeGenFunction::OMPMapVars>();
	        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
	        for (const auto &Pair : MappedDeclsFields) {
	          assert(Pair.getFirst()->isCanonicalDecl() &&
	                 "Expected canonical declaration");
	          Data.insert(std::make_pair(Pair.getFirst(),
	                                     MappedVarData(Pair.getSecond(),
	                                                   /*IsOnePerTeam=*/true)));
	        }
	      }
	      Rt.emitGenericVarsProlog(CGF, Loc);
	    }
	    void Exit(CodeGenFunction &CGF) override {
	      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
	          .emitGenericVarsEpilog(CGF);
	    }
	  } Action(Loc, GlobalizedRD, MappedDeclsFields);
	  CodeGen.setAction(Action);
	  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
	      D, ThreadIDVar, InnermostKind, CodeGen);
	  // In optimized builds, mark the outlined body as always-inline.
	  if (CGM.getLangOpts().Optimize) {
	    OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
	    OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
	    OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
	  }

	  return OutlinedFun;
	}

	/// Emits the prolog that obtains storage for the globalized variables
	/// registered for CGF.CurFn in FunctionGlobalizedDecls and records the
	/// address of each variable.
	///
	/// Nothing is emitted when the data sharing mode is not Generic and the
	/// execution mode is not SPMD, or when no entry exists for the function.
	///
	/// The record holding the variables is obtained in one of three ways:
	///  1. Not in a TTD region and either \p WithSPMDCheck is set or the
	///     execution mode is unknown: a runtime __kmpc_is_spmd_exec_mode check
	///     selects between a null record (SPMD) and a record pushed with
	///     __kmpc_data_sharing_coalesced_push_stack (non-SPMD).
	///  2. In a TTD region without OpenMPCUDATargetParallel: an offset into the
	///     buffer returned through __kmpc_get_team_static_memory.
	///  3. Otherwise: a direct data-sharing push-stack runtime call.
	/// Afterwards each escaped variable-length declaration gets its own
	/// push-stack allocation, and the collected mappings are applied.
	///
	/// \param CGF           Function being emitted; code is inserted at its
	///                      current insertion point.
	/// \param Loc           Source location used for emitted loads and runtime
	///                      location arguments.
	/// \param WithSPMDCheck Request the runtime SPMD-mode check of strategy 1.
	void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
	                                                 SourceLocation Loc,
	                                                 bool WithSPMDCheck) {
	  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
	      getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
	    return;

	  CGBuilderTy &Bld = CGF.Builder;

	  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
	  if (I == FunctionGlobalizedDecls.end())
	    return;
	  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
	    QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
	    QualType SecGlobalRecTy;

	    // Recover pointer to this function's global record. The runtime will
	    // handle the specifics of the allocation of the memory.
	    // Use actual memory size of the record including the padding
	    // for alignment purposes.
	    unsigned Alignment =
	        CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
	    unsigned GlobalRecordSize =
	        CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
	    GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);

	    llvm::PointerType *GlobalRecPtrTy =
	        CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
	    llvm::Value *GlobalRecCastAddr;
	    // Non-null only when a secondary global record exists (strategy 1).
	    llvm::Value *IsTTD = nullptr;
	    if (!IsInTTDRegion &&
	        (WithSPMDCheck ||
	         getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
	      // Strategy 1: decide at run time whether we are in SPMD mode.
	      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
	      llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
	      llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
	      if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
	        // IsTTD is true when the runtime reports parallel level 0.
	        llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
	        llvm::Value *ThreadID = getThreadID(CGF, Loc);
	        llvm::Value *PL = CGF.EmitRuntimeCall(
	            createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
	            {RTLoc, ThreadID});
	        IsTTD = Bld.CreateIsNull(PL);
	      }
	      llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
	          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
	      Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
	      // There is no need to emit line number for unconditional branch.
	      (void)ApplyDebugLocation::CreateEmpty(CGF);
	      CGF.EmitBlock(SPMDBB);
	      // SPMD path: no global record is allocated, use a null pointer.
	      Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
	                               CharUnits::fromQuantity(Alignment));
	      CGF.EmitBranch(ExitBB);
	      // There is no need to emit line number for unconditional branch.
	      (void)ApplyDebugLocation::CreateEmpty(CGF);
	      CGF.EmitBlock(NonSPMDBB);
	      llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
	      if (const RecordDecl *SecGlobalizedVarsRecord =
	              I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
	        SecGlobalRecTy =
	            CGM.getContext().getRecordType(SecGlobalizedVarsRecord);

	        // Recover pointer to this function's global record. The runtime will
	        // handle the specifics of the allocation of the memory.
	        // Use actual memory size of the record including the padding
	        // for alignment purposes.
	        // Note: these two locals intentionally shadow the outer ones and
	        // describe the secondary record only.
	        unsigned Alignment =
	            CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
	        unsigned GlobalRecordSize =
	            CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
	        GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
	        // Allocate the secondary record's size when IsTTD holds, the primary
	        // record's size otherwise.
	        Size = Bld.CreateSelect(
	            IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
	      }
	      // TODO: allow the usage of shared memory to be controlled by
	      // the user, for now, default to global.
	      llvm::Value *GlobalRecordSizeArg[] = {
	          Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
	      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
	          createNVPTXRuntimeFunction(
	              OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
	          GlobalRecordSizeArg);
	      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
	          GlobalRecValue, GlobalRecPtrTy);
	      CGF.EmitBlock(ExitBB);
	      // Merge the SPMD (null) and non-SPMD (pushed) record pointers.
	      auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
	                                /*NumReservedValues=*/2, "_select_stack");
	      Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
	      Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
	      GlobalRecCastAddr = Phi;
	      I->getSecond().GlobalRecordAddr = Phi;
	      I->getSecond().IsInSPMDModeFlag = IsSPMD;
	    } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
	      // Strategy 2: place the record inside the statically sized per-kernel
	      // buffer, after any records already registered for this kernel.
	      assert(GlobalizedRecords.back().Records.size() < 2 &&
	             "Expected less than 2 globalized records: one for target and one "
	             "for teams.");
	      unsigned Offset = 0;
	      for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
	        QualType RDTy = CGM.getContext().getRecordType(RD);
	        unsigned Alignment =
	            CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
	        unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
	        Offset =
	            llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
	      }
	      unsigned Alignment =
	          CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
	      Offset = llvm::alignTo(Offset, Alignment);
	      GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
	      ++GlobalizedRecords.back().RegionCounter;
	      if (GlobalizedRecords.back().Records.size() == 1) {
	        // First record of this kernel: create the helper globals (created
	        // without initializers here) and emit the runtime call that sets up
	        // the static buffer.
	        assert(KernelStaticGlobalized &&
	               "Kernel static pointer must be initialized already.");
	        auto *UseSharedMemory = new llvm::GlobalVariable(
	            CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
	            llvm::GlobalValue::InternalLinkage, nullptr,
	            "_openmp_static_kernel$is_shared");
	        UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
	        QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
	            /*DestWidth=*/16, /*Signed=*/0);
	        llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
	            Address(UseSharedMemory,
	                    CGM.getContext().getTypeAlignInChars(Int16Ty)),
	            /*Volatile=*/false, Int16Ty, Loc);
	        auto *StaticGlobalized = new llvm::GlobalVariable(
	            CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
	            llvm::GlobalValue::CommonLinkage, nullptr);
	        auto *RecSize = new llvm::GlobalVariable(
	            CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
	            llvm::GlobalValue::InternalLinkage, nullptr,
	            "_openmp_static_kernel$size");
	        RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
	        llvm::Value *Ld = CGF.EmitLoadOfScalar(
	            Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
	            CGM.getContext().getSizeType(), Loc);
	        llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
	            KernelStaticGlobalized, CGM.VoidPtrPtrTy);
	        llvm::Value *GlobalRecordSizeArg[] = {
	            llvm::ConstantInt::get(
	                CGM.Int16Ty,
	                getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
	            StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
	        CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
	                                OMPRTL_NVPTX__kmpc_get_team_static_memory),
	                            GlobalRecordSizeArg);
	        GlobalizedRecords.back().Buffer = StaticGlobalized;
	        GlobalizedRecords.back().RecSize = RecSize;
	        GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
	        GlobalizedRecords.back().Loc = Loc;
	      }
	      assert(KernelStaticGlobalized && "Global address must be set already.");
	      Address FrameAddr = CGF.EmitLoadOfPointer(
	          Address(KernelStaticGlobalized, CGM.getPointerAlign()),
	          CGM.getContext()
	              .getPointerType(CGM.getContext().VoidPtrTy)
	              .castAs<PointerType>());
	      llvm::Value *GlobalRecValue =
	          Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
	      I->getSecond().GlobalRecordAddr = GlobalRecValue;
	      I->getSecond().IsInSPMDModeFlag = nullptr;
	      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
	          GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
	    } else {
	      // Strategy 3: push the record directly through the runtime.
	      // TODO: allow the usage of shared memory to be controlled by
	      // the user, for now, default to global.
	      bool UseSharedMemory =
	          IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
	      llvm::Value *GlobalRecordSizeArg[] = {
	          llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
	          CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
	      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
	          createNVPTXRuntimeFunction(
	              IsInTTDRegion
	                  ? OMPRTL_NVPTX__kmpc_data_sharing_push_stack
	                  : OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
	          GlobalRecordSizeArg);
	      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
	          GlobalRecValue, GlobalRecPtrTy);
	      I->getSecond().GlobalRecordAddr = GlobalRecValue;
	      I->getSecond().IsInSPMDModeFlag = nullptr;
	    }
	    LValue Base =
	        CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);

	    // Emit the "global alloca" which is a GEP from the global declaration
	    // record using the pointer returned by the runtime.
	    LValue SecBase;
	    decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
	    if (IsTTD) {
	      // View the same runtime pointer through the secondary record type;
	      // SecIt walks the secondary variables in step with the loop below.
	      SecIt = I->getSecond().SecondaryLocalVarData->begin();
	      llvm::PointerType *SecGlobalRecPtrTy =
	          CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
	      SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
	          Bld.CreatePointerBitCastOrAddrSpaceCast(
	              I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
	          SecGlobalRecTy);
	    }
	    for (auto &Rec : I->getSecond().LocalVarData) {
	      bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
	      // Only assigned (and only read) when EscapedParam is true.
	      llvm::Value *ParValue;
	      if (EscapedParam) {
	        // Load the incoming parameter value before it is remapped.
	        const auto *VD = cast<VarDecl>(Rec.first);
	        LValue ParLVal =
	            CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
	        ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
	      }
	      LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
	      // Emit VarAddr basing on lane-id if required.
	      QualType VarTy;
	      if (Rec.second.IsOnePerTeam) {
	        VarTy = Rec.second.FD->getType();
	      } else {
	        // The field is an array; index it with the NVPTX lane id.
	        llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
	            VarAddr.getAddress(CGF).getPointer(),
	            {Bld.getInt32(0), getNVPTXLaneID(CGF)});
	        VarTy =
	            Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
	        VarAddr = CGF.MakeAddrLValue(
	            Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
	            AlignmentSource::Decl);
	      }
	      Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
	      if (!IsInTTDRegion &&
	          (WithSPMDCheck ||
	           getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
	        assert(I->getSecond().IsInSPMDModeFlag &&
	               "Expected unknown execution mode or required SPMD check.");
	        if (IsTTD) {
	          assert(SecIt->second.IsOnePerTeam &&
	                 "Secondary glob data must be one per team.");
	          LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
	          VarAddr.setAddress(
	              Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF),
	                                       VarAddr.getPointer(CGF)),
	                      VarAddr.getAlignment()));
	          Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
	        }
	        // With the runtime check, SPMD execution uses a local temporary and
	        // non-SPMD execution uses the globalized slot.
	        Address GlobalPtr = Rec.second.PrivateAddr;
	        Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
	        Rec.second.PrivateAddr = Address(
	            Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
	                             LocalAddr.getPointer(), GlobalPtr.getPointer()),
	            LocalAddr.getAlignment());
	      }
	      if (EscapedParam) {
	        // Copy the parameter value into its globalized slot and remap the
	        // parameter to that address.
	        const auto *VD = cast<VarDecl>(Rec.first);
	        CGF.EmitStoreOfScalar(ParValue, VarAddr);
	        I->getSecond().MappedParams->setVarAddr(CGF, VD,
	                                                VarAddr.getAddress(CGF));
	      }
	      if (IsTTD)
	        ++SecIt;
	    }
	  }
	  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
	    // Recover pointer to this function's global record. The runtime will
	    // handle the specifics of the allocation of the memory.
	    // Use actual memory size of the record including the padding
	    // for alignment purposes.
	    CGBuilderTy &Bld = CGF.Builder;
	    llvm::Value *Size = CGF.getTypeSize(VD->getType());
	    CharUnits Align = CGM.getContext().getDeclAlign(VD);
	    // Round the run-time size up to a multiple of the alignment:
	    // Size = ((Size + Align - 1) / Align) * Align.
	    Size = Bld.CreateNUWAdd(
	        Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
	    llvm::Value *AlignVal =
	        llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
	    Size = Bld.CreateUDiv(Size, AlignVal);
	    Size = Bld.CreateNUWMul(Size, AlignVal);
	    // TODO: allow the usage of shared memory to be controlled by
	    // the user, for now, default to global.
	    llvm::Value *GlobalRecordSizeArg[] = {
	        Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
	    llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
	        createNVPTXRuntimeFunction(
	            OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
	        GlobalRecordSizeArg);
	    llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
	        GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
	    LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
	                                     CGM.getContext().getDeclAlign(VD),
	                                     AlignmentSource::Decl);
	    I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
	                                            Base.getAddress(CGF));
	    // Remembered so the epilog can pop these allocations in reverse order.
	    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
	  }
	  I->getSecond().MappedParams->apply(CGF);
	}

	/// Emits the epilog matching emitGenericVarsProlog: restores the remapped
	/// parameters and releases the storage obtained for CGF.CurFn.
	///
	/// Nothing is emitted when the data sharing mode is not Generic and the
	/// execution mode is not SPMD, or when no entry exists for the function.
	/// Variable-length allocations are popped in reverse order first; then the
	/// global record (if any) is released with the same three-way case split
	/// used by the prolog.
	///
	/// \param CGF           Function being emitted.
	/// \param WithSPMDCheck Must match the value passed to the prolog; selects
	///                      the branch guarded by the saved SPMD-mode flag.
	void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
	                                                 bool WithSPMDCheck) {
	  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
	      getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
	    return;

	  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
	  if (I != FunctionGlobalizedDecls.end()) {
	    I->getSecond().MappedParams->restore(CGF);
	    // Without an insertion point no further code can be emitted.
	    if (!CGF.HaveInsertPoint())
	      return;
	    for (llvm::Value *Addr :
	         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
	      CGF.EmitRuntimeCall(
	          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
	          Addr);
	    }
	    if (I->getSecond().GlobalRecordAddr) {
	      if (!IsInTTDRegion &&
	          (WithSPMDCheck ||
	           getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
	        // Runtime-checked case: pop the stack only on the non-SPMD path.
	        CGBuilderTy &Bld = CGF.Builder;
	        llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
	        llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
	        Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
	        // There is no need to emit line number for unconditional branch.
	        (void)ApplyDebugLocation::CreateEmpty(CGF);
	        CGF.EmitBlock(NonSPMDBB);
	        CGF.EmitRuntimeCall(
	            createNVPTXRuntimeFunction(
	                OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
	            CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
	        CGF.EmitBlock(ExitBB);
	      } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
	        assert(GlobalizedRecords.back().RegionCounter > 0 &&
	               "region counter must be > 0.");
	        --GlobalizedRecords.back().RegionCounter;
	        // Emit the restore function only in the target region.
	        if (GlobalizedRecords.back().RegionCounter == 0) {
	          QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
	              /*DestWidth=*/16, /*Signed=*/0);
	          llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
	              Address(GlobalizedRecords.back().UseSharedMemory,
	                      CGM.getContext().getTypeAlignInChars(Int16Ty)),
	              /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
	          llvm::Value *Args[] = {
	              llvm::ConstantInt::get(
	                  CGM.Int16Ty,
	                  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
	              IsInSharedMemory};
	          CGF.EmitRuntimeCall(
	              createNVPTXRuntimeFunction(
	                  OMPRTL_NVPTX__kmpc_restore_team_static_memory),
	              Args);
	        }
	      } else {
	        // Direct push-stack case: unconditionally pop the record.
	        CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
	                                OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
	                            I->getSecond().GlobalRecordAddr);
	      }
	    }
	  }
	}

	/// Emits a direct call to the outlined teams function.
	///
	/// The call receives the address of the thread id, the address of a
	/// zero-initialised i32 temporary, and then all captured variables.
	/// Nothing is emitted when \p CGF has no insertion point. \p D is unused.
	void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
	                                         const OMPExecutableDirective &D,
	                                         SourceLocation Loc,
	                                         llvm::Function *OutlinedFn,
	                                         ArrayRef<llvm::Value *> CapturedVars) {
	  if (!CGF.HaveInsertPoint())
	    return;

	  // Second argument of the outlined function: pointer to a zero value.
	  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
	                                                      /*Name=*/".zero.addr");
	  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
	  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
	  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
	  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
	  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
	  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
	}

	void CGOpenMPRuntimeNVPTX::emitParallelCall(
	CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
	ArrayRef<llvm::Value > CapturedVars, const Expr IfCond) {
	if (!CGF.HaveInsertPoint())
	return;

	if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
	emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
	else
	emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
	}

	void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
	CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
	ArrayRef<llvm::Value > CapturedVars, const Expr IfCond) {
	llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);

	// Force inline this outlined function at its call site.
	Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

	Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
	/Name=/".zero.addr");
	CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/C/ 0));
	// ThreadId for serialized parallels is 0.
	Address ThreadIDAddr = ZeroAddr;
	auto &&CodeGen = [this, Fn, CapturedVars, Loc, &ThreadIDAddr](
	CodeGenFunction &CGF, PrePostActionTy &Action) {
	Action.Enter(CGF);

	Address ZeroAddr =
	CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
	/Name=/".bound.zero.addr");
	CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/C/ 0));
	llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
	OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
	OutlinedFnArgs.push_back(ZeroAddr.getPointer());
	OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
	emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
	};
	auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
	PrePostActionTy &) {

	RegionCodeGenTy RCG(CodeGen);
	llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
	llvm::Value *ThreadID = getThreadID(CGF, Loc);
	llvm::Value *Args[] = {RTLoc, ThreadID};

	NVPTXActionTy Action(
	createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
	Args,
	createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
	Args);
	RCG.setAction(Action);
	RCG(CGF);
	};

	auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
	PrePostActionTy &Action) {
	CGBuilderTy &Bld = CGF.Builder;
	llvm::Function *WFn = WrapperFunctionsMap[Fn];
	assert(WFn && "Wrapper function does not exist!");
	llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);

	// Prepare for parallel region. Indicate the outlined function.
	llvm::Value *Args[] = {ID};
	CGF.EmitRuntimeCall(
	createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
	Args);

	// Create a private scope that will globalize the arguments
	// passed from the outside of the target region.
	CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

	// There's something to share.
	if (!CapturedVars.empty()) {
	// Prepare for parallel region. Indicate the outlined function.
	Address SharedArgs =
	CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
	llvm::Value *SharedArgsPtr = SharedArgs.getPointer();

	llvm::Value *DataSharingArgs[] = {
	SharedArgsPtr,
	llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
	CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
	OMPRTL_NVPTX__kmpc_begin_sharing_variables),
	DataSharingArgs);

	// Store variable address in a list of references to pass to workers.
	unsigned Idx = 0;
	ASTContext &Ctx = CGF.getContext();
	Address SharedArgListAddress = CGF.EmitLoadOfPointer(
	SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
	.castAs<PointerType>());
	for (llvm::Value *V : CapturedVars) {
	Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
	llvm::Value *PtrV;
	if (V->getType()->isIntegerTy())
	PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
	else
	PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
	CGF.EmitStoreOfScalar(PtrV, Dst, /Volatile=/false,
	Ctx.getPointerType(Ctx.VoidPtrTy));
	++Idx;
	}
	}

	// Activate workers. This barrier is used by the master to signal
	// work for the workers.
	syncCTAThreads(CGF);

	// OpenMP [2.5, Parallel Construct, p.49]
	// There is an implied barrier at the end of a parallel region. After the
	// end of a parallel region, only the master thread of the team resumes
	// execution of the enclosing task region.
	//
	// The master waits at this barrier until all workers are done.
	syncCTAThreads(CGF);

	if (!CapturedVars.empty())
	CGF.EmitRuntimeCall(
	createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));

	// Remember for post-processing in worker loop.
	Work.emplace_back(WFn);
	};

	auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
	CodeGenFunction &CGF, PrePostActionTy &Action) {
	if (IsInParallelRegion) {
	SeqGen(CGF, Action);
	} else if (IsInTargetMasterThreadRegion) {
	L0ParallelGen(CGF, Action);
	} else {
	// Check for master and then parallelism:
	// if (__kmpc_is_spmd_exec_mode() \|\| __kmpc_parallel_level(loc, gtid)) {
	// Serialized execution.
	// } else {
	// Worker call.
	// }
	CGBuilderTy &Bld = CGF.Builder;
	llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
	llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
	llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
	llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
	llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
	createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
	Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
	// There is no need to emit line number for unconditional branch.
	(void)ApplyDebugLocation::CreateEmpty(CGF);
	CGF.EmitBlock(ParallelCheckBB);
	llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
	llvm::Value *ThreadID = getThreadID(CGF, Loc);
	llvm::Value *PL = CGF.EmitRuntimeCall(
	createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
	{RTLoc, ThreadID});
	llvm::Value *Res = Bld.CreateIsNotNull(PL);
	Bld.CreateCondBr(Res, SeqBB, MasterBB);
	CGF.EmitBlock(SeqBB);
	SeqGen(CGF, Action);
	CGF.EmitBranch(ExitBB);
	// There is no need to emit line number for unconditional branch.
	(void)ApplyDebugLocation::CreateEmpty(CGF);
	CGF.EmitBlock(MasterBB);
	L0ParallelGen(CGF, Action);
	CGF.EmitBranch(ExitBB);
	// There is no need to emit line number for unconditional branch.
	(void)ApplyDebugLocation::CreateEmpty(CGF);
	// Emit the continuation block for code after the if.
	CGF.EmitBlock(ExitBB, /IsFinished=/true);
	}
	};

	if (IfCond) {
	emitIfClause(CGF, IfCond, LNParallelGen, SeqGen);
	} else {
	CodeGenFunction::RunCleanupsScope Scope(CGF);
	RegionCodeGenTy ThenRCG(LNParallelGen);
	ThenRCG(CGF);
	}
	}

	void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
	CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
	ArrayRef<llvm::Value > CapturedVars, const Expr IfCond) {
	// Just call the outlined function to execute the parallel region.
	// OutlinedFn(&GTid, &zero, CapturedStruct);
	//
	llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;

	Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
	/Name=/".zero.addr");
	CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/C/ 0));
	// ThreadId for serialized parallels is 0.
	Address ThreadIDAddr = ZeroAddr;
	auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, &ThreadIDAddr](
	CodeGenFunction &CGF, PrePostActionTy &Action) {
	Action.Enter(CGF);

	Address ZeroAddr =
	CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
	/Name=/".bound.zero.addr");
	CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/C/ 0));
	llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
	OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
	OutlinedFnArgs.push_back(ZeroAddr.getPointer());
	OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
	emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
	};
	auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
	PrePostActionTy &) {

	RegionCodeGenTy RCG(CodeGen);
	llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
	llvm::Value *ThreadID = getThreadID(CGF, Loc);
	llvm::Value *Args[] = {RTLoc, ThreadID};

	NVPTXActionTy Action(
	createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
	Args,
	createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
	Args);
	RCG.setAction(Action);
	RCG(CGF);
	};

	if (IsInTargetMasterThreadRegion) {
	// In the worker need to use the real thread id.
	ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
	RegionCodeGenTy RCG(CodeGen);
	RCG(CGF);
	} else {
	// If we are not in the target region, it is definitely L2 parallelism or
	// more, because for SPMD mode we always has L1 parallel level, sowe don't
	// need to check for orphaned directives.
	RegionCodeGenTy RCG(SeqGen);
	RCG(CGF);
	}
	}

	/// Emits a call to the runtime's simple SPMD barrier and marks the call as
	/// convergent. Nothing is emitted when \p CGF has no insertion point.
	void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
	  // Always emit simple barriers!
	  if (!CGF.HaveInsertPoint())
	    return;
	  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
	  // This function does not use parameters, so we can emit just default values.
	  llvm::Value *Args[] = {
	      llvm::ConstantPointerNull::get(
	          cast<llvm::PointerType>(getIdentTyPointerTy())),
	      llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
	  llvm::CallInst *Call = CGF.EmitRuntimeCall(
	      createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args);
	  Call->setConvergent();
	}

	/// Emits a barrier for the directive kind \p Kind.
	///
	/// The runtime entry point identified by OMPRTL__kmpc_barrier is called with
	/// the source location (carrying the default barrier flags for \p Kind) and
	/// the current thread id; the call is marked convergent. The two trailing
	/// bool parameters are ignored. Nothing is emitted when \p CGF has no
	/// insertion point.
	void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
	                                           SourceLocation Loc,
	                                           OpenMPDirectiveKind Kind, bool,
	                                           bool) {
	  if (!CGF.HaveInsertPoint())
	    return;

	  // Arguments are (loc, thread_id), evaluated in that order.
	  const unsigned BarrierFlags = getDefaultFlagsForBarriers(Kind);
	  llvm::Value *LocArg = emitUpdateLocation(CGF, Loc, BarrierFlags);
	  llvm::Value *ThreadIDArg = getThreadID(CGF, Loc);
	  llvm::Value *BarrierArgs[] = {LocArg, ThreadIDArg};

	  llvm::CallInst *BarrierCall = CGF.EmitRuntimeCall(
	      createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), BarrierArgs);
	  BarrierCall->setConvergent();
	}

	/// Emits a critical region as a loop over the threads of the team.
	///
	/// The generated code has this shape:
	///   counter = 0;
	///   while (counter < team_width) {
	///     if (thread_id == counter) { <critical body via base class> }
	///     __kmpc_syncwarp(mask);
	///     ++counter;
	///   }
	/// so each thread executes the body in the iteration matching its id.
	///
	/// \param CriticalName  Name of the critical region, forwarded to the base.
	/// \param CriticalOpGen Generator for the body, forwarded to the base.
	/// \param Hint          Optional hint expression, forwarded to the base.
	void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
	    CodeGenFunction &CGF, StringRef CriticalName,
	    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
	    const Expr *Hint) {
	  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
	  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
	  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
	  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
	  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

	  // Get the mask of active threads in the warp.
	  llvm::Value *Mask = CGF.EmitRuntimeCall(
	      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask));
	  // Fetch team-local id of the thread.
	  llvm::Value *ThreadID = getNVPTXThreadID(CGF);

	  // Get the width of the team.
	  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);

	  // Initialize the counter variable for the loop.
	  QualType Int32Ty =
	      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
	  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
	  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
	  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
	                        /*isInit=*/true);

	  // Block checks if loop counter exceeds upper bound.
	  CGF.EmitBlock(LoopBB);
	  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
	  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
	  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);

	  // Block tests which single thread should execute region, and which threads
	  // should go straight to synchronisation point.
	  CGF.EmitBlock(TestBB);
	  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
	  llvm::Value *CmpThreadToCounter =
	      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
	  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);

	  // Block emits the body of the critical region.
	  CGF.EmitBlock(BodyBB);

	  // Output the critical statement.
	  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
	                                      Hint);

	  // After the body surrounded by the critical region, the single executing
	  // thread will jump to the synchronisation point.
	  // Block waits for all threads in current team to finish then increments the
	  // counter variable and returns to the loop.
	  CGF.EmitBlock(SyncBB);
	  // Reconverge active threads in the warp.
	  (void)CGF.EmitRuntimeCall(
	      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_syncwarp), Mask);

	  llvm::Value *IncCounterVal =
	      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
	  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
	  CGF.EmitBranch(LoopBB);

	  // Block that is reached when all threads in the team complete the region.
	  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
	}

	/// Cast value to the specified type.
	///
	/// Both \p ValTy and \p CastTy must have a non-zero size (asserted). The
	/// conversion strategy is picked in this order:
	///   1. identical types: \p Val is returned untouched;
	///   2. equal sizes: a bitcast to the memory type of \p CastTy;
	///   3. both integer types: an integer cast whose signedness follows
	///      \p CastTy;
	///   4. otherwise: \p Val is stored into a fresh temporary of type \p CastTy
	///      (viewed through a pointer to the value's own type) and the temporary
	///      is loaded back as \p CastTy.
	///
	/// \param CGF    Function into which the instructions are emitted.
	/// \param Val    Value to convert.
	/// \param ValTy  Clang type of \p Val.
	/// \param CastTy Clang type to convert to.
	/// \param Loc    Source location used for the final load.
	/// \returns The converted value.
	static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
	                                    QualType ValTy, QualType CastTy,
	                                    SourceLocation Loc) {
	  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
	         "Cast type must sized.");
	  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
	         "Val type must sized.");
	  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
	  if (ValTy == CastTy)
	    return Val;
	  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
	      CGF.getContext().getTypeSizeInChars(CastTy))
	    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
	  if (CastTy->isIntegerType() && ValTy->isIntegerType())
	    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
	                                     CastTy->hasSignedIntegerRepresentation());
	  // Sizes differ and at least one side is not an integer: go through memory.
	  Address CastItem = CGF.CreateMemTemp(CastTy);
	  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	      CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
	  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
	  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
	}

	/// This function creates calls to one of two shuffle functions to copy
	/// variables between lanes in a warp.
	///
	/// Elements of up to 4 bytes use the 32-bit runtime shuffle entry point;
	/// larger elements (at most 8 bytes, asserted) use the 64-bit one. The
	/// element is first converted to a signed integer of the matching width, the
	/// runtime function is called with that value, \p Offset and the warp size
	/// (converted to a 16-bit integer), and the result is converted back to
	/// \p ElemType.
	///
	/// \param CGF      Function into which the call is emitted.
	/// \param Elem     Value to shuffle.
	/// \param ElemType Clang type of \p Elem.
	/// \param Offset   Lane offset forwarded to the runtime shuffle function.
	/// \param Loc      Source location used for any loads emitted by the casts.
	/// \returns The shuffled value, typed as \p ElemType.
	static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
	                                                 llvm::Value *Elem,
	                                                 QualType ElemType,
	                                                 llvm::Value *Offset,
	                                                 SourceLocation Loc) {
	  CodeGenModule &CGM = CGF.CGM;
	  CGBuilderTy &Bld = CGF.Builder;
	  CGOpenMPRuntimeNVPTX &RT =
	      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));

	  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
	  assert(Size.getQuantity() <= 8 &&
	         "Unsupported bitwidth in shuffle instruction.");

	  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
	                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
	                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;

	  // Cast all types to 32- or 64-bit values before calling shuffle routines.
	  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
	      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
	  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
	  llvm::Value *WarpSize =
	      Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);

	  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
	      RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});

	  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
	}

	/// Shuffle one element of type \p ElemType from \p SrcAddr and store the
	/// shuffled result at \p DestAddr.
	///
	/// The element is processed in integer chunks of 8, 4, 2 and 1 bytes, in that
	/// order. For each chunk width that still fits in the remaining size, the
	/// chunk is loaded, passed through createRuntimeShuffleFunction with
	/// \p Offset, and stored to the destination. When more than one chunk of a
	/// given width is needed, a loop driven by two PHI nodes (source and
	/// destination pointers) is emitted; otherwise a single straight-line
	/// shuffle is emitted. The remaining size is reduced modulo the chunk width
	/// after each pass.
	///
	/// \param CGF      Function into which the instructions are emitted.
	/// \param SrcAddr  Address of the element to read.
	/// \param DestAddr Address that receives the shuffled element.
	/// \param ElemType Clang type of the element; determines the byte count.
	/// \param Offset   Lane offset forwarded to the runtime shuffle function.
	/// \param Loc      Source location used for the emitted loads.
	static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
	                            Address DestAddr, QualType ElemType,
	                            llvm::Value *Offset, SourceLocation Loc) {
	  CGBuilderTy &Bld = CGF.Builder;

	  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
	  // Create the loop over the big sized data.
	  // ptr = (void*)Elem;
	  // ptrEnd = (void*) Elem + 1;
	  // Step = 8;
	  // while (ptr + Step < ptrEnd)
	  //   shuffle((int64_t)*ptr);
	  // Step = 4;
	  // while (ptr + Step < ptrEnd)
	  //   shuffle((int32_t)*ptr);
	  // ...
	  Address ElemPtr = DestAddr;
	  Address Ptr = SrcAddr;
	  // One-past-the-end of the source element, as a void pointer.
	  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
	      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
	  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
	    // Skip chunk widths larger than what is left to copy.
	    if (Size < CharUnits::fromQuantity(IntSize))
	      continue;
	    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
	        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
	        /*Signed=*/1);
	    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
	    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
	    ElemPtr =
	        Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
	    if (Size.getQuantity() / IntSize > 1) {
	      // Several chunks of this width: emit a pre-condition/body/exit loop.
	      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
	      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
	      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
	      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
	      CGF.EmitBlock(PreCondBB);
	      llvm::PHINode *PhiSrc =
	          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
	      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
	      llvm::PHINode *PhiDest =
	          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
	      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
	      Ptr = Address(PhiSrc, Ptr.getAlignment());
	      ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
	      // Keep looping while at least IntSize bytes remain before PtrEnd.
	      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
	          PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
	                                   Ptr.getPointer(), CGF.VoidPtrTy));
	      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
	                       ThenBB, ExitBB);
	      CGF.EmitBlock(ThenBB);
	      llvm::Value *Res = createRuntimeShuffleFunction(
	          CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
	          IntType, Offset, Loc);
	      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
	      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
	      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
	      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
	      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
	      CGF.EmitBranch(PreCondBB);
	      CGF.EmitBlock(ExitBB);
	    } else {
	      // Exactly one chunk of this width: shuffle it inline and advance.
	      llvm::Value *Res = createRuntimeShuffleFunction(
	          CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
	          IntType, Offset, Loc);
	      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
	      Ptr = Bld.CreateConstGEP(Ptr, 1);
	      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
	    }
	    // Whatever does not divide evenly is left for the narrower widths.
	    Size = Size % IntSize;
	  }
	}

	namespace {
	/// Direction of a Reduce list copy.
	enum CopyAction : unsigned {
	  /// Copy over a Reduce list from a remote lane in the warp using shuffle
	  /// instructions.
	  RemoteLaneToThread = 0,
	  /// Make a copy of a Reduce list on the thread's stack.
	  ThreadCopy = 1,
	  /// Copy a team-reduced array to the scratchpad.
	  ThreadToScratchpad = 2,
	  /// Copy from a scratchpad array in global memory containing team-reduced
	  /// data to a thread's stack.
	  ScratchpadToThread = 3,
	};
	} // namespace

	// Extra operands for emitReductionListCopy. The struct is brace-initialised
	// by its users; members a given copy direction does not need are passed as
	// nullptr.
	struct CopyOptionsTy {
	// Lane offset handed to the warp shuffle when copying from a remote lane.
	llvm::Value *RemoteLaneOffset;
	// Index multiplied by the element size to locate an element in the
	// scratchpad.
	llvm::Value *ScratchpadIndex;
	// Width multiplied by the element size to advance the scratchpad base to
	// the next element.
	llvm::Value *ScratchpadWidth;
	};

	/// Emit instructions to copy a Reduce list, which contains partially
	/// aggregated values, in the specified direction.
	///
	/// For every expression in \p Privates the function (1) forms the source and
	/// destination element addresses according to \p Action, (2) copies the
	/// element (through warp shuffles for RemoteLaneToThread, otherwise as a
	/// scalar, complex or aggregate copy), (3) for actions that create a new
	/// temporary element, stores its address into the destination Reduce list,
	/// and (4) for scratchpad actions, advances the scratchpad base to the next
	/// element, rounded to GlobalMemoryAlignment.
	///
	/// \param Action           Direction of the copy.
	/// \param CGF              Function into which the instructions are emitted.
	/// \param ReductionArrayTy Type of the Reduce list array (not referenced in
	///                         the body).
	/// \param Privates         Reduction items; only their types and locations
	///                         are used.
	/// \param SrcBase          Base of the source Reduce list or scratchpad.
	/// \param DestBase         Base of the destination Reduce list or scratchpad.
	/// \param CopyOptions      Lane offset and scratchpad index/width operands.
	static void emitReductionListCopy(
	    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
	    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
	    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {

	  CodeGenModule &CGM = CGF.CGM;
	  ASTContext &C = CGM.getContext();
	  CGBuilderTy &Bld = CGF.Builder;

	  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
	  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
	  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;

	  // Iterates, element-by-element, through the source Reduce list and
	  // make a copy.
	  unsigned Idx = 0;
	  unsigned Size = Privates.size();
	  for (const Expr *Private : Privates) {
	    Address SrcElementAddr = Address::invalid();
	    Address DestElementAddr = Address::invalid();
	    Address DestElementPtrAddr = Address::invalid();
	    // Should we shuffle in an element from a remote lane?
	    bool ShuffleInElement = false;
	    // Set to true to update the pointer in the dest Reduce list to a
	    // newly created element.
	    bool UpdateDestListPtr = false;
	    // Increment the src or dest pointer to the scratchpad, for each
	    // new element.
	    bool IncrScratchpadSrc = false;
	    bool IncrScratchpadDest = false;

	    switch (Action) {
	    case RemoteLaneToThread: {
	      // Step 1.1: Get the address for the src element in the Reduce list.
	      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
	      SrcElementAddr = CGF.EmitLoadOfPointer(
	          SrcElementPtrAddr,
	          C.getPointerType(Private->getType())->castAs<PointerType>());

	      // Step 1.2: Create a temporary to store the element in the destination
	      // Reduce list.
	      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
	      DestElementAddr =
	          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
	      ShuffleInElement = true;
	      UpdateDestListPtr = true;
	      break;
	    }
	    case ThreadCopy: {
	      // Step 1.1: Get the address for the src element in the Reduce list.
	      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
	      SrcElementAddr = CGF.EmitLoadOfPointer(
	          SrcElementPtrAddr,
	          C.getPointerType(Private->getType())->castAs<PointerType>());

	      // Step 1.2: Get the address for dest element. The destination
	      // element has already been created on the thread's stack.
	      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
	      DestElementAddr = CGF.EmitLoadOfPointer(
	          DestElementPtrAddr,
	          C.getPointerType(Private->getType())->castAs<PointerType>());
	      break;
	    }
	    case ThreadToScratchpad: {
	      // Step 1.1: Get the address for the src element in the Reduce list.
	      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
	      SrcElementAddr = CGF.EmitLoadOfPointer(
	          SrcElementPtrAddr,
	          C.getPointerType(Private->getType())->castAs<PointerType>());

	      // Step 1.2: Get the address for dest element:
	      // address = base + index * ElementSizeInChars.
	      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
	      llvm::Value *CurrentOffset =
	          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
	      llvm::Value *ScratchPadElemAbsolutePtrVal =
	          Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
	      ScratchPadElemAbsolutePtrVal =
	          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
	      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
	                                C.getTypeAlignInChars(Private->getType()));
	      IncrScratchpadDest = true;
	      break;
	    }
	    case ScratchpadToThread: {
	      // Step 1.1: Get the address for the src element in the scratchpad.
	      // address = base + index * ElementSizeInChars.
	      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
	      llvm::Value *CurrentOffset =
	          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
	      llvm::Value *ScratchPadElemAbsolutePtrVal =
	          Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
	      ScratchPadElemAbsolutePtrVal =
	          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
	      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
	                               C.getTypeAlignInChars(Private->getType()));
	      IncrScratchpadSrc = true;

	      // Step 1.2: Create a temporary to store the element in the destination
	      // Reduce list.
	      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
	      DestElementAddr =
	          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
	      UpdateDestListPtr = true;
	      break;
	    }
	    }

	    // Regardless of src and dest of copy, we emit the load of src
	    // element as this is required in all directions
	    SrcElementAddr = Bld.CreateElementBitCast(
	        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
	    DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
	                                               SrcElementAddr.getElementType());

	    // Now that all active lanes have read the element in the
	    // Reduce list, shuffle over the value from the remote lane.
	    if (ShuffleInElement) {
	      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
	                      RemoteLaneOffset, Private->getExprLoc());
	    } else {
	      switch (CGF.getEvaluationKind(Private->getType())) {
	      case TEK_Scalar: {
	        llvm::Value *Elem =
	            CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
	                                 Private->getType(), Private->getExprLoc());
	        // Store the source element value to the dest element address.
	        CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
	                              Private->getType());
	        break;
	      }
	      case TEK_Complex: {
	        CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
	            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
	            Private->getExprLoc());
	        CGF.EmitStoreOfComplex(
	            Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
	            /*isInit=*/false);
	        break;
	      }
	      case TEK_Aggregate:
	        CGF.EmitAggregateCopy(
	            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
	            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
	            Private->getType(), AggValueSlot::DoesNotOverlap);
	        break;
	      }
	    }

	    // Step 3.1: Modify reference in dest Reduce list as needed.
	    // Modifying the reference in Reduce list to point to the newly
	    // created element. The element is live in the current function
	    // scope and that of functions it invokes (i.e., reduce_function).
	    // RemoteReduceData[i] = (void*)&RemoteElem
	    if (UpdateDestListPtr) {
	      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
	                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
	                            DestElementPtrAddr, /*Volatile=*/false,
	                            C.VoidPtrTy);
	    }

	    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
	    // address of the next element in scratchpad memory, unless we're currently
	    // processing the last one. Memory alignment is also taken care of here.
	    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
	      llvm::Value *ScratchpadBasePtr =
	          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
	      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
	      ScratchpadBasePtr = Bld.CreateNUWAdd(
	          ScratchpadBasePtr,
	          Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));

	      // Take care of global memory alignment for performance:
	      // base = ((base - 1) / GlobalMemoryAlignment + 1) * GlobalMemoryAlignment
	      ScratchpadBasePtr = Bld.CreateNUWSub(
	          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
	      ScratchpadBasePtr = Bld.CreateUDiv(
	          ScratchpadBasePtr,
	          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
	      ScratchpadBasePtr = Bld.CreateNUWAdd(
	          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
	      ScratchpadBasePtr = Bld.CreateNUWMul(
	          ScratchpadBasePtr,
	          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));

	      if (IncrScratchpadDest)
	        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
	      else /* IncrScratchpadSrc = true */
	        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
	    }

	    ++Idx;
	  }
	}

	/// This function emits a helper that gathers Reduce lists from the first
	/// lane of every active warp to lanes in the first warp.
	///
	/// void inter_warp_copy_func(void* reduce_data, num_warps)
	///   shared smem[warp_size];
	///   For all data entries D in reduce_data:
	///     sync
	///     If (I am the first lane in each warp)
	///       Copy my local D to smem[warp_id]
	///     sync
	///     if (I am the first warp)
	///       Copy smem[thread_id] to my local D
	///
	/// Each reduce element is moved through the transfer medium in integer
	/// chunks of 4, 2 and 1 bytes; when an element needs more than one chunk of a
	/// given width, a counted loop is emitted around the two barrier-separated
	/// copy phases.
	///
	/// \param CGM              Module that receives the helper.
	/// \param Privates         Reduction items; one copy sequence is emitted per
	///                         item.
	/// \param ReductionArrayTy Type of the Reduce list array the first argument
	///                         points to.
	/// \param Loc              Source location attached to the helper.
	/// \returns The emitted internal-linkage function.
	static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
	                                              ArrayRef<const Expr *> Privates,
	                                              QualType ReductionArrayTy,
	                                              SourceLocation Loc) {
	  ASTContext &C = CGM.getContext();
	  llvm::Module &M = CGM.getModule();

	  // ReduceList: thread local Reduce list.
	  // At the stage of the computation when this function is called, partially
	  // aggregated values reside in the first lane of every active warp.
	  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
	                                  C.VoidPtrTy, ImplicitParamDecl::Other);
	  // NumWarps: number of warps active in the parallel region. This could
	  // be smaller than 32 (max warps in a CTA) for partial block reduction.
	  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
	                                C.getIntTypeForBitwidth(32, /* Signed */ true),
	                                ImplicitParamDecl::Other);
	  FunctionArgList Args;
	  Args.push_back(&ReduceListArg);
	  Args.push_back(&NumWarpsArg);

	  const CGFunctionInfo &CGFI =
	      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
	  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
	                                    llvm::GlobalValue::InternalLinkage,
	                                    "_omp_reduction_inter_warp_copy_func", &M);
	  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
	  Fn->setDoesNotRecurse();
	  CodeGenFunction CGF(CGM);
	  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

	  CGBuilderTy &Bld = CGF.Builder;

	  // This array is used as a medium to transfer, one reduce element at a time,
	  // the data from the first lane of every warp to lanes in the first warp
	  // in order to perform the final step of a reduction in a parallel region
	  // (reduction across warps). The array is placed in NVPTX __shared__ memory
	  // for reduced latency, as well as to have a distinct copy for concurrently
	  // executing target regions. The array is declared with common linkage so
	  // as to be shared across compilation units.
	  StringRef TransferMediumName =
	      "__openmp_nvptx_data_transfer_temporary_storage";
	  llvm::GlobalVariable *TransferMedium =
	      M.getGlobalVariable(TransferMediumName);
	  if (!TransferMedium) {
	    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
	    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
	    TransferMedium = new llvm::GlobalVariable(
	        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
	        llvm::Constant::getNullValue(Ty), TransferMediumName,
	        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
	        SharedAddressSpace);
	    CGM.addCompilerUsedGlobal(TransferMedium);
	  }

	  // Get the CUDA thread id of the current OpenMP thread on the GPU.
	  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
	  // nvptx_lane_id = nvptx_id % warpsize
	  llvm::Value *LaneID = getNVPTXLaneID(CGF);
	  // nvptx_warp_id = nvptx_id / warpsize
	  llvm::Value *WarpID = getNVPTXWarpID(CGF);

	  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
	  Address LocalReduceList(
	      Bld.CreatePointerBitCastOrAddrSpaceCast(
	          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
	                               C.VoidPtrTy, Loc),
	          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
	      CGF.getPointerAlign());

	  unsigned Idx = 0;
	  for (const Expr *Private : Privates) {
	    //
	    // Warp master copies reduce element to transfer medium in __shared__
	    // memory.
	    //
	    // Size of the element rounded up to its alignment.
	    unsigned RealTySize =
	        C.getTypeSizeInChars(Private->getType())
	            .alignTo(C.getTypeAlignInChars(Private->getType()))
	            .getQuantity();
	    // Transfer in 4-, then 2-, then 1-byte chunks until nothing is left.
	    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
	      unsigned NumIters = RealTySize / TySize;
	      if (NumIters == 0)
	        continue;
	      QualType CType = C.getIntTypeForBitwidth(
	          C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
	      llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
	      CharUnits Align = CharUnits::fromQuantity(TySize);
	      llvm::Value *Cnt = nullptr;
	      Address CntAddr = Address::invalid();
	      llvm::BasicBlock *PrecondBB = nullptr;
	      llvm::BasicBlock *ExitBB = nullptr;
	      if (NumIters > 1) {
	        // More than one chunk of this width: wrap the copy in a counted loop.
	        CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
	        CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
	                              /*Volatile=*/false, C.IntTy);
	        PrecondBB = CGF.createBasicBlock("precond");
	        ExitBB = CGF.createBasicBlock("exit");
	        llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
	        // There is no need to emit line number for unconditional branch.
	        (void)ApplyDebugLocation::CreateEmpty(CGF);
	        CGF.EmitBlock(PrecondBB);
	        Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
	        llvm::Value *Cmp =
	            Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
	        Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
	        CGF.EmitBlock(BodyBB);
	      }
	      // kmpc_barrier.
	      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
	                                             /*EmitChecks=*/false,
	                                             /*ForceSimpleCall=*/true);
	      llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
	      llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
	      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

	      // if (lane_id == 0)
	      llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
	      Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
	      CGF.EmitBlock(ThenBB);

	      // Reduce element = LocalReduceList[i]
	      Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
	      llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
	          ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
	      // elemptr = ((CopyType*)(elemptrptr)) + I
	      Address ElemPtr = Address(ElemPtrPtr, Align);
	      ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
	      if (NumIters > 1) {
	        ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
	                          ElemPtr.getAlignment());
	      }

	      // Get pointer to location in transfer medium.
	      // MediumPtr = &medium[warp_id]
	      llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
	          TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
	      Address MediumPtr(MediumPtrVal, Align);
	      // Casting to actual data type.
	      // MediumPtr = (CopyType*)MediumPtrAddr;
	      MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);

	      // elem = *elemptr
	      //*MediumPtr = elem
	      llvm::Value *Elem =
	          CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
	      // Store the source element value to the dest element address.
	      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);

	      Bld.CreateBr(MergeBB);

	      CGF.EmitBlock(ElseBB);
	      Bld.CreateBr(MergeBB);

	      CGF.EmitBlock(MergeBB);

	      // kmpc_barrier.
	      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
	                                             /*EmitChecks=*/false,
	                                             /*ForceSimpleCall=*/true);

	      //
	      // Warp 0 copies reduce element from transfer medium.
	      //
	      llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
	      llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
	      llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");

	      Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
	      llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
	          AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);

	      // Up to 32 threads in warp 0 are active.
	      llvm::Value *IsActiveThread =
	          Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
	      Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

	      CGF.EmitBlock(W0ThenBB);

	      // SrcMediumPtr = &medium[tid]
	      llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
	          TransferMedium,
	          {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
	      Address SrcMediumPtr(SrcMediumPtrVal, Align);
	      // SrcMediumVal = *SrcMediumPtr;
	      SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);

	      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
	      Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
	      llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
	          TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
	      Address TargetElemPtr = Address(TargetElemPtrVal, Align);
	      TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
	      if (NumIters > 1) {
	        TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
	                                TargetElemPtr.getAlignment());
	      }

	      // *TargetElemPtr = SrcMediumVal;
	      llvm::Value *SrcMediumValue =
	          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
	      CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
	                            CType);
	      Bld.CreateBr(W0MergeBB);

	      CGF.EmitBlock(W0ElseBB);
	      Bld.CreateBr(W0MergeBB);

	      CGF.EmitBlock(W0MergeBB);

	      if (NumIters > 1) {
	        // Bump the chunk counter and go back to the loop pre-condition.
	        Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
	        CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
	        CGF.EmitBranch(PrecondBB);
	        (void)ApplyDebugLocation::CreateEmpty(CGF);
	        CGF.EmitBlock(ExitBB);
	      }
	      // Bytes not covered by this width are handled by the narrower ones.
	      RealTySize %= TySize;
	    }
	    ++Idx;
	  }

	  CGF.FinishFunction();
	  return Fn;
	}

	/// Emit a helper that reduces data across two OpenMP threads (lanes)
	/// in the same warp. It uses shuffle instructions to copy over data from
	/// a remote lane's stack. The reduction algorithm performed is specified
	/// by the fourth parameter.
	///
	/// Algorithm Versions.
	/// Full Warp Reduce (argument value 0):
	///   This algorithm assumes that all 32 lanes are active and gathers
	///   data from these 32 lanes, producing a single resultant value.
	/// Contiguous Partial Warp Reduce (argument value 1):
	///   This algorithm assumes that only a *contiguous* subset of lanes
	///   are active. This happens for the last warp in a parallel region
	///   when the user specified num_threads is not an integer multiple of
	///   32. This contiguous subset always starts with the zeroth lane.
	/// Partial Warp Reduce (argument value 2):
	///   This algorithm gathers data from any number of lanes at any position.
	/// All reduced values are stored in the lowest possible lane. The set
	/// of problems every algorithm addresses is a super set of those
	/// addressable by algorithms with a lower version number. Overhead
	/// increases as algorithm version increases.
	///
	/// Terminology
	/// Reduce element:
	///   Reduce element refers to the individual data field with primitive
	///   data types to be combined and reduced across threads.
	/// Reduce list:
	///   Reduce list refers to a collection of local, thread-private
	///   reduce elements.
	/// Remote Reduce list:
	///   Remote Reduce list refers to a collection of remote (relative to
	///   the current thread) reduce elements.
	///
	/// We distinguish between three states of threads that are important to
	/// the implementation of this function.
	/// Alive threads:
	///   Threads in a warp executing the SIMT instruction, as distinguished from
	///   threads that are inactive due to divergent control flow.
	/// Active threads:
	///   The minimal set of threads that has to be alive upon entry to this
	///   function. The computation is correct iff active threads are alive.
	///   Some threads are alive but they are not active because they do not
	///   contribute to the computation in any useful manner. Turning them off
	///   may introduce control flow overheads without any tangible benefits.
	/// Effective threads:
	///   In order to comply with the argument requirements of the shuffle
	///   function, we must keep all lanes holding data alive. But at most
	///   half of them perform value aggregation; we refer to this half of
	///   threads as effective. The other half is simply handing off their
	///   data.
	///
	/// Procedure
	/// Value shuffle:
	///   In this step active threads transfer data from higher lane positions
	///   in the warp to lower lane positions, creating Remote Reduce list.
	/// Value aggregation:
	///   In this step, effective threads combine their thread local Reduce list
	///   with Remote Reduce list and store the result in the thread local
	///   Reduce list.
	/// Value copy:
	///   In this step, we deal with the assumption made by the Contiguous
	///   Partial Warp Reduce algorithm (argument value 1), i.e. the contiguity
	///   assumption. When we have an odd number of lanes active, say 2k+1,
	///   only k threads will be effective and therefore k new values will be
	///   produced. However, the Reduce list owned by the (2k+1)th thread is
	///   ignored in the value aggregation. Therefore we copy the Reduce list
	///   from the (2k+1)th lane to (k+1)th lane so that the contiguity
	///   assumption still holds.
	///
	/// \param CGM              Module that receives the helper.
	/// \param Privates         Reduction items copied between lanes.
	/// \param ReductionArrayTy Type of the Reduce list array.
	/// \param ReduceFn         Function called as
	///                         reduce_function(LocalReduceList, RemoteReduceList).
	/// \param Loc              Source location attached to the helper.
	/// \returns The emitted internal-linkage function.
	static llvm::Function *emitShuffleAndReduceFunction(
	    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
	    QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
	  ASTContext &C = CGM.getContext();

	  // Thread local Reduce list used to host the values of data to be reduced.
	  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
	                                  C.VoidPtrTy, ImplicitParamDecl::Other);
	  // Current lane id; could be logical.
	  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
	                              ImplicitParamDecl::Other);
	  // Offset of the remote source lane relative to the current lane.
	  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
	                                        C.ShortTy, ImplicitParamDecl::Other);
	  // Algorithm version. This is expected to be known at compile time.
	  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
	                               C.ShortTy, ImplicitParamDecl::Other);
	  FunctionArgList Args;
	  Args.push_back(&ReduceListArg);
	  Args.push_back(&LaneIDArg);
	  Args.push_back(&RemoteLaneOffsetArg);
	  Args.push_back(&AlgoVerArg);

	  const CGFunctionInfo &CGFI =
	      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
	  auto *Fn = llvm::Function::Create(
	      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
	      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
	  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
	  Fn->setDoesNotRecurse();
	  // When optimizing, force the helper to be inlined into its callers.
	  if (CGM.getLangOpts().Optimize) {
	    Fn->removeFnAttr(llvm::Attribute::NoInline);
	    Fn->removeFnAttr(llvm::Attribute::OptimizeNone);
	    Fn->addFnAttr(llvm::Attribute::AlwaysInline);
	  }

	  CodeGenFunction CGF(CGM);
	  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

	  CGBuilderTy &Bld = CGF.Builder;

	  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
	  Address LocalReduceList(
	      Bld.CreatePointerBitCastOrAddrSpaceCast(
	          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
	                               C.VoidPtrTy, SourceLocation()),
	          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
	      CGF.getPointerAlign());

	  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
	  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
	      AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

	  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
	  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
	      AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

	  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
	  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
	      AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

	  // Create a local thread-private variable to host the Reduce list
	  // from a remote lane.
	  Address RemoteReduceList =
	      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");

	  // This loop iterates through the list of reduce elements and copies,
	  // element by element, from a remote lane in the warp to RemoteReduceList,
	  // hosted on the thread's stack.
	  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
	                        LocalReduceList, RemoteReduceList,
	                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
	                         /*ScratchpadIndex=*/nullptr,
	                         /*ScratchpadWidth=*/nullptr});

	  // The actions to be performed on the Remote Reduce list is dependent
	  // on the algorithm version.
	  //
	  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
	  //  LaneId % 2 == 0 && Offset > 0):
	  //    do the reduction value aggregation
	  //
	  //  The thread local variable Reduce list is mutated in place to host the
	  //  reduced data, which is the aggregated value produced from local and
	  //  remote lanes.
	  //
	  //  Note that AlgoVer is expected to be a constant integer known at compile
	  //  time.
	  //  When AlgoVer==0, the first conjunction evaluates to true, making
	  //  the entire predicate true during compile time.
	  //  When AlgoVer==1, the second conjunction has only the second part to be
	  //  evaluated during runtime. Other conjunctions evaluates to false
	  //  during compile time.
	  //  When AlgoVer==2, the third conjunction has only the second part to be
	  //  evaluated during runtime. Other conjunctions evaluates to false
	  //  during compile time.
	  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);

	  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
	  llvm::Value *CondAlgo1 = Bld.CreateAnd(
	      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));

	  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
	  llvm::Value *CondAlgo2 = Bld.CreateAnd(
	      Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
	  CondAlgo2 = Bld.CreateAnd(
	      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));

	  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
	  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);

	  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
	  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
	  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
	  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

	  CGF.EmitBlock(ThenBB);
	  // reduce_function(LocalReduceList, RemoteReduceList)
	  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
	      LocalReduceList.getPointer(), CGF.VoidPtrTy);
	  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
	      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
	  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
	      CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
	  Bld.CreateBr(MergeBB);

	  CGF.EmitBlock(ElseBB);
	  Bld.CreateBr(MergeBB);

	  CGF.EmitBlock(MergeBB);

	  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
	  // Reduce list.
	  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
	  llvm::Value *CondCopy = Bld.CreateAnd(
	      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));

	  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
	  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
	  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
	  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);

	  CGF.EmitBlock(CpyThenBB);
	  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
	                        RemoteReduceList, LocalReduceList);
	  Bld.CreateBr(CpyMergeBB);

	  CGF.EmitBlock(CpyElseBB);
	  Bld.CreateBr(CpyMergeBB);

	  CGF.EmitBlock(CpyMergeBB);

	  CGF.FinishFunction();
	  return Fn;
	}

	/// Emits a helper that copies all the reduction variables from the
	/// thread-local reduce list into the provided global buffer for the
	/// reduction variables.
	///
	/// The emitted function has internal linkage, returns void and corresponds to:
	///
	/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
	///   For all data entries D in reduce_data:
	///     Copy local D to buffer.D[Idx]
	///
	/// \param CGM Module the helper is emitted into.
	/// \param Privates Private reduction variables; each one is expected to be a
	/// DeclRefExpr (it is cast<> to one below).
	/// \param ReductionArrayTy Type of the reduce list (array of void pointers).
	/// \param Loc Source location used for the emitted function and its loads.
	/// \param TeamReductionRec Record describing the layout of the global buffer.
	/// \param VarFieldMap Maps each reduction variable to its field in
	/// \p TeamReductionRec.
	/// \returns The emitted llvm::Function.
	static llvm::Value *emitListToGlobalCopyFunction(
	    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
	    QualType ReductionArrayTy, SourceLocation Loc,
	    const RecordDecl *TeamReductionRec,
	    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
	        &VarFieldMap) {
	  ASTContext &C = CGM.getContext();

	  // Buffer: global reduction buffer.
	  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
	                              C.VoidPtrTy, ImplicitParamDecl::Other);
	  // Idx: index of the buffer.
	  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
	                           ImplicitParamDecl::Other);
	  // ReduceList: thread local Reduce list.
	  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
	                                  C.VoidPtrTy, ImplicitParamDecl::Other);
	  FunctionArgList Args;
	  Args.push_back(&BufferArg);
	  Args.push_back(&IdxArg);
	  Args.push_back(&ReduceListArg);

	  const CGFunctionInfo &CGFI =
	      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
	  auto *Fn = llvm::Function::Create(
	      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
	      "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
	  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
	  Fn->setDoesNotRecurse();
	  CodeGenFunction CGF(CGM);
	  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

	  CGBuilderTy &Bld = CGF.Builder;

	  // Reinterpret the incoming void* reduce list as a pointer to the reduction
	  // array type so that its slots can be addressed.
	  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
	  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
	  Address LocalReduceList(
	      Bld.CreatePointerBitCastOrAddrSpaceCast(
	          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
	                               C.VoidPtrTy, Loc),
	          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
	      CGF.getPointerAlign());
	  // Reinterpret the incoming void* buffer as a pointer to the team reduction
	  // record.
	  QualType StaticTy = C.getRecordType(TeamReductionRec);
	  llvm::Type *LLVMReductionsBufferTy =
	      CGM.getTypes().ConvertTypeForMem(StaticTy);
	  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
	      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
	      LLVMReductionsBufferTy->getPointerTo());
	  // GEP indices {0, Idx} used to select element Idx of a record field.
	  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
	                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
	                                              /*Volatile=*/false, C.IntTy,
	                                              Loc)};
	  unsigned Idx = 0;
	  for (const Expr *Private : Privates) {
	    // Reduce element = LocalReduceList[i]
	    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
	    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
	        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
	    // elemptr = (CopyType *)elemptrptr
	    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
	        ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
	    Address ElemPtr =
	        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
	    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
	    // Global = Buffer.VD[Idx];
	    const FieldDecl *FD = VarFieldMap.lookup(VD);
	    LValue GlobLVal = CGF.EmitLValueForField(
	        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
	    llvm::Value *BufferPtr =
	        Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs);
	    GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
	    // Copy local element -> global slot according to its evaluation kind.
	    switch (CGF.getEvaluationKind(Private->getType())) {
	    case TEK_Scalar: {
	      llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
	                                            Private->getType(), Loc);
	      CGF.EmitStoreOfScalar(V, GlobLVal);
	      break;
	    }
	    case TEK_Complex: {
	      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
	          CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
	      CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
	      break;
	    }
	    case TEK_Aggregate:
	      CGF.EmitAggregateCopy(GlobLVal,
	                            CGF.MakeAddrLValue(ElemPtr, Private->getType()),
	                            Private->getType(), AggValueSlot::DoesNotOverlap);
	      break;
	    }
	    ++Idx;
	  }

	  CGF.FinishFunction();
	  return Fn;
	}

	/// Emits a helper that reduces all the reduction variables from the
	/// thread-local reduce list into the provided global buffer for the
	/// reduction variables.
	///
	/// The emitted function has internal linkage, returns void and corresponds to:
	///
	/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
	///   void *GlobPtrs[];
	///   GlobPtrs[0] = (void*)&buffer.D0[Idx];
	///   ...
	///   GlobPtrs[N] = (void*)&buffer.DN[Idx];
	///   reduce_function(GlobPtrs, reduce_data);
	///
	/// \param CGM Module the helper is emitted into.
	/// \param Privates Private reduction variables; each one is expected to be a
	/// DeclRefExpr (it is cast<> to one below).
	/// \param ReductionArrayTy Type of the reduce list (array of void pointers).
	/// \param Loc Source location used for the emitted function and its loads.
	/// \param TeamReductionRec Record describing the layout of the global buffer.
	/// \param VarFieldMap Maps each reduction variable to its field in
	/// \p TeamReductionRec.
	/// \param ReduceFn Reduction function called with (GlobPtrs, reduce_data).
	/// \returns The emitted llvm::Function.
	static llvm::Value *emitListToGlobalReduceFunction(
	    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
	    QualType ReductionArrayTy, SourceLocation Loc,
	    const RecordDecl *TeamReductionRec,
	    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
	        &VarFieldMap,
	    llvm::Function *ReduceFn) {
	  ASTContext &C = CGM.getContext();

	  // Buffer: global reduction buffer.
	  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
	                              C.VoidPtrTy, ImplicitParamDecl::Other);
	  // Idx: index of the buffer.
	  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
	                           ImplicitParamDecl::Other);
	  // ReduceList: thread local Reduce list.
	  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
	                                  C.VoidPtrTy, ImplicitParamDecl::Other);
	  FunctionArgList Args;
	  Args.push_back(&BufferArg);
	  Args.push_back(&IdxArg);
	  Args.push_back(&ReduceListArg);

	  const CGFunctionInfo &CGFI =
	      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
	  auto *Fn = llvm::Function::Create(
	      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
	      "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
	  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
	  Fn->setDoesNotRecurse();
	  CodeGenFunction CGF(CGM);
	  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

	  CGBuilderTy &Bld = CGF.Builder;

	  // Reinterpret the incoming void* buffer as a pointer to the team reduction
	  // record.
	  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
	  QualType StaticTy = C.getRecordType(TeamReductionRec);
	  llvm::Type *LLVMReductionsBufferTy =
	      CGM.getTypes().ConvertTypeForMem(StaticTy);
	  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
	      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
	      LLVMReductionsBufferTy->getPointerTo());

	  // 1. Build a list of pointers into the global buffer.
	  // void *RedList[<n>] = {&buffer.D0[Idx], ..., &buffer.D<n-1>[Idx]};
	  Address ReductionList =
	      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
	  // GEP indices {0, Idx} used to select element Idx of a record field.
	  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
	                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
	                                              /*Volatile=*/false, C.IntTy,
	                                              Loc)};
	  // Idx is the slot in ReductionList; it advances by two for variably
	  // modified types because their size occupies an extra slot.
	  unsigned Idx = 0;
	  for (const Expr *Private : Privates) {
	    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
	    // Global = Buffer.VD[Idx];
	    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
	    const FieldDecl *FD = VarFieldMap.lookup(VD);
	    LValue GlobLVal = CGF.EmitLValueForField(
	        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
	    llvm::Value *BufferPtr =
	        Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs);
	    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
	    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
	    if (Private->getType()->isVariablyModifiedType()) {
	      // Store array size.
	      ++Idx;
	      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
	      llvm::Value *Size = CGF.Builder.CreateIntCast(
	          CGF.getVLASize(
	                 CGF.getContext().getAsVariableArrayType(Private->getType()))
	              .NumElts,
	          CGF.SizeTy, /*isSigned=*/false);
	      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
	                              Elem);
	    }
	    ++Idx;
	  }

	  // 2. Call reduce_function(GlobalReduceList, ReduceList)
	  llvm::Value *GlobalReduceList =
	      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
	  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
	  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
	      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
	  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
	      CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
	  CGF.FinishFunction();
	  return Fn;
	}

	/// Emits a helper that copies all the reduction variables from the provided
	/// global buffer for the reduction variables into the thread-local reduce
	/// list.
	///
	/// The emitted function has internal linkage, returns void and corresponds to:
	///
	/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
	///   For all data entries D in reduce_data:
	///     Copy buffer.D[Idx] to local D;
	///
	/// \param CGM Module the helper is emitted into.
	/// \param Privates Private reduction variables; each one is expected to be a
	/// DeclRefExpr (it is cast<> to one below).
	/// \param ReductionArrayTy Type of the reduce list (array of void pointers).
	/// \param Loc Source location used for the emitted function and its loads.
	/// \param TeamReductionRec Record describing the layout of the global buffer.
	/// \param VarFieldMap Maps each reduction variable to its field in
	/// \p TeamReductionRec.
	/// \returns The emitted llvm::Function.
	static llvm::Value *emitGlobalToListCopyFunction(
	    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
	    QualType ReductionArrayTy, SourceLocation Loc,
	    const RecordDecl *TeamReductionRec,
	    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
	        &VarFieldMap) {
	  ASTContext &C = CGM.getContext();

	  // Buffer: global reduction buffer.
	  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
	                              C.VoidPtrTy, ImplicitParamDecl::Other);
	  // Idx: index of the buffer.
	  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
	                           ImplicitParamDecl::Other);
	  // ReduceList: thread local Reduce list.
	  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
	                                  C.VoidPtrTy, ImplicitParamDecl::Other);
	  FunctionArgList Args;
	  Args.push_back(&BufferArg);
	  Args.push_back(&IdxArg);
	  Args.push_back(&ReduceListArg);

	  const CGFunctionInfo &CGFI =
	      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
	  auto *Fn = llvm::Function::Create(
	      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
	      "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
	  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
	  Fn->setDoesNotRecurse();
	  CodeGenFunction CGF(CGM);
	  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

	  CGBuilderTy &Bld = CGF.Builder;

	  // Reinterpret the incoming void* reduce list as a pointer to the reduction
	  // array type so that its slots can be addressed.
	  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
	  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
	  Address LocalReduceList(
	      Bld.CreatePointerBitCastOrAddrSpaceCast(
	          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
	                               C.VoidPtrTy, Loc),
	          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
	      CGF.getPointerAlign());
	  // Reinterpret the incoming void* buffer as a pointer to the team reduction
	  // record.
	  QualType StaticTy = C.getRecordType(TeamReductionRec);
	  llvm::Type *LLVMReductionsBufferTy =
	      CGM.getTypes().ConvertTypeForMem(StaticTy);
	  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
	      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
	      LLVMReductionsBufferTy->getPointerTo());

	  // GEP indices {0, Idx} used to select element Idx of a record field.
	  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
	                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
	                                              /*Volatile=*/false, C.IntTy,
	                                              Loc)};
	  unsigned Idx = 0;
	  for (const Expr *Private : Privates) {
	    // Reduce element = LocalReduceList[i]
	    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
	    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
	        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
	    // elemptr = (CopyType *)elemptrptr
	    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
	        ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
	    Address ElemPtr =
	        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
	    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
	    // Global = Buffer.VD[Idx];
	    const FieldDecl *FD = VarFieldMap.lookup(VD);
	    LValue GlobLVal = CGF.EmitLValueForField(
	        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
	    llvm::Value *BufferPtr =
	        Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs);
	    GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
	    // Copy global slot -> local element according to its evaluation kind.
	    switch (CGF.getEvaluationKind(Private->getType())) {
	    case TEK_Scalar: {
	      llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
	      CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType());
	      break;
	    }
	    case TEK_Complex: {
	      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
	      CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
	                             /*isInit=*/false);
	      break;
	    }
	    case TEK_Aggregate:
	      CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
	                            GlobLVal, Private->getType(),
	                            AggValueSlot::DoesNotOverlap);
	      break;
	    }
	    ++Idx;
	  }

	  CGF.FinishFunction();
	  return Fn;
	}

	/// Emits a helper that calls the reduction function with the thread-local
	/// reduce list as its first operand and a list of pointers into slot \c Idx
	/// of the global buffer as its second operand.
	///
	/// The emitted function has internal linkage, returns void and corresponds to:
	///
	/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
	///   void *GlobPtrs[];
	///   GlobPtrs[0] = (void*)&buffer.D0[Idx];
	///   ...
	///   GlobPtrs[N] = (void*)&buffer.DN[Idx];
	///   reduce_function(reduce_data, GlobPtrs);
	///
	/// \param CGM Module the helper is emitted into.
	/// \param Privates Private reduction variables; each one is expected to be a
	/// DeclRefExpr (it is cast<> to one below).
	/// \param ReductionArrayTy Type of the reduce list (array of void pointers).
	/// \param Loc Source location used for the emitted function and its loads.
	/// \param TeamReductionRec Record describing the layout of the global buffer.
	/// \param VarFieldMap Maps each reduction variable to its field in
	/// \p TeamReductionRec.
	/// \param ReduceFn Reduction function called with (reduce_data, GlobPtrs).
	/// \returns The emitted llvm::Function.
	static llvm::Value *emitGlobalToListReduceFunction(
	    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
	    QualType ReductionArrayTy, SourceLocation Loc,
	    const RecordDecl *TeamReductionRec,
	    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
	        &VarFieldMap,
	    llvm::Function *ReduceFn) {
	  ASTContext &C = CGM.getContext();

	  // Buffer: global reduction buffer.
	  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
	                              C.VoidPtrTy, ImplicitParamDecl::Other);
	  // Idx: index of the buffer.
	  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
	                           ImplicitParamDecl::Other);
	  // ReduceList: thread local Reduce list.
	  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
	                                  C.VoidPtrTy, ImplicitParamDecl::Other);
	  FunctionArgList Args;
	  Args.push_back(&BufferArg);
	  Args.push_back(&IdxArg);
	  Args.push_back(&ReduceListArg);

	  const CGFunctionInfo &CGFI =
	      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
	  auto *Fn = llvm::Function::Create(
	      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
	      "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
	  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
	  Fn->setDoesNotRecurse();
	  CodeGenFunction CGF(CGM);
	  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

	  CGBuilderTy &Bld = CGF.Builder;

	  // Reinterpret the incoming void* buffer as a pointer to the team reduction
	  // record.
	  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
	  QualType StaticTy = C.getRecordType(TeamReductionRec);
	  llvm::Type *LLVMReductionsBufferTy =
	      CGM.getTypes().ConvertTypeForMem(StaticTy);
	  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
	      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
	      LLVMReductionsBufferTy->getPointerTo());

	  // 1. Build a list of pointers into the global buffer.
	  // void *RedList[<n>] = {&buffer.D0[Idx], ..., &buffer.D<n-1>[Idx]};
	  Address ReductionList =
	      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
	  // GEP indices {0, Idx} used to select element Idx of a record field.
	  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
	                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
	                                              /*Volatile=*/false, C.IntTy,
	                                              Loc)};
	  // Idx is the slot in ReductionList; it advances by two for variably
	  // modified types because their size occupies an extra slot.
	  unsigned Idx = 0;
	  for (const Expr *Private : Privates) {
	    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
	    // Global = Buffer.VD[Idx];
	    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
	    const FieldDecl *FD = VarFieldMap.lookup(VD);
	    LValue GlobLVal = CGF.EmitLValueForField(
	        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
	    llvm::Value *BufferPtr =
	        Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs);
	    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
	    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
	    if (Private->getType()->isVariablyModifiedType()) {
	      // Store array size.
	      ++Idx;
	      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
	      llvm::Value *Size = CGF.Builder.CreateIntCast(
	          CGF.getVLASize(
	                 CGF.getContext().getAsVariableArrayType(Private->getType()))
	              .NumElts,
	          CGF.SizeTy, /*isSigned=*/false);
	      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
	                              Elem);
	    }
	    ++Idx;
	  }

	  // 2. Call reduce_function(ReduceList, GlobalReduceList)
	  llvm::Value *GlobalReduceList =
	      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
	  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
	  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
	      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
	  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
	      CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
	  CGF.FinishFunction();
	  return Fn;
	}

	///
	/// Design of OpenMP reductions on the GPU
	///
	/// Consider a typical OpenMP program with one or more reduction
	/// clauses:
	///
	/// float foo;
	/// double bar;
	/// #pragma omp target teams distribute parallel for \
	/// reduction(+:foo) reduction(*:bar)
	/// for (int i = 0; i < N; i++) {
	/// foo += A[i]; bar *= B[i];
	/// }
	///
	/// where 'foo' and 'bar' are reduced across all OpenMP threads in
	/// all teams. In our OpenMP implementation on the NVPTX device an
	/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
	/// within a team are mapped to CUDA threads within a threadblock.
	/// Our goal is to efficiently aggregate values across all OpenMP
	/// threads such that:
	///
	/// - the compiler and runtime are logically concise, and
	/// - the reduction is performed efficiently in a hierarchical
	/// manner as follows: within OpenMP threads in the same warp,
	/// across warps in a threadblock, and finally across teams on
	/// the NVPTX device.
	///
	/// Introduction to Decoupling
	///
	/// We would like to decouple the compiler and the runtime so that the
	/// latter is ignorant of the reduction variables (number, data types)
	/// and the reduction operators. This allows a simpler interface
	/// and implementation while still attaining good performance.
	///
	/// Pseudocode for the aforementioned OpenMP program generated by the
	/// compiler is as follows:
	///
	/// 1. Create private copies of reduction variables on each OpenMP
	/// thread: 'foo_private', 'bar_private'
	/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
	/// to it and writes the result in 'foo_private' and 'bar_private'
	/// respectively.
	/// 3. Call the OpenMP runtime on the GPU to reduce within a team
	/// and store the result on the team master:
	///
	/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
	/// reduceData, shuffleReduceFn, interWarpCpyFn)
	///
	/// where:
	/// struct ReduceData {
	/// float *foo;
	/// double *bar;
	/// } reduceData
	/// reduceData.foo = &foo_private
	/// reduceData.bar = &bar_private
	///
	/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
	/// auxiliary functions generated by the compiler that operate on
	/// variables of type 'ReduceData'. They help the runtime perform
	/// algorithmic steps in a data-agnostic manner.
	///
	/// 'shuffleReduceFn' is a pointer to a function that reduces data
	/// of type 'ReduceData' across two OpenMP threads (lanes) in the
	/// same warp. It takes the following arguments as input:
	///
	/// a. variable of type 'ReduceData' on the calling lane,
	/// b. its lane_id,
	/// c. an offset relative to the current lane_id to generate a
	/// remote_lane_id. The remote lane contains the second
	/// variable of type 'ReduceData' that is to be reduced.
	/// d. an algorithm version parameter determining which reduction
	/// algorithm to use.
	///
	/// 'shuffleReduceFn' retrieves data from the remote lane using
	/// efficient GPU shuffle intrinsics and reduces, using the
	/// algorithm specified by the 4th parameter, the two operands
	/// element-wise. The result is written to the first operand.
	///
	/// Different reduction algorithms are implemented in different
	/// runtime functions, all calling 'shuffleReduceFn' to perform
	/// the essential reduction step. Therefore, based on the 4th
	/// parameter, this function behaves slightly differently to
	/// cooperate with the runtime to ensure correctness under
	/// different circumstances.
	///
	/// 'InterWarpCpyFn' is a pointer to a function that transfers
	/// reduced variables across warps. It tunnels, through CUDA
	/// shared memory, the thread-private data of type 'ReduceData'
	/// from lane 0 of each warp to a lane in the first warp.
	/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
	/// The last team writes the global reduced value to memory.
	///
	/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
	/// reduceData, shuffleReduceFn, interWarpCpyFn,
	/// scratchpadCopyFn, loadAndReduceFn)
	///
	/// 'scratchpadCopyFn' is a helper that stores reduced
	/// data from the team master to a scratchpad array in
	/// global memory.
	///
	/// 'loadAndReduceFn' is a helper that loads data from
	/// the scratchpad array and reduces it with the input
	/// operand.
	///
	/// These compiler generated functions hide address
	/// calculation and alignment information from the runtime.
	/// 5. if ret == 1:
	/// The team master of the last team stores the reduced
	/// result to the globals in memory.
	/// foo += reduceData.foo; bar *= reduceData.bar
	///
	///
	/// Warp Reduction Algorithms
	///
	/// On the warp level, we have three algorithms implemented in the
	/// OpenMP runtime depending on the number of active lanes:
	///
	/// Full Warp Reduction
	///
	/// The reduce algorithm within a warp where all lanes are active
	/// is implemented in the runtime as follows:
	///
	/// full_warp_reduce(void *reduce_data,
	/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
	/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
	/// ShuffleReduceFn(reduce_data, 0, offset, 0);
	/// }
	///
	/// The algorithm completes in log(2, WARPSIZE) steps.
	///
	/// 'ShuffleReduceFn' is called here with lane_id set to 0 because lane_id
	/// is not used by this algorithm; we therefore save instructions by not
	/// retrieving it from the corresponding special registers. The 4th parameter, which
	/// represents the version of the algorithm being used, is set to 0 to
	/// signify full warp reduction.
	///
	/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
	///
	/// #reduce_elem refers to an element in the local lane's data structure
	/// #remote_elem is retrieved from a remote lane
	/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
	/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
	///
	/// Contiguous Partial Warp Reduction
	///
	/// This reduce algorithm is used within a warp where only the first
	/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
	/// number of OpenMP threads in a parallel region is not a multiple of
	/// WARPSIZE. The algorithm is implemented in the runtime as follows:
	///
	/// void
	/// contiguous_partial_reduce(void *reduce_data,
	/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
	/// int size, int lane_id) {
	/// int curr_size;
	/// int offset;
	/// curr_size = size;
	/// offset = curr_size/2;
	/// while (offset>0) {
	/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
	/// curr_size = (curr_size+1)/2;
	/// offset = curr_size/2;
	/// }
	/// }
	///
	/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
	///
	/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
	/// if (lane_id < offset)
	/// reduce_elem = reduce_elem REDUCE_OP remote_elem
	/// else
	/// reduce_elem = remote_elem
	///
	/// This algorithm assumes that the data to be reduced are located in a
	/// contiguous subset of lanes starting from the first. When there is
	/// an odd number of active lanes, the data in the last lane is not
	/// aggregated with any other lane's data but is instead copied over.
	///
	/// Dispersed Partial Warp Reduction
	///
	/// This algorithm is used within a warp when any discontiguous subset of
	/// lanes are active. It is used to implement the reduction operation
	/// across lanes in an OpenMP simd region or in a nested parallel region.
	///
	/// void
	/// dispersed_partial_reduce(void *reduce_data,
	/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
	/// int size, remote_id;
	/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
	/// do {
	/// remote_id = next_active_lane_id_right_after_me();
	/// # the above function returns 0 if no active lane
	/// # is present right after the current lane.
	/// size = number_of_active_lanes_in_this_warp();
	/// logical_lane_id /= 2;
	/// ShuffleReduceFn(reduce_data, logical_lane_id,
	/// remote_id-1-threadIdx.x, 2);
	/// } while (logical_lane_id % 2 == 0 && size > 1);
	/// }
	///
	/// There is no assumption made about the initial state of the reduction.
	/// Any number of lanes (>=1) could be active at any position. The reduction
	/// result is returned in the first active lane.
	///
	/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
	///
	/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
	/// if (lane_id % 2 == 0 && offset > 0)
	/// reduce_elem = reduce_elem REDUCE_OP remote_elem
	/// else
	/// reduce_elem = remote_elem
	///
	///
	/// Intra-Team Reduction
	///
	/// This function, as implemented in the runtime call
	/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
	/// threads in a team. It first reduces within a warp using the
	/// aforementioned algorithms. We then proceed to gather all such
	/// reduced values at the first warp.
	///
	/// The runtime makes use of the function 'InterWarpCpyFn', which copies
	/// data from each of the "warp master" (zeroth lane of each warp, where
	/// warp-reduced data is held) to the zeroth warp. This step reduces (in
	/// a mathematical sense) the problem of reduction across warp masters in
	/// a block to the problem of warp reduction.
	///
	///
	/// Inter-Team Reduction
	///
	/// Once a team has reduced its data to a single value, it is stored in
	/// a global scratchpad array. Since each team has a distinct slot, this
	/// can be done without locking.
	///
	/// The last team to write to the scratchpad array proceeds to reduce the
	/// scratchpad array. One or more workers in the last team use the helper
	/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
	/// the k'th worker reduces every k'th element.
	///
	/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
	/// reduce across workers and compute a globally reduced value.
	///
	/// Emits the reduction for the given private copies.
	///
	/// A simple reduction is forwarded to CGOpenMPRuntime::emitReduction.
	/// Otherwise a list of pointers to the RHS reduction items is built, the
	/// compiler-generated helper functions are emitted, and either
	/// __kmpc_nvptx_parallel_reduce_nowait_v2 (parallel reduction) or
	/// __kmpc_nvptx_teams_reduce_nowait_v2 (teams reduction) is called. If that
	/// call returns 1, the reduction combiners are emitted for the calling thread.
	///
	/// \param CGF Function into which the reduction is emitted; nothing is
	/// emitted if it has no insertion point.
	/// \param Loc Source location of the reduction.
	/// \param Privates Private copies of the reduction variables.
	/// \param LHSExprs LHS operands of the reduction operations.
	/// \param RHSExprs RHS operands of the reduction operations.
	/// \param ReductionOps Reduction operations, one per reduction variable.
	/// \param Options Reduction kind and whether this is a simple reduction.
	void CGOpenMPRuntimeNVPTX::emitReduction(
	    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
	    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
	    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
	  if (!CGF.HaveInsertPoint())
	    return;

	  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
	#ifndef NDEBUG
	  // Only referenced from assertions.
	  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
	#endif

	  if (Options.SimpleReduction) {
	    assert(!TeamsReduction && !ParallelReduction &&
	           "Invalid reduction selection in emitReduction.");
	    CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
	                                   ReductionOps, Options);
	    return;
	  }

	  assert((TeamsReduction || ParallelReduction) &&
	         "Invalid reduction selection in emitReduction.");

	  // Build res = __kmpc_nvptx_parallel_reduce_nowait_v2(<loc>, <gtid>, <n>,
	  //     sizeof(RedList), RedList, shuffle_reduce_func, interwarp_copy_func);
	  // or
	  // Build res = __kmpc_nvptx_teams_reduce_nowait_v2(<loc>, <gtid>, <buffer>,
	  //     <num_bufs>, RedList, shuffle_reduce_func, interwarp_copy_func,
	  //     list_to_global_copy_func, list_to_global_reduce_func,
	  //     global_to_list_copy_func, global_to_list_reduce_func);
	  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
	  llvm::Value *ThreadId = getThreadID(CGF, Loc);

	  llvm::Value *Res;
	  ASTContext &C = CGM.getContext();
	  // 1. Build a list of reduction variables.
	  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
	  auto Size = RHSExprs.size();
	  for (const Expr *E : Privates) {
	    if (E->getType()->isVariablyModifiedType())
	      // Reserve place for array size.
	      ++Size;
	  }
	  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
	  QualType ReductionArrayTy =
	      C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
	                             /*IndexTypeQuals=*/0);
	  Address ReductionList =
	      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
	  auto IPriv = Privates.begin();
	  // Idx is the slot in ReductionList; it runs ahead of I whenever a variably
	  // modified private needs an extra slot for its size.
	  unsigned Idx = 0;
	  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
	    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
	    CGF.Builder.CreateStore(
	        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	            CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
	        Elem);
	    if ((*IPriv)->getType()->isVariablyModifiedType()) {
	      // Store array size.
	      ++Idx;
	      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
	      llvm::Value *VLASize = CGF.Builder.CreateIntCast(
	          CGF.getVLASize(
	                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
	              .NumElts,
	          CGF.SizeTy, /*isSigned=*/false);
	      CGF.Builder.CreateStore(
	          CGF.Builder.CreateIntToPtr(VLASize, CGF.VoidPtrTy), Elem);
	    }
	  }

	  // 2. Emit the helper functions that are passed to the runtime.
	  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	      ReductionList.getPointer(), CGF.VoidPtrTy);
	  llvm::Function *ReductionFn = emitReductionFunction(
	      Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
	      LHSExprs, RHSExprs, ReductionOps);
	  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
	  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
	      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
	  llvm::Value *InterWarpCopyFn =
	      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);

	  // 3. Call the runtime.
	  if (ParallelReduction) {
	    llvm::Value *Args[] = {RTLoc,
	                           ThreadId,
	                           CGF.Builder.getInt32(RHSExprs.size()),
	                           ReductionArrayTySize,
	                           RL,
	                           ShuffleAndReduceFn,
	                           InterWarpCopyFn};

	    Res = CGF.EmitRuntimeCall(
	        createNVPTXRuntimeFunction(
	            OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2),
	        Args);
	  } else {
	    assert(TeamsReduction && "expected teams reduction.");
	    // Build the record that describes the layout of the global buffer for
	    // these reduction variables and remember it for this module.
	    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
	    llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
	    int Cnt = 0;
	    for (const Expr *DRE : Privates) {
	      PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
	      ++Cnt;
	    }
	    const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
	        CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
	        C.getLangOpts().OpenMPCUDAReductionBufNum);
	    TeamsReductions.push_back(TeamReductionRec);
	    // The global holding the buffer pointer is created on first use, without
	    // an initializer.
	    if (!KernelTeamsReductionPtr) {
	      KernelTeamsReductionPtr = new llvm::GlobalVariable(
	          CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
	          llvm::GlobalValue::InternalLinkage, nullptr,
	          "_openmp_teams_reductions_buffer_$_$ptr");
	    }
	    llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
	        Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
	        /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
	    llvm::Value *ListToGlobalCpyFn = ::emitListToGlobalCopyFunction(
	        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
	    llvm::Value *ListToGlobalRedFn = ::emitListToGlobalReduceFunction(
	        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
	        ReductionFn);
	    llvm::Value *GlobalToListCpyFn = ::emitGlobalToListCopyFunction(
	        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
	    llvm::Value *GlobalToListRedFn = ::emitGlobalToListReduceFunction(
	        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
	        ReductionFn);

	    llvm::Value *Args[] = {
	        RTLoc,
	        ThreadId,
	        GlobalBufferPtr,
	        CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
	        RL,
	        ShuffleAndReduceFn,
	        InterWarpCopyFn,
	        ListToGlobalCpyFn,
	        ListToGlobalRedFn,
	        GlobalToListCpyFn,
	        GlobalToListRedFn};

	    Res = CGF.EmitRuntimeCall(
	        createNVPTXRuntimeFunction(
	            OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2),
	        Args);
	  }

	  // 4. Build if (res == 1)
	  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
	  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
	  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
	      Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
	  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);

	  // 5. Build then branch: where we have reduced values in the master
	  //    thread in each team.
	  //    <combine LHS and RHS of every reduction item>
	  //    __kmpc_end_reduce_nowait(<gtid>);
	  CGF.EmitBlock(ThenBB);

	  // Emits one combiner per reduction operation, walking the privates and the
	  // LHS/RHS expressions in lock step.
	  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
	                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
	    auto IPriv = Privates.begin();
	    auto ILHS = LHSExprs.begin();
	    auto IRHS = RHSExprs.begin();
	    for (const Expr *E : ReductionOps) {
	      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
	                                  cast<DeclRefExpr>(*IRHS));
	      ++IPriv;
	      ++ILHS;
	      ++IRHS;
	    }
	  };
	  // Attach __kmpc_end_reduce_nowait(<gtid>) to the region through the action.
	  llvm::Value *EndArgs[] = {ThreadId};
	  RegionCodeGenTy RCG(CodeGen);
	  NVPTXActionTy Action(
	      nullptr, llvm::None,
	      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
	      EndArgs);
	  RCG.setAction(Action);
	  RCG(CGF);
	  // There is no need to emit line number for unconditional branch.
	  (void)ApplyDebugLocation::CreateEmpty(CGF);
	  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
	}

	/// Build the declaration used for a parameter of an outlined function.
	///
	/// A parameter that is not of reference type is returned as is. Otherwise a
	/// new declaration is created whose type is a pointer to the referenced type,
	/// carrying the qualifiers stripped from the original type plus `restrict`
	/// and the language address space obtained from target address space 5
	/// (NVPTX_local_addr).
	///
	/// \param FD Field whose OMPCaptureKindAttr, if present, decides which
	/// address space is applied to the pointee type.
	/// \param NativeParam The original parameter declaration.
	/// \returns \p NativeParam itself, or a new ImplicitParamDecl / ParmVarDecl
	/// with the translated type.
	const VarDecl *
	CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
	const VarDecl *NativeParam) const {
	// Only reference parameters need translation.
	if (!NativeParam->getType()->isReferenceType())
	return NativeParam;
	QualType ArgType = NativeParam->getType();
	// Split the type into its qualifiers (kept in QC) and the bare type.
	QualifierCollector QC;
	const Type *NonQualTy = QC.strip(ArgType);
	QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
	if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
	if (Attr->getCaptureKind() == OMPC_map) {
	// Captured through a map clause: pointee is tagged opencl_global.
	PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
	LangAS::opencl_global);
	} else if (Attr->getCaptureKind() == OMPC_firstprivate &&
	PointeeTy.isConstant(CGM.getContext())) {
	// Constant firstprivate capture: pointee is tagged opencl_generic.
	PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
	LangAS::opencl_generic);
	}
	}
	// Replace the reference by a pointer and re-apply the collected qualifiers,
	// extended with restrict and the local address space.
	ArgType = CGM.getContext().getPointerType(PointeeTy);
	QC.addRestrict();
	enum { NVPTX_local_addr = 5 };
	QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
	ArgType = QC.apply(CGM.getContext(), ArgType);
	// Keep the kind of declaration: implicit parameters stay implicit.
	if (isa<ImplicitParamDecl>(NativeParam))
	return ImplicitParamDecl::Create(
	CGM.getContext(), /DC=/nullptr, NativeParam->getLocation(),
	NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
	return ParmVarDecl::Create(
	CGM.getContext(),
	const_cast<DeclContext *>(NativeParam->getDeclContext()),
	NativeParam->getBeginLoc(), NativeParam->getLocation(),
	NativeParam->getIdentifier(), ArgType,
	/TInfo=/nullptr, SC_None, /DefArg=/nullptr);
	}

	/// Materialize the native (reference-typed) view of a translated parameter.
	///
	/// Loads the pointer held in the local slot of \p TargetParam, casts it to
	/// address space 0 and then to the target address space of the native
	/// pointee type, and stores the result into a fresh temporary of the native
	/// parameter type.
	///
	/// \param NativeParam Original parameter; must be of reference type and must
	/// differ from \p TargetParam (asserted).
	/// \param TargetParam The translated parameter declaration.
	/// \returns The address of the temporary that holds the converted pointer.
	Address
	CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
	const VarDecl *NativeParam,
	const VarDecl *TargetParam) const {
	assert(NativeParam != TargetParam &&
	NativeParam->getType()->isReferenceType() &&
	"Native arg must not be the same as target arg.");
	Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
	QualType NativeParamType = NativeParam->getType();
	// Look through qualifiers to reach the referenced type and its target
	// address space.
	QualifierCollector QC;
	const Type *NonQualTy = QC.strip(NativeParamType);
	QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
	unsigned NativePointeeAddrSpace =
	CGF.getContext().getTargetAddressSpace(NativePointeeTy);
	QualType TargetTy = TargetParam->getType();
	// Load the pointer value currently stored for the target parameter.
	llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
	LocalAddr, /Volatile=/false, TargetTy, SourceLocation());
	// First cast to generic.
	TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
	/AddrSpace=/0));
	// Cast from generic to native address space.
	TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
	NativePointeeAddrSpace));
	// Spill the converted pointer into a temporary of the native type.
	Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
	CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /Volatile=/false,
	NativeParamType);
	return NativeParamAddr;
	}

	/// Call \p OutlinedFn after adapting every pointer argument to the pointer
	/// type the callee declares for that position.
	///
	/// Non-pointer parameters are forwarded untouched. Once the fixed parameters
	/// of a variadic callee are exhausted, the remaining arguments are forwarded
	/// as they are. The actual call is emitted by the base runtime.
	void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
	    CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
	    ArrayRef<llvm::Value *> Args) const {
	  auto *CalleeTy = OutlinedFn.getFunctionType();
	  SmallVector<llvm::Value *, 4> Converted;
	  Converted.reserve(Args.size());

	  for (unsigned Pos = 0, NumArgs = Args.size(); Pos < NumArgs; ++Pos) {
	    // Past the declared parameters of a variadic callee: pass the tail as is.
	    if (CalleeTy->isVarArg() && CalleeTy->getNumParams() <= Pos) {
	      Converted.append(std::next(Args.begin(), Pos), Args.end());
	      break;
	    }

	    llvm::Type *ParamTy = CalleeTy->getParamType(Pos);
	    llvm::Value *Incoming = Args[Pos];
	    if (ParamTy->isPointerTy()) {
	      // Two-step cast: first to a default-address-space pointer to the same
	      // element type, then to the exact parameter type.
	      llvm::Value *Intermediate =
	          CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	              Incoming,
	              Incoming->getType()->getPointerElementType()->getPointerTo());
	      Converted.emplace_back(
	          CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Intermediate,
	                                                          ParamTy));
	    } else {
	      Converted.emplace_back(Incoming);
	    }
	  }

	  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Converted);
	}

	/// Emit function which wraps the outline parallel region
	/// and controls the arguments which are passed to this function.
	/// The wrapper ensures that the outlined function is called
	/// with the correct arguments when data is shared.
	///
	/// The wrapper is an internal-linkage void function named after
	/// \p OutlinedParallelFn with a "_wrapper" suffix. It takes two implicit
	/// unsigned parameters (16-bit and 32-bit), fetches the shared argument list
	/// through __kmpc_get_shared_variables, loads the loop bounds (for
	/// loop-bound-sharing directives) and every captured value from that list,
	/// and calls \p OutlinedParallelFn with them.
	///
	/// \param OutlinedParallelFn The outlined parallel region to call.
	/// \param D Directive supplying the OMPD_parallel captured statement.
	/// \returns The newly created wrapper function.
	llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
	llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
	ASTContext &Ctx = CGM.getContext();
	const auto &CS = *D.getCapturedStmt(OMPD_parallel);

	// Create a function that takes as argument the source thread.
	FunctionArgList WrapperArgs;
	QualType Int16QTy =
	Ctx.getIntTypeForBitwidth(/DestWidth=/16, /Signed=/false);
	QualType Int32QTy =
	Ctx.getIntTypeForBitwidth(/DestWidth=/32, /Signed=/false);
	ImplicitParamDecl ParallelLevelArg(Ctx, /DC=/nullptr, D.getBeginLoc(),
	/Id=/nullptr, Int16QTy,
	ImplicitParamDecl::Other);
	ImplicitParamDecl WrapperArg(Ctx, /DC=/nullptr, D.getBeginLoc(),
	/Id=/nullptr, Int32QTy,
	ImplicitParamDecl::Other);
	WrapperArgs.emplace_back(&ParallelLevelArg);
	WrapperArgs.emplace_back(&WrapperArg);

	const CGFunctionInfo &CGFI =
	CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

	// Create the wrapper itself: internal linkage, marked as non-recursive.
	auto *Fn = llvm::Function::Create(
	CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
	Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
	CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
	Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
	Fn->setDoesNotRecurse();

	CodeGenFunction CGF(CGM, /suppressNewContext=/true);
	CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
	D.getBeginLoc(), D.getBeginLoc());

	const auto *RD = CS.getCapturedRecordDecl();
	auto CurField = RD->field_begin();

	// Zero-initialized i32 slot whose address is passed as the second argument.
	Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
	/Name=/".zero.addr");
	CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/C/ 0));
	// Get the array of arguments.
	SmallVector<llvm::Value *, 8> Args;

	// The first two arguments are the addresses of the 32-bit wrapper parameter
	// and of the zero slot.
	Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
	Args.emplace_back(ZeroAddr.getPointer());

	CGBuilderTy &Bld = CGF.Builder;
	auto CI = CS.capture_begin();

	// Use global memory for data sharing.
	// Handle passing of global args to workers.
	Address GlobalArgs =
	CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
	llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
	llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
	CGF.EmitRuntimeCall(
	createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
	DataSharingArgs);

	// Retrieve the shared variables from the list of references returned
	// by the runtime. Pass the variables to the outlined function.
	Address SharedArgListAddress = Address::invalid();
	if (CS.capture_size() > 0 \|\|
	isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
	SharedArgListAddress = CGF.EmitLoadOfPointer(
	GlobalArgs, CGF.getContext()
	.getPointerType(CGF.getContext().getPointerType(
	CGF.getContext().VoidPtrTy))
	.castAs<PointerType>());
	}
	// Idx counts the list slots consumed by loop bounds, so that captures are
	// read from the slots that follow them.
	unsigned Idx = 0;
	if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
	// Slot 0: lower bound, loaded through a size_t pointer.
	Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
	Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
	Src, CGF.SizeTy->getPointerTo());
	llvm::Value *LB = CGF.EmitLoadOfScalar(
	TypedAddress,
	/Volatile=/false,
	CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
	cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
	Args.emplace_back(LB);
	++Idx;
	// Slot 1: upper bound, loaded the same way.
	Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
	TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
	Src, CGF.SizeTy->getPointerTo());
	llvm::Value *UB = CGF.EmitLoadOfScalar(
	TypedAddress,
	/Volatile=/false,
	CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
	cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
	Args.emplace_back(UB);
	++Idx;
	}
	if (CS.capture_size() > 0) {
	ASTContext &CGFContext = CGF.getContext();
	// Walk captures and the fields of the captured record in lockstep.
	for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
	QualType ElemTy = CurField->getType();
	Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
	Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
	Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
	llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
	/Volatile=/false,
	CGFContext.getPointerType(ElemTy),
	CI->getLocation());
	// By-copy captures of non-pointer variables are converted to uintptr.
	if (CI->capturesVariableByCopy() &&
	!CI->getCapturedVar()->getType()->isAnyPointerType()) {
	Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
	CI->getLocation());
	}
	Args.emplace_back(Arg);
	}
	}

	emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
	CGF.FinishFunction();
	return Fn;
	}

	/// Record, for the function being emitted, which of its local variables
	/// escape and must be globalized, and emit the globalization prolog.
	///
	/// Does nothing unless the data sharing mode is Generic. The body of \p D
	/// (function, block or captured declaration) is scanned with
	/// CheckVarsEscapingDeclContext; the results are stored in
	/// FunctionGlobalizedDecls under CGF.CurFn. Unless globalization is delayed
	/// (OpenMP captured regions), the generic-vars prolog is emitted here and a
	/// cleanup is pushed that emits the matching epilog.
	///
	/// \param D Declaration whose body is about to be emitted; must be non-null.
	void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
	const Decl *D) {
	if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
	return;

	assert(D && "Expected function or captured\|block decl.");
	assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
	"Function is registered already.");
	assert((!TeamAndReductions.first \|\| TeamAndReductions.first == D) &&
	"Team is set but not processed.");
	// Find the statement to scan, depending on the kind of declaration.
	const Stmt *Body = nullptr;
	bool NeedToDelayGlobalization = false;
	if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
	Body = FD->getBody();
	} else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
	Body = BD->getBody();
	} else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
	Body = CD->getBody();
	// OpenMP captured regions delay globalization; in SPMD execution mode
	// they are not processed at all.
	NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
	if (NeedToDelayGlobalization &&
	getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
	return;
	}
	if (!Body)
	return;
	// Primary scan; the pending team reductions are handed to the checker and
	// then reset.
	CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
	VarChecker.Visit(Body);
	const RecordDecl *GlobalizedVarsRecord =
	VarChecker.getGlobalizedRecord(IsInTTDRegion);
	TeamAndReductions.first = nullptr;
	TeamAndReductions.second.clear();
	ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
	VarChecker.getEscapedVariableLengthDecls();
	// Nothing escapes: no bookkeeping and no prolog needed.
	if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
	return;
	// Register the per-function data.
	auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
	I->getSecond().MappedParams =
	std::make_unique<CodeGenFunction::OMPMapVars>();
	I->getSecond().GlobalRecord = GlobalizedVarsRecord;
	I->getSecond().EscapedParameters.insert(
	VarChecker.getEscapedParameters().begin(),
	VarChecker.getEscapedParameters().end());
	I->getSecond().EscapedVariableLengthDecls.append(
	EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
	DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
	for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
	assert(VD->isCanonicalDecl() && "Expected canonical declaration");
	const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
	Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
	}
	// Outside TTD and parallel regions (and when not delayed), run a second
	// scan without reductions and build the secondary record and map with
	// IsInTTDRegion forced to true.
	if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
	CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
	VarChecker.Visit(Body);
	I->getSecond().SecondaryGlobalRecord =
	VarChecker.getGlobalizedRecord(/IsInTTDRegion=/true);
	I->getSecond().SecondaryLocalVarData.emplace();
	DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
	for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
	assert(VD->isCanonicalDecl() && "Expected canonical declaration");
	const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
	Data.insert(
	std::make_pair(VD, MappedVarData(FD, /IsInTTDRegion=/true)));
	}
	}
	if (!NeedToDelayGlobalization) {
	emitGenericVarsProlog(CGF, D->getBeginLoc(), /WithSPMDCheck=/true);
	// Cleanup that emits the epilog matching the prolog above; it is pushed
	// for both normal and EH exits.
	struct GlobalizationScope final : EHScopeStack::Cleanup {
	GlobalizationScope() = default;

	void Emit(CodeGenFunction &CGF, Flags flags) override {
	static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
	.emitGenericVarsEpilog(CGF, /WithSPMDCheck=/true);
	}
	};
	CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
	}
	}

	/// Return a runtime-chosen address for local variable \p VD, or
	/// Address::invalid() when no special placement applies.
	///
	/// Two cases are handled:
	///  - \p VD carries an OMPAllocateDeclAttr whose allocator is the const or
	///    pteam allocator (or large-cap / cgroup, which keep LangAS::Default):
	///    a zero-initialized internal global is created in the selected address
	///    space and its address, cast to the variable's own address space, is
	///    returned.
	///  - In Generic data sharing mode, \p VD (or a variable named by one of its
	///    OMPReferencedVarAttr attributes) is registered for the current function
	///    in FunctionGlobalizedDecls: the recorded PrivateAddr is returned.
	Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
	const VarDecl *VD) {
	if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
	const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
	auto AS = LangAS::Default;
	switch (A->getAllocatorType()) {
	// Use the default allocator here as by default local vars are
	// threadlocal.
	case OMPAllocateDeclAttr::OMPNullMemAlloc:
	case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
	case OMPAllocateDeclAttr::OMPThreadMemAlloc:
	case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
	case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
	// Follow the user decision - use default allocation.
	return Address::invalid();
	case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
	// TODO: implement support for user-defined allocators.
	return Address::invalid();
	case OMPAllocateDeclAttr::OMPConstMemAlloc:
	AS = LangAS::cuda_constant;
	break;
	case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
	AS = LangAS::cuda_shared;
	break;
	case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
	case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
	break;
	}
	// Back the variable with an internal, zero-initialized global placed in
	// the target address space that corresponds to AS.
	llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
	auto *GV = new llvm::GlobalVariable(
	CGM.getModule(), VarTy, /isConstant=/false,
	llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
	VD->getName(),
	/InsertBefore=/nullptr, llvm::GlobalValue::NotThreadLocal,
	CGM.getContext().getTargetAddressSpace(AS));
	CharUnits Align = CGM.getContext().getDeclAlign(VD);
	GV->setAlignment(Align.getAsAlign());
	// Hand back a pointer in the address space of the variable's declared type.
	return Address(
	CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
	VD->getType().getAddressSpace()))),
	Align);
	}

	if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
	return Address::invalid();

	// NOTE(review): VD is null-checked at the top of this function but is
	// dereferenced unconditionally here - confirm callers never pass null.
	VD = VD->getCanonicalDecl();
	auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
	if (I == FunctionGlobalizedDecls.end())
	return Address::invalid();
	// Direct hit: the variable itself was registered for this function.
	auto VDI = I->getSecond().LocalVarData.find(VD);
	if (VDI != I->getSecond().LocalVarData.end())
	return VDI->second.PrivateAddr;
	// Otherwise try the variables named by its OMPReferencedVarAttr attributes.
	if (VD->hasAttrs()) {
	for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
	E(VD->attr_end());
	IT != E; ++IT) {
	auto VDI = I->getSecond().LocalVarData.find(
	cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
	->getCanonicalDecl());
	if (VDI != I->getSecond().LocalVarData.end())
	return VDI->second.PrivateAddr;
	}
	}

	return Address::invalid();
	}

	/// Forget the globalization data recorded for the function that has just
	/// been emitted, then run the base runtime's end-of-function handling.
	void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
	  auto *FinishedFn = CGF.CurFn;
	  FunctionGlobalizedDecls.erase(FinishedFn);
	  CGOpenMPRuntime::functionFinished(CGF);
	}

	/// Choose the default dist_schedule kind and chunk for loop directive \p S.
	///
	/// In SPMD execution mode the schedule is static and the chunk is the value
	/// of getNVPTXNumThreads(), converted from a 32-bit unsigned integer to the
	/// type of the loop's iteration variable. In any other mode the decision is
	/// left to the base runtime.
	///
	/// \param[out] ScheduleKind Receives the selected schedule kind.
	/// \param[out] Chunk Receives the chunk value.
	void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
	CodeGenFunction &CGF, const OMPLoopDirective &S,
	OpenMPDistScheduleClauseKind &ScheduleKind,
	llvm::Value *&Chunk) const {
	if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
	ScheduleKind = OMPC_DIST_SCHEDULE_static;
	Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
	CGF.getContext().getIntTypeForBitwidth(32, /Signed=/0),
	S.getIterationVariable()->getType(), S.getBeginLoc());
	return;
	}
	CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
	CGF, S, ScheduleKind, Chunk);
	}

	/// Choose the default schedule kind and chunk expression for \p S.
	///
	/// Always selects a static schedule with a chunk of 1, expressed as a
	/// 32-bit unsigned integer literal without a source location.
	///
	/// \param[out] ScheduleKind Set to OMPC_SCHEDULE_static.
	/// \param[out] ChunkExpr Set to the newly created literal.
	void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
	CodeGenFunction &CGF, const OMPLoopDirective &S,
	OpenMPScheduleClauseKind &ScheduleKind,
	const Expr *&ChunkExpr) const {
	ScheduleKind = OMPC_SCHEDULE_static;
	// Chunk size is 1 in this case.
	llvm::APInt ChunkSize(32, 1);
	ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
	CGF.getContext().getIntTypeForBitwidth(32, /Signed=/0),
	SourceLocation());
	}

	/// Re-point the by-reference captures of lambdas captured by a target
	/// directive at the corresponding local variables of the current function.
	///
	/// For every capture of the OMPD_target captured statement that is a
	/// variable of lambda class type (looking through references):
	///  - if the lambda has a `this` capture field and the captured statement
	///    info reports `this` as captured, the current `this` value is stored
	///    into that field;
	///  - for each by-reference lambda capture that the target region also
	///    captures, the address of the local variable is stored into the
	///    lambda's field for it.
	///
	/// \param D Directive; must be a target execution directive (asserted).
	void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
	CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
	assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
	" Expected target-based directive.");
	const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
	for (const CapturedStmt::Capture &C : CS->captures()) {
	// Capture variables captured by reference in lambdas for target-based
	// directives.
	if (!C.capturesVariable())
	continue;
	const VarDecl *VD = C.getCapturedVar();
	const auto *RD = VD->getType()
	.getCanonicalType()
	.getNonReferenceType()
	->getAsCXXRecordDecl();
	// Only lambda objects are of interest.
	if (!RD \|\| !RD->isLambda())
	continue;
	// Form an lvalue for the lambda object, loading through the reference
	// when the variable itself is a reference.
	Address VDAddr = CGF.GetAddrOfLocalVar(VD);
	LValue VDLVal;
	if (VD->getType().getCanonicalType()->isReferenceType())
	VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
	else
	VDLVal = CGF.MakeAddrLValue(
	VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
	// Collect the lambda's capture fields (and its `this` field, if any).
	llvm::DenseMap<const VarDecl , FieldDecl > Captures;
	FieldDecl *ThisCapture = nullptr;
	RD->getCaptureFields(Captures, ThisCapture);
	if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
	LValue ThisLVal =
	CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
	llvm::Value *CXXThis = CGF.LoadCXXThis();
	CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
	}
	for (const LambdaCapture &LC : RD->captures()) {
	if (LC.getCaptureKind() != LCK_ByRef)
	continue;
	const VarDecl *VD = LC.getCapturedVar();
	// Skip variables the target region does not capture itself.
	if (!CS->capturesVariable(VD))
	continue;
	auto It = Captures.find(VD);
	assert(It != Captures.end() && "Found lambda capture without field.");
	LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
	// Store the address of the referenced object, looking through the local
	// variable when it is itself a reference.
	Address VDAddr = CGF.GetAddrOfLocalVar(VD);
	if (VD->getType().getCanonicalType()->isReferenceType())
	VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
	VD->getType().getCanonicalType())
	.getAddress(CGF);
	CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
	}
	}
	}

	/// \returns the target address space number that the AST context assigns to
	/// LangAS::cuda_constant.
	unsigned CGOpenMPRuntimeNVPTX::getDefaultFirstprivateAddressSpace() const {
	  const auto &Ctx = CGM.getContext();
	  return Ctx.getTargetAddressSpace(LangAS::cuda_constant);
	}

	/// Map the allocator named by the OMPAllocateDeclAttr of \p VD to a language
	/// address space.
	///
	/// \param VD Variable to inspect; may be null.
	/// \param[out] AS Receives LangAS::cuda_constant for the const allocator,
	/// LangAS::cuda_shared for the pteam allocator and LangAS::Default for the
	/// other predefined allocators. Left untouched when false is returned.
	/// \returns false if \p VD is null or has no OMPAllocateDeclAttr, true
	/// otherwise. A user-defined allocator is treated as unreachable.
	bool CGOpenMPRuntimeNVPTX::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
	LangAS &AS) {
	if (!VD \|\| !VD->hasAttr<OMPAllocateDeclAttr>())
	return false;
	const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
	switch(A->getAllocatorType()) {
	case OMPAllocateDeclAttr::OMPNullMemAlloc:
	case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
	// Not supported, fallback to the default mem space.
	case OMPAllocateDeclAttr::OMPThreadMemAlloc:
	case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
	case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
	case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
	case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
	AS = LangAS::Default;
	return true;
	case OMPAllocateDeclAttr::OMPConstMemAlloc:
	AS = LangAS::cuda_constant;
	return true;
	case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
	AS = LangAS::cuda_shared;
	return true;
	case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
	llvm_unreachable("Expected predefined allocator for the variables with the "
	"static storage.");
	}
	// Not reached when every enumerator is handled by the switch above.
	return false;
	}

	// Get current CudaArch and ignore any unknown values.
	// Returns CudaArch::UNKNOWN when the target does not have the "ptx" feature
	// or when no enabled feature name parses to a known architecture; otherwise
	// returns the architecture of the first enabled feature that does.
	static CudaArch getCudaArch(CodeGenModule &CGM) {
	if (!CGM.getTarget().hasFeature("ptx"))
	return CudaArch::UNKNOWN;
	- llvm::StringMap<bool> Features;
	- CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
	- CGM.getTarget().getTargetOpts().CPU,
	- CGM.getTarget().getTargetOpts().Features);
	- for (const auto &Feature : Features) {
	+ for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
	if (Feature.getValue()) {
	CudaArch Arch = StringToCudaArch(Feature.getKey());
	if (Arch != CudaArch::UNKNOWN)
	return Arch;
	}
	}
	return CudaArch::UNKNOWN;
	}

	/// Validate the "unified_shared_memory" clause of an OpenMP requires
	/// directive against the current target architecture.
	///
	/// For architectures SM_20 through SM_62 an error is reported at the clause
	/// and the function returns without forwarding the directive. In every other
	/// case the directive is passed on to the base runtime once all clauses have
	/// been looked at.
	void CGOpenMPRuntimeNVPTX::processRequiresDirective(
	    const OMPRequiresDecl *D) {
	  for (const OMPClause *Clause : D->clauselists()) {
	    if (Clause->getClauseKind() != OMPC_unified_shared_memory)
	      continue;

	    const CudaArch Arch = getCudaArch(CGM);
	    switch (Arch) {
	    // Architectures rejected for unified addressing.
	    case CudaArch::SM_20:
	    case CudaArch::SM_21:
	    case CudaArch::SM_30:
	    case CudaArch::SM_32:
	    case CudaArch::SM_35:
	    case CudaArch::SM_37:
	    case CudaArch::SM_50:
	    case CudaArch::SM_52:
	    case CudaArch::SM_53:
	    case CudaArch::SM_60:
	    case CudaArch::SM_61:
	    case CudaArch::SM_62: {
	      SmallString<256> Message;
	      llvm::raw_svector_ostream OS(Message);
	      OS << "Target architecture " << CudaArchToString(Arch)
	         << " does not support unified addressing";
	      CGM.Error(Clause->getBeginLoc(), OS.str());
	      return;
	    }
	    // Architectures accepted without a diagnostic.
	    case CudaArch::SM_70:
	    case CudaArch::SM_72:
	    case CudaArch::SM_75:
	    case CudaArch::SM_80:
	    case CudaArch::GFX600:
	    case CudaArch::GFX601:
	    case CudaArch::GFX700:
	    case CudaArch::GFX701:
	    case CudaArch::GFX702:
	    case CudaArch::GFX703:
	    case CudaArch::GFX704:
	    case CudaArch::GFX801:
	    case CudaArch::GFX802:
	    case CudaArch::GFX803:
	    case CudaArch::GFX810:
	    case CudaArch::GFX900:
	    case CudaArch::GFX902:
	    case CudaArch::GFX904:
	    case CudaArch::GFX906:
	    case CudaArch::GFX908:
	    case CudaArch::GFX909:
	    case CudaArch::GFX1010:
	    case CudaArch::GFX1011:
	    case CudaArch::GFX1012:
	    case CudaArch::GFX1030:
	    case CudaArch::UNKNOWN:
	      break;
	    case CudaArch::LAST:
	      llvm_unreachable("Unexpected Cuda arch.");
	    }
	  }
	  CGOpenMPRuntime::processRequiresDirective(D);
	}

	/// Determine the pair (number of SMs, number of blocks per SM).
	///
	/// When both OpenMPCUDANumSMs and OpenMPCUDABlocksPerSM are non-zero in the
	/// language options, those values are returned. Otherwise the pair is taken
	/// from a per-architecture table; architectures without an entry end in
	/// llvm_unreachable.
	static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
	  const auto &LangOpts = CGM.getLangOpts();
	  const unsigned RequestedSMs = LangOpts.OpenMPCUDANumSMs;
	  const unsigned RequestedBlocksPerSM = LangOpts.OpenMPCUDABlocksPerSM;
	  // The user-provided values win only when both of them are set.
	  if (RequestedSMs != 0 && RequestedBlocksPerSM != 0)
	    return {RequestedSMs, RequestedBlocksPerSM};

	  switch (getCudaArch(CGM)) {
	  case CudaArch::SM_20:
	  case CudaArch::SM_21:
	  case CudaArch::SM_30:
	  case CudaArch::SM_32:
	  case CudaArch::SM_35:
	  case CudaArch::SM_37:
	  case CudaArch::SM_50:
	  case CudaArch::SM_52:
	  case CudaArch::SM_53:
	    return {16, 16};
	  case CudaArch::SM_60:
	  case CudaArch::SM_61:
	  case CudaArch::SM_62:
	    return {56, 32};
	  case CudaArch::SM_70:
	  case CudaArch::SM_72:
	  case CudaArch::SM_75:
	  case CudaArch::SM_80:
	    return {84, 32};
	  // No table entry for these: fall out of the switch.
	  case CudaArch::GFX600:
	  case CudaArch::GFX601:
	  case CudaArch::GFX700:
	  case CudaArch::GFX701:
	  case CudaArch::GFX702:
	  case CudaArch::GFX703:
	  case CudaArch::GFX704:
	  case CudaArch::GFX801:
	  case CudaArch::GFX802:
	  case CudaArch::GFX803:
	  case CudaArch::GFX810:
	  case CudaArch::GFX900:
	  case CudaArch::GFX902:
	  case CudaArch::GFX904:
	  case CudaArch::GFX906:
	  case CudaArch::GFX908:
	  case CudaArch::GFX909:
	  case CudaArch::GFX1010:
	  case CudaArch::GFX1011:
	  case CudaArch::GFX1012:
	  case CudaArch::GFX1030:
	  case CudaArch::UNKNOWN:
	    break;
	  case CudaArch::LAST:
	    llvm_unreachable("Unexpected Cuda arch.");
	  }
	  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
	}

	/// Finalize the module-level buffers used by this runtime and then run the
	/// base runtime's clear().
	///
	/// Two independent steps are performed:
	///  1. If there are globalized records and OpenMPCUDATargetParallel is off,
	///     each group of records is sized, assigned to either a shared-memory
	///     union or a global-memory union (depending on whether it fits into
	///     SharedMemorySize), and the placeholder Buffer of every group is
	///     replaced by a pointer to the corresponding newly created global.
	///  2. If there are teams reductions, a union of all reduction records is
	///     built, a global of that type is created, and its address becomes the
	///     initializer of KernelTeamsReductionPtr.
	void CGOpenMPRuntimeNVPTX::clear() {
	if (!GlobalizedRecords.empty() &&
	!CGM.getLangOpts().OpenMPCUDATargetParallel) {
	ASTContext &C = CGM.getContext();
	llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
	llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
	// Two implicit unions: one for groups placed in global memory, one for
	// groups placed in shared memory.
	RecordDecl *StaticRD = C.buildImplicitRecord(
	"_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
	StaticRD->startDefinition();
	RecordDecl *SharedStaticRD = C.buildImplicitRecord(
	"_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
	SharedStaticRD->startDefinition();
	for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
	if (Records.Records.empty())
	continue;
	// Lay the records of this group out one after another, honouring each
	// record's alignment, and track the strictest alignment seen.
	unsigned Size = 0;
	unsigned RecAlignment = 0;
	for (const RecordDecl *RD : Records.Records) {
	QualType RDTy = C.getRecordType(RD);
	unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
	RecAlignment = std::max(RecAlignment, Alignment);
	unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
	Size =
	llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
	}
	Size = llvm::alignTo(Size, RecAlignment);
	// Represent the group as a char array of the computed size.
	llvm::APInt ArySize(/numBits=/64, Size);
	QualType SubTy = C.getConstantArrayType(
	C.CharTy, ArySize, nullptr, ArrayType::Normal, /IndexTypeQuals=/0);
	// Groups that fit into SharedMemorySize go to the shared union.
	const bool UseSharedMemory = Size <= SharedMemorySize;
	auto *Field =
	FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
	SourceLocation(), SourceLocation(), nullptr, SubTy,
	C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
	/BW=/nullptr, /Mutable=/false,
	/InitStyle=/ICIS_NoInit);
	Field->setAccess(AS_public);
	if (UseSharedMemory) {
	SharedStaticRD->addDecl(Field);
	SharedRecs.push_back(&Records);
	} else {
	StaticRD->addDecl(Field);
	GlobalRecs.push_back(&Records);
	}
	// Publish the computed size and the placement decision through the
	// group's RecSize and UseSharedMemory globals.
	Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
	Records.UseSharedMemory->setInitializer(
	llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
	}
	// Allocate SharedMemorySize buffer for the shared memory.
	// FIXME: nvlink does not handle weak linkage correctly (object with the
	// different size are reported as erroneous).
	// Restore this code as soon as nvlink is fixed.
	if (!SharedStaticRD->field_empty()) {
	llvm::APInt ArySize(/numBits=/64, SharedMemorySize);
	QualType SubTy = C.getConstantArrayType(
	C.CharTy, ArySize, nullptr, ArrayType::Normal, /IndexTypeQuals=/0);
	auto *Field = FieldDecl::Create(
	C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
	C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
	/BW=/nullptr, /Mutable=/false,
	/InitStyle=/ICIS_NoInit);
	Field->setAccess(AS_public);
	SharedStaticRD->addDecl(Field);
	}
	SharedStaticRD->completeDefinition();
	if (!SharedStaticRD->field_empty()) {
	// One common-linkage global in the cuda_shared address space backs all
	// shared-memory groups.
	QualType StaticTy = C.getRecordType(SharedStaticRD);
	llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
	auto *GV = new llvm::GlobalVariable(
	CGM.getModule(), LLVMStaticTy,
	/isConstant=/false, llvm::GlobalValue::CommonLinkage,
	llvm::Constant::getNullValue(LLVMStaticTy),
	"_openmp_shared_static_glob_rd_$_", /InsertBefore=/nullptr,
	llvm::GlobalValue::NotThreadLocal,
	C.getTargetAddressSpace(LangAS::cuda_shared));
	auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
	GV, CGM.VoidPtrTy);
	// Redirect every placeholder buffer to the new global and drop it.
	for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
	Rec->Buffer->replaceAllUsesWith(Replacement);
	Rec->Buffer->eraseFromParent();
	}
	}
	StaticRD->completeDefinition();
	if (!StaticRD->field_empty()) {
	// Global-memory groups are backed by a two-dimensional array of the
	// union, sized [number of SMs][blocks per SM].
	QualType StaticTy = C.getRecordType(StaticRD);
	std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
	llvm::APInt Size1(32, SMsBlockPerSM.second);
	QualType Arr1Ty =
	C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
	/IndexTypeQuals=/0);
	llvm::APInt Size2(32, SMsBlockPerSM.first);
	QualType Arr2Ty =
	C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
	/IndexTypeQuals=/0);
	llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
	// FIXME: nvlink does not handle weak linkage correctly (object with the
	// different size are reported as erroneous).
	// Restore CommonLinkage as soon as nvlink is fixed.
	auto *GV = new llvm::GlobalVariable(
	CGM.getModule(), LLVMArr2Ty,
	/isConstant=/false, llvm::GlobalValue::InternalLinkage,
	llvm::Constant::getNullValue(LLVMArr2Ty),
	"_openmp_static_glob_rd_$_");
	auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
	GV, CGM.VoidPtrTy);
	for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
	Rec->Buffer->replaceAllUsesWith(Replacement);
	Rec->Buffer->eraseFromParent();
	}
	}
	}
	if (!TeamsReductions.empty()) {
	ASTContext &C = CGM.getContext();
	// Union with one member per teams-reduction record.
	RecordDecl *StaticRD = C.buildImplicitRecord(
	"_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
	StaticRD->startDefinition();
	for (const RecordDecl *TeamReductionRec : TeamsReductions) {
	QualType RecTy = C.getRecordType(TeamReductionRec);
	auto *Field = FieldDecl::Create(
	C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
	C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
	/BW=/nullptr, /Mutable=/false,
	/InitStyle=/ICIS_NoInit);
	Field->setAccess(AS_public);
	StaticRD->addDecl(Field);
	}
	StaticRD->completeDefinition();
	QualType StaticTy = C.getRecordType(StaticRD);
	llvm::Type *LLVMReductionsBufferTy =
	CGM.getTypes().ConvertTypeForMem(StaticTy);
	// FIXME: nvlink does not handle weak linkage correctly (object with the
	// different size are reported as erroneous).
	// Restore CommonLinkage as soon as nvlink is fixed.
	auto *GV = new llvm::GlobalVariable(
	CGM.getModule(), LLVMReductionsBufferTy,
	/isConstant=/false, llvm::GlobalValue::InternalLinkage,
	llvm::Constant::getNullValue(LLVMReductionsBufferTy),
	"_openmp_teams_reductions_buffer_$_");
	// Point the kernel-visible reduction pointer at the buffer.
	KernelTeamsReductionPtr->setInitializer(
	llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
	CGM.VoidPtrTy));
	}
	CGOpenMPRuntime::clear();
	}
	diff --git a/contrib/llvm-project/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/contrib/llvm-project/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
	index 487c50dfc466..dd4545d6c48f 100644
	--- a/contrib/llvm-project/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
	+++ b/contrib/llvm-project/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
	@@ -1,481 +1,483 @@
	//===--- AArch64.cpp - AArch64 (not ARM) Helpers for Tools ------- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "AArch64.h"
	#include "clang/Driver/Driver.h"
	#include "clang/Driver/DriverDiagnostic.h"
	#include "clang/Driver/Options.h"
	#include "llvm/Option/ArgList.h"
	#include "llvm/Support/TargetParser.h"
	#include "llvm/Support/Host.h"

	using namespace clang::driver;
	using namespace clang::driver::tools;
	using namespace clang;
	using namespace llvm::opt;

	/// Tell whether the triple alone is enough to pick a default CPU, i.e.
	/// without an explicit -arch option.
	///
	/// \returns true exactly when \p Triple names a Darwin OS.
	static bool isCPUDeterminedByTriple(const llvm::Triple &Triple) {
	  const bool TargetsDarwin = Triple.isOSDarwin();
	  return TargetsDarwin;
	}

	/// getAArch64TargetCPU - Get the (LLVM) name of the AArch64 cpu we are
	/// targeting. Set \p A to the Arg corresponding to the -mcpu argument if it is
	/// provided, or to nullptr otherwise.
	///
	/// Resolution order: the lower-cased -mcpu value up to the first '+' (with
	/// "native" replaced by the host CPU name); then, if -arch is given or the
	/// triple is for a Darwin OS, "apple-s4" for aarch64_32 and "apple-a7"
	/// otherwise; finally "generic".
	std::string aarch64::getAArch64TargetCPU(const ArgList &Args,
	const llvm::Triple &Triple, Arg *&A) {
	std::string CPU;
	// If we have -mcpu, use that.
	if ((A = Args.getLastArg(options::OPT_mcpu_EQ))) {
	StringRef Mcpu = A->getValue();
	// Drop any "+feature" modifiers and normalize the case.
	CPU = Mcpu.split("+").first.lower();
	}

	// Handle CPU name is 'native'.
	if (CPU == "native")
	return std::string(llvm::sys::getHostCPUName());
	else if (CPU.size())
	return CPU;

	// Make sure we pick the appropriate Apple CPU if -arch is used or when
	// targeting a Darwin OS.
	if (Args.getLastArg(options::OPT_arch) \|\| Triple.isOSDarwin())
	return Triple.getArch() == llvm::Triple::aarch64_32 ? "apple-s4"
	: "apple-a7";

	return "generic";
	}

	// Decode AArch64 features from string like +[no]featureA+[no]featureB+...
	//
	// Each '+'-separated piece of `text` is translated with
	// llvm::AArch64::getArchExtFeature and, when recognized, appended to
	// `Features`. "neon" / "noneon" are not accepted as modifiers: a driver
	// diagnostic is emitted and decoding continues. Any other unrecognized piece
	// stops decoding and makes the function return false. For the v8.6-A
	// architecture, "sve" additionally appends "+f32mm".
	//
	// Returns true when every piece was handled.
	static bool DecodeAArch64Features(const Driver &D, StringRef text,
	std::vector<StringRef> &Features,
	llvm::AArch64::ArchKind ArchKind) {
	SmallVector<StringRef, 8> Split;
	// NOTE(review): the trailing arguments are presumably MaxSplit = -1 (no
	// limit) and KeepEmpty = false - confirm against StringRef::split.
	text.split(Split, StringRef("+"), -1, false);

	for (StringRef Feature : Split) {
	StringRef FeatureName = llvm::AArch64::getArchExtFeature(Feature);
	if (!FeatureName.empty())
	Features.push_back(FeatureName);
	else if (Feature == "neon" \|\| Feature == "noneon")
	D.Diag(clang::diag::err_drv_no_neon_modifier);
	else
	return false;

	// +sve implies +f32mm if the base architecture is v8.6A
	// it isn't the case in general that sve implies both f64mm and f32mm
	if ((ArchKind == llvm::AArch64::ArchKind::ARMV8_6A) && Feature == "sve")
	Features.push_back("+f32mm");
	}
	return true;
	}

	// Check if the CPU name and feature modifiers in -mcpu are legal. If yes,
	// decode CPU and feature.
	static bool DecodeAArch64Mcpu(const Driver &D, StringRef Mcpu, StringRef &CPU,
	std::vector<StringRef> &Features) {
	std::pair<StringRef, StringRef> Split = Mcpu.split("+");
	CPU = Split.first;
	llvm::AArch64::ArchKind ArchKind = llvm::AArch64::ArchKind::ARMV8A;

	if (CPU == "native")
	CPU = llvm::sys::getHostCPUName();

	if (CPU == "generic") {
	Features.push_back("+neon");
	} else {
	ArchKind = llvm::AArch64::parseCPUArch(CPU);
	if (!llvm::AArch64::getArchFeatures(ArchKind, Features))
	return false;

	unsigned Extension = llvm::AArch64::getDefaultExtensions(CPU, ArchKind);
	if (!llvm::AArch64::getExtensionFeatures(Extension, Features))
	return false;
	}

	if (Split.second.size() &&
	!DecodeAArch64Features(D, Split.second, Features, ArchKind))
	return false;

	return true;
	}

	static bool
	getAArch64ArchFeaturesFromMarch(const Driver &D, StringRef March,
	const ArgList &Args,
	std::vector<StringRef> &Features) {
	std::string MarchLowerCase = March.lower();
	std::pair<StringRef, StringRef> Split = StringRef(MarchLowerCase).split("+");

	llvm::AArch64::ArchKind ArchKind = llvm::AArch64::parseArch(Split.first);
	if (ArchKind == llvm::AArch64::ArchKind::INVALID \|\|
	!llvm::AArch64::getArchFeatures(ArchKind, Features) \|\|
	(Split.second.size() &&
	!DecodeAArch64Features(D, Split.second, Features, ArchKind)))
	return false;

	return true;
	}

	static bool
	getAArch64ArchFeaturesFromMcpu(const Driver &D, StringRef Mcpu,
	const ArgList &Args,
	std::vector<StringRef> &Features) {
	StringRef CPU;
	std::string McpuLowerCase = Mcpu.lower();
	if (!DecodeAArch64Mcpu(D, McpuLowerCase, CPU, Features))
	return false;

	return true;
	}

	static bool
	getAArch64MicroArchFeaturesFromMtune(const Driver &D, StringRef Mtune,
	const ArgList &Args,
	std::vector<StringRef> &Features) {
	std::string MtuneLowerCase = Mtune.lower();
	// Check CPU name is valid
	std::vector<StringRef> MtuneFeatures;
	StringRef Tune;
	if (!DecodeAArch64Mcpu(D, MtuneLowerCase, Tune, MtuneFeatures))
	return false;

	// Handle CPU name is 'native'.
	if (MtuneLowerCase == "native")
	MtuneLowerCase = std::string(llvm::sys::getHostCPUName());
	if (MtuneLowerCase == "cyclone" \|\|
	StringRef(MtuneLowerCase).startswith("apple")) {
	Features.push_back("+zcm");
	Features.push_back("+zcz");
	}
	return true;
	}

	static bool
	getAArch64MicroArchFeaturesFromMcpu(const Driver &D, StringRef Mcpu,
	const ArgList &Args,
	std::vector<StringRef> &Features) {
	StringRef CPU;
	std::vector<StringRef> DecodedFeature;
	std::string McpuLowerCase = Mcpu.lower();
	if (!DecodeAArch64Mcpu(D, McpuLowerCase, CPU, DecodedFeature))
	return false;

	return getAArch64MicroArchFeaturesFromMtune(D, CPU, Args, Features);
	}

	/// Collect the AArch64 target feature strings implied by the driver
	/// arguments \p Args and by \p Triple, appending them to \p Features.
	///
	/// Architecture features come from -march=, else -mcpu=, else the CPU chosen
	/// by getAArch64TargetCPU when -arch is given or the triple determines the
	/// CPU. Tuning features come from -mtune=, else -mcpu=, else that same CPU.
	/// The remaining sections translate individual -m/-f options into features.
	/// Unsupported or invalid values are reported through \p D.
	void aarch64::getAArch64TargetFeatures(const Driver &D,
	const llvm::Triple &Triple,
	const ArgList &Args,
	std::vector<StringRef> &Features) {
	Arg *A;
	bool success = true;
	// Enable NEON by default.
	Features.push_back("+neon");
	// Architecture features: first match among -march=, -mcpu=, implicit CPU.
	if ((A = Args.getLastArg(options::OPT_march_EQ)))
	success = getAArch64ArchFeaturesFromMarch(D, A->getValue(), Args, Features);
	else if ((A = Args.getLastArg(options::OPT_mcpu_EQ)))
	success = getAArch64ArchFeaturesFromMcpu(D, A->getValue(), Args, Features);
	else if (Args.hasArg(options::OPT_arch) \|\| isCPUDeterminedByTriple(Triple))
	success = getAArch64ArchFeaturesFromMcpu(
	D, getAArch64TargetCPU(Args, Triple, A), Args, Features);

	// Tuning features: first match among -mtune=, -mcpu=, implicit CPU. Skipped
	// entirely once an earlier step has failed.
	if (success && (A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)))
	success =
	getAArch64MicroArchFeaturesFromMtune(D, A->getValue(), Args, Features);
	else if (success && (A = Args.getLastArg(options::OPT_mcpu_EQ)))
	success =
	getAArch64MicroArchFeaturesFromMcpu(D, A->getValue(), Args, Features);
	else if (success &&
	(Args.hasArg(options::OPT_arch) \|\| isCPUDeterminedByTriple(Triple)))
	success = getAArch64MicroArchFeaturesFromMcpu(
	D, getAArch64TargetCPU(Args, Triple, A), Args, Features);

	// NOTE(review): on the implicit-CPU paths above getAArch64TargetCPU stores
	// the -mcpu= Arg (possibly nullptr) into A, so A may be null here if one of
	// those paths fails -- confirm whether that failure is reachable.
	if (!success)
	D.Diag(diag::err_drv_clang_unsupported) << A->getAsString(Args);

	// -mgeneral-regs-only turns off the FP, crypto and NEON features.
	if (Args.getLastArg(options::OPT_mgeneral_regs_only)) {
	Features.push_back("-fp-armv8");
	Features.push_back("-crypto");
	Features.push_back("-neon");
	}

	// Map the -mtp= value onto a tpidr-elN feature; "el0" adds no feature and
	// any other value is diagnosed.
	if (Arg *A = Args.getLastArg(options::OPT_mtp_mode_EQ)) {
	StringRef Mtp = A->getValue();
	if (Mtp == "el3")
	Features.push_back("+tpidr-el3");
	else if (Mtp == "el2")
	Features.push_back("+tpidr-el2");
	else if (Mtp == "el1")
	Features.push_back("+tpidr-el1");
	else if (Mtp != "el0")
	D.Diag(diag::err_drv_invalid_mtp) << A->getAsString(Args);
	}

	// Enable/disable straight line speculation hardening.
	if (Arg *A = Args.getLastArg(options::OPT_mharden_sls_EQ)) {
	StringRef Scope = A->getValue();
	bool EnableRetBr = false;
	bool EnableBlr = false;
	if (Scope != "none" && Scope != "all") {
	// Comma-separated list: accept "retbr" and "blr"; stop at the first
	// unknown entry after diagnosing it.
	SmallVector<StringRef, 4> Opts;
	Scope.split(Opts, ",");
	for (auto Opt : Opts) {
	Opt = Opt.trim();
	if (Opt == "retbr") {
	EnableRetBr = true;
	continue;
	}
	if (Opt == "blr") {
	EnableBlr = true;
	continue;
	}
	D.Diag(diag::err_invalid_sls_hardening)
	<< Scope << A->getAsString(Args);
	break;
	}
	} else if (Scope == "all") {
	EnableRetBr = true;
	EnableBlr = true;
	}

	if (EnableRetBr)
	Features.push_back("+harden-sls-retbr");
	if (EnableBlr)
	Features.push_back("+harden-sls-blr");
	}

	// En/disable crc
	if (Arg *A = Args.getLastArg(options::OPT_mcrc, options::OPT_mnocrc)) {
	if (A->getOption().matches(options::OPT_mcrc))
	Features.push_back("+crc");
	else
	Features.push_back("-crc");
	}

	// Handle (arch-dependent) fp16fml/fullfp16 relationship.
	// FIXME: this fp16fml option handling will be reimplemented after the
	// TargetParser rewrite.
	// The searches run over reverse iterators, so a smaller iterator means the
	// feature occurs later (further right) in Features.
	const auto ItRNoFullFP16 = std::find(Features.rbegin(), Features.rend(), "-fullfp16");
	const auto ItRFP16FML = std::find(Features.rbegin(), Features.rend(), "+fp16fml");
	if (llvm::is_contained(Features, "+v8.4a")) {
	const auto ItRFullFP16 = std::find(Features.rbegin(), Features.rend(), "+fullfp16");
	if (ItRFullFP16 < ItRNoFullFP16 && ItRFullFP16 < ItRFP16FML) {
	// Only entangled feature that can be to the right of this +fullfp16 is -fp16fml.
	// Only append the +fp16fml if there is no -fp16fml after the +fullfp16.
	if (std::find(Features.rbegin(), ItRFullFP16, "-fp16fml") == ItRFullFP16)
	Features.push_back("+fp16fml");
	}
	else
	goto fp16_fml_fallthrough;
	} else {
	fp16_fml_fallthrough:
	// In both of these cases, putting the 'other' feature on the end of the vector will
	// result in the same effect as placing it immediately after the current feature.
	if (ItRNoFullFP16 < ItRFP16FML)
	Features.push_back("-fp16fml");
	else if (ItRNoFullFP16 > ItRFP16FML)
	Features.push_back("+fullfp16");
	}

	// FIXME: this needs reimplementation too after the TargetParser rewrite
	//
	// Context sensitive meaning of Crypto:
	// 1) For Arch >= ARMv8.4a: crypto = sm4 + sha3 + sha2 + aes
	// 2) For Arch <= ARMv8.3a: crypto = sha2 + aes
	// The iterators below are taken before any push_back in this section; each
	// branch completes all of its std::find scans before it appends anything.
	const auto ItBegin = Features.begin();
	const auto ItEnd = Features.end();
	const auto ItRBegin = Features.rbegin();
	const auto ItREnd = Features.rend();
	const auto ItRCrypto = std::find(ItRBegin, ItREnd, "+crypto");
	const auto ItRNoCrypto = std::find(ItRBegin, ItREnd, "-crypto");
	const auto HasCrypto = ItRCrypto != ItREnd;
	const auto HasNoCrypto = ItRNoCrypto != ItREnd;
	// Distances from the end of Features: the smaller value is the later entry.
	const ptrdiff_t PosCrypto = ItRCrypto - ItRBegin;
	const ptrdiff_t PosNoCrypto = ItRNoCrypto - ItRBegin;

	// NoCrypto: both forms are present and the last "-crypto" comes after the
	// last "+crypto".
	bool NoCrypto = false;
	if (HasCrypto && HasNoCrypto) {
	if (PosNoCrypto < PosCrypto)
	NoCrypto = true;
	}

	if (std::find(ItBegin, ItEnd, "+v8.4a") != ItEnd) {
	if (HasCrypto && !NoCrypto) {
	// Check if we have NOT disabled an algorithm with something like:
	// +crypto, -algorithm
	// And if "-algorithm" does not occur, we enable that crypto algorithm.
	const bool HasSM4 = (std::find(ItBegin, ItEnd, "-sm4") == ItEnd);
	const bool HasSHA3 = (std::find(ItBegin, ItEnd, "-sha3") == ItEnd);
	const bool HasSHA2 = (std::find(ItBegin, ItEnd, "-sha2") == ItEnd);
	const bool HasAES = (std::find(ItBegin, ItEnd, "-aes") == ItEnd);
	if (HasSM4)
	Features.push_back("+sm4");
	if (HasSHA3)
	Features.push_back("+sha3");
	if (HasSHA2)
	Features.push_back("+sha2");
	if (HasAES)
	Features.push_back("+aes");
	} else if (HasNoCrypto) {
	// Check if we have NOT enabled a crypto algorithm with something like:
	// -crypto, +algorithm
	// And if "+algorithm" does not occur, we disable that crypto algorithm.
	const bool HasSM4 = (std::find(ItBegin, ItEnd, "+sm4") != ItEnd);
	const bool HasSHA3 = (std::find(ItBegin, ItEnd, "+sha3") != ItEnd);
	const bool HasSHA2 = (std::find(ItBegin, ItEnd, "+sha2") != ItEnd);
	const bool HasAES = (std::find(ItBegin, ItEnd, "+aes") != ItEnd);
	if (!HasSM4)
	Features.push_back("-sm4");
	if (!HasSHA3)
	Features.push_back("-sha3");
	if (!HasSHA2)
	Features.push_back("-sha2");
	if (!HasAES)
	Features.push_back("-aes");
	}
	} else {
	if (HasCrypto && !NoCrypto) {
	const bool HasSHA2 = (std::find(ItBegin, ItEnd, "-sha2") == ItEnd);
	const bool HasAES = (std::find(ItBegin, ItEnd, "-aes") == ItEnd);
	if (HasSHA2)
	Features.push_back("+sha2");
	if (HasAES)
	Features.push_back("+aes");
	} else if (HasNoCrypto) {
	const bool HasSHA2 = (std::find(ItBegin, ItEnd, "+sha2") != ItEnd);
	const bool HasAES = (std::find(ItBegin, ItEnd, "+aes") != ItEnd);
	const bool HasV82a = (std::find(ItBegin, ItEnd, "+v8.2a") != ItEnd);
	const bool HasV83a = (std::find(ItBegin, ItEnd, "+v8.3a") != ItEnd);
	// HasV84a is always false here: this is the else-branch of the "+v8.4a"
	// lookup above, which scanned the same range.
	const bool HasV84a = (std::find(ItBegin, ItEnd, "+v8.4a") != ItEnd);
	if (!HasSHA2)
	Features.push_back("-sha2");
	if (!HasAES)
	Features.push_back("-aes");
	if (HasV82a \|\| HasV83a \|\| HasV84a) {
	Features.push_back("-sm4");
	Features.push_back("-sha3");
	}
	}
	}

	// When "+v8.6a" is present, insert "+i8mm" and "+bf16" right after it.
	auto V8_6Pos = llvm::find(Features, "+v8.6a");
	if (V8_6Pos != std::end(Features))
	V8_6Pos = Features.insert(std::next(V8_6Pos), {"+i8mm", "+bf16"});

	// Diff hunk: with the '+' lines applied, OpenBSD gets "+strict-align" when
	// neither of the two unaligned-access options is given.
	if (Arg *A = Args.getLastArg(options::OPT_mno_unaligned_access,
	- options::OPT_munaligned_access))
	+ options::OPT_munaligned_access)) {
	if (A->getOption().matches(options::OPT_mno_unaligned_access))
	Features.push_back("+strict-align");
	+ } else if (Triple.isOSOpenBSD())
	+ Features.push_back("+strict-align");

	// Each -ffixed-xN option below adds the matching "+reserve-xN" feature.
	if (Args.hasArg(options::OPT_ffixed_x1))
	Features.push_back("+reserve-x1");

	if (Args.hasArg(options::OPT_ffixed_x2))
	Features.push_back("+reserve-x2");

	if (Args.hasArg(options::OPT_ffixed_x3))
	Features.push_back("+reserve-x3");

	if (Args.hasArg(options::OPT_ffixed_x4))
	Features.push_back("+reserve-x4");

	if (Args.hasArg(options::OPT_ffixed_x5))
	Features.push_back("+reserve-x5");

	if (Args.hasArg(options::OPT_ffixed_x6))
	Features.push_back("+reserve-x6");

	if (Args.hasArg(options::OPT_ffixed_x7))
	Features.push_back("+reserve-x7");

	if (Args.hasArg(options::OPT_ffixed_x9))
	Features.push_back("+reserve-x9");

	if (Args.hasArg(options::OPT_ffixed_x10))
	Features.push_back("+reserve-x10");

	if (Args.hasArg(options::OPT_ffixed_x11))
	Features.push_back("+reserve-x11");

	if (Args.hasArg(options::OPT_ffixed_x12))
	Features.push_back("+reserve-x12");

	if (Args.hasArg(options::OPT_ffixed_x13))
	Features.push_back("+reserve-x13");

	if (Args.hasArg(options::OPT_ffixed_x14))
	Features.push_back("+reserve-x14");

	if (Args.hasArg(options::OPT_ffixed_x15))
	Features.push_back("+reserve-x15");

	if (Args.hasArg(options::OPT_ffixed_x18))
	Features.push_back("+reserve-x18");

	if (Args.hasArg(options::OPT_ffixed_x20))
	Features.push_back("+reserve-x20");

	if (Args.hasArg(options::OPT_ffixed_x21))
	Features.push_back("+reserve-x21");

	if (Args.hasArg(options::OPT_ffixed_x22))
	Features.push_back("+reserve-x22");

	if (Args.hasArg(options::OPT_ffixed_x23))
	Features.push_back("+reserve-x23");

	if (Args.hasArg(options::OPT_ffixed_x24))
	Features.push_back("+reserve-x24");

	if (Args.hasArg(options::OPT_ffixed_x25))
	Features.push_back("+reserve-x25");

	if (Args.hasArg(options::OPT_ffixed_x26))
	Features.push_back("+reserve-x26");

	if (Args.hasArg(options::OPT_ffixed_x27))
	Features.push_back("+reserve-x27");

	if (Args.hasArg(options::OPT_ffixed_x28))
	Features.push_back("+reserve-x28");

	if (Args.hasArg(options::OPT_ffixed_x30))
	Features.push_back("+reserve-x30");

	// Each -fcall-saved-xN option below adds the matching "+call-saved-xN"
	// feature.
	if (Args.hasArg(options::OPT_fcall_saved_x8))
	Features.push_back("+call-saved-x8");

	if (Args.hasArg(options::OPT_fcall_saved_x9))
	Features.push_back("+call-saved-x9");

	if (Args.hasArg(options::OPT_fcall_saved_x10))
	Features.push_back("+call-saved-x10");

	if (Args.hasArg(options::OPT_fcall_saved_x11))
	Features.push_back("+call-saved-x11");

	if (Args.hasArg(options::OPT_fcall_saved_x12))
	Features.push_back("+call-saved-x12");

	if (Args.hasArg(options::OPT_fcall_saved_x13))
	Features.push_back("+call-saved-x13");

	if (Args.hasArg(options::OPT_fcall_saved_x14))
	Features.push_back("+call-saved-x14");

	if (Args.hasArg(options::OPT_fcall_saved_x15))
	Features.push_back("+call-saved-x15");

	if (Args.hasArg(options::OPT_fcall_saved_x18))
	Features.push_back("+call-saved-x18");

	if (Args.hasArg(options::OPT_mno_neg_immediates))
	Features.push_back("+no-neg-immediates");
	}
	diff --git a/contrib/llvm-project/clang/lib/Driver/ToolChains/Arch/X86.cpp b/contrib/llvm-project/clang/lib/Driver/ToolChains/Arch/X86.cpp
	index 2cc44c09917f..6b82abec6f65 100644
	--- a/contrib/llvm-project/clang/lib/Driver/ToolChains/Arch/X86.cpp
	+++ b/contrib/llvm-project/clang/lib/Driver/ToolChains/Arch/X86.cpp
	@@ -1,216 +1,216 @@
	//===--- X86.cpp - X86 Helpers for Tools ------------------------- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "X86.h"
	#include "ToolChains/CommonArgs.h"
	#include "clang/Driver/Driver.h"
	#include "clang/Driver/DriverDiagnostic.h"
	#include "clang/Driver/Options.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/Option/ArgList.h"
	#include "llvm/Support/Host.h"

	using namespace clang::driver;
	using namespace clang::driver::tools;
	using namespace clang;
	using namespace llvm::opt;

	/// Select the CPU name to use for an x86 target.
	///
	/// Sources are consulted in this order:
	///  1. -march=: returned verbatim unless it is "native"; for "native" the
	///     host CPU name is used when it is non-empty and not "generic",
	///     otherwise selection continues below.
	///  2. The MSVC-style /arch: option, mapped to a CPU name; it is claimed only
	///     when the mapping succeeds.
	///  3. Defaults derived from \p Triple (OS, 64-bit vs 32-bit).
	///
	/// \returns the CPU name, or nullptr when no option selected a CPU and
	/// \p Triple is not an x86 triple.
	const char *x86::getX86TargetCPU(const ArgList &Args,
	const llvm::Triple &Triple) {
	if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
	if (StringRef(A->getValue()) != "native")
	return A->getValue();

	// FIXME: Reject attempts to use -march=native unless the target matches
	// the host.
	//
	// FIXME: We should also incorporate the detected target features for use
	// with -native.
	std::string CPU = std::string(llvm::sys::getHostCPUName());
	// MakeArgString is used so the returned pointer does not refer to the
	// local std::string.
	if (!CPU.empty() && CPU != "generic")
	return Args.MakeArgString(CPU);
	}

	if (const Arg *A = Args.getLastArgNoClaim(options::OPT__SLASH_arch)) {
	// Mapping built by looking at lib/Basic's X86TargetInfo::initFeatureMap().
	StringRef Arch = A->getValue();
	const char *CPU = nullptr;
	if (Triple.getArch() == llvm::Triple::x86) { // 32-bit-only /arch: flags.
	CPU = llvm::StringSwitch<const char *>(Arch)
	.Case("IA32", "i386")
	.Case("SSE", "pentium3")
	.Case("SSE2", "pentium4")
	.Default(nullptr);
	}
	if (CPU == nullptr) { // 32-bit and 64-bit /arch: flags.
	CPU = llvm::StringSwitch<const char *>(Arch)
	.Case("AVX", "sandybridge")
	.Case("AVX2", "haswell")
	.Case("AVX512F", "knl")
	.Case("AVX512", "skylake-avx512")
	.Default(nullptr);
	}
	// Only claim /arch: when its value was recognised.
	if (CPU) {
	A->claim();
	return CPU;
	}
	}

	// Select the default CPU if none was given (or detection failed).

	if (!Triple.isX86())
	return nullptr; // This routine is only handling x86 targets.

	bool Is64Bit = Triple.getArch() == llvm::Triple::x86_64;

	// FIXME: Need target hooks.
	if (Triple.isOSDarwin()) {
	if (Triple.getArchName() == "x86_64h")
	return "core-avx2";
	// macosx10.12 drops support for all pre-Penryn Macs.
	// Simulators can still run on 10.11 though, like Xcode.
	if (Triple.isMacOSX() && !Triple.isOSVersionLT(10, 12))
	return "penryn";
	// The oldest x86_64 Macs have core2/Merom; the oldest x86 Macs have Yonah.
	return Is64Bit ? "core2" : "yonah";
	}

	// Set up default CPU name for PS4 compilers.
	if (Triple.isPS4CPU())
	return "btver2";

	// On Android use targets compatible with gcc
	if (Triple.isAndroid())
	return Is64Bit ? "x86-64" : "i686";

	// Everything else goes to x86-64 in 64-bit mode.
	if (Is64Bit)
	return "x86-64";

	// 32-bit defaults per OS. Diff hunk: with the '+' lines applied, OpenBSD
	// shares the "i586" default with Haiku, and the FreeBSD case ("i686") sits
	// after it.
	switch (Triple.getOS()) {
	- case llvm::Triple::FreeBSD:
	- return "i686";
	case llvm::Triple::NetBSD:
	- case llvm::Triple::OpenBSD:
	return "i486";
	case llvm::Triple::Haiku:
	+ case llvm::Triple::OpenBSD:
	return "i586";
	+ case llvm::Triple::FreeBSD:
	+ return "i686";
	default:
	// Fallback to p4.
	return "pentium4";
	}
	}

	/// Collect the x86 target feature strings implied by \p Args and \p Triple,
	/// appending them to \p Features.
	///
	/// Defaults (host features for -march=native, x86_64h opt-outs, Android
	/// additions) are added first, then the retpoline / speculative-load
	/// hardening, LVI and SESES mitigations, and finally the features named
	/// explicitly through the x86 feature option group. Conflicting mitigation
	/// options are diagnosed through \p D.
	void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
	const ArgList &Args,
	std::vector<StringRef> &Features) {
	// If -march=native, autodetect the feature list.
	if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
	if (StringRef(A->getValue()) == "native") {
	llvm::StringMap<bool> HostFeatures;
	if (llvm::sys::getHostCPUFeatures(HostFeatures))
	for (auto &F : HostFeatures)
	Features.push_back(
	Args.MakeArgString((F.second ? "+" : "-") + F.first()));
	}
	}

	if (Triple.getArchName() == "x86_64h") {
	// x86_64h implies quite a few of the more modern subtarget features
	// for Haswell class CPUs, but not all of them. Opt-out of a few.
	Features.push_back("-rdrnd");
	Features.push_back("-aes");
	Features.push_back("-pclmul");
	Features.push_back("-rtm");
	Features.push_back("-fsgsbase");
	}

	const llvm::Triple::ArchType ArchType = Triple.getArch();
	// Add features to be compatible with gcc for Android.
	if (Triple.isAndroid()) {
	if (ArchType == llvm::Triple::x86_64) {
	Features.push_back("+sse4.2");
	Features.push_back("+popcnt");
	Features.push_back("+cx16");
	} else
	Features.push_back("+ssse3");
	}

	// Translate the high level `-mretpoline` flag to the specific target feature
	// flags. We also detect if the user asked for retpoline external thunks but
	// failed to ask for retpolines themselves (through any of the different
	// flags). This is a bit hacky but keeps existing usages working. We should
	// consider deprecating this and instead warn if the user requests external
	// retpoline thunks and doesn't request some form of retpolines.
	// SpectreOpt records which option enabled a Spectre mitigation; it stays
	// OPT_INVALID when none did and is used for the conflict checks below.
	auto SpectreOpt = clang::driver::options::ID::OPT_INVALID;
	if (Args.hasArgNoClaim(options::OPT_mretpoline, options::OPT_mno_retpoline,
	options::OPT_mspeculative_load_hardening,
	options::OPT_mno_speculative_load_hardening)) {
	if (Args.hasFlag(options::OPT_mretpoline, options::OPT_mno_retpoline,
	false)) {
	Features.push_back("+retpoline-indirect-calls");
	Features.push_back("+retpoline-indirect-branches");
	SpectreOpt = options::OPT_mretpoline;
	} else if (Args.hasFlag(options::OPT_mspeculative_load_hardening,
	options::OPT_mno_speculative_load_hardening,
	false)) {
	// On x86, speculative load hardening relies on at least using retpolines
	// for indirect calls.
	Features.push_back("+retpoline-indirect-calls");
	SpectreOpt = options::OPT_mspeculative_load_hardening;
	}
	} else if (Args.hasFlag(options::OPT_mretpoline_external_thunk,
	options::OPT_mno_retpoline_external_thunk, false)) {
	// FIXME: Add a warning about failing to specify `-mretpoline` and
	// eventually switch to an error here.
	Features.push_back("+retpoline-indirect-calls");
	Features.push_back("+retpoline-indirect-branches");
	SpectreOpt = options::OPT_mretpoline_external_thunk;
	}

	// LVIOpt records which option enabled an LVI mitigation, in the same way.
	auto LVIOpt = clang::driver::options::ID::OPT_INVALID;
	if (Args.hasFlag(options::OPT_mlvi_hardening, options::OPT_mno_lvi_hardening,
	false)) {
	Features.push_back("+lvi-load-hardening");
	Features.push_back("+lvi-cfi"); // load hardening implies CFI protection
	LVIOpt = options::OPT_mlvi_hardening;
	} else if (Args.hasFlag(options::OPT_mlvi_cfi, options::OPT_mno_lvi_cfi,
	false)) {
	Features.push_back("+lvi-cfi");
	LVIOpt = options::OPT_mlvi_cfi;
	}

	// SESES is diagnosed when combined with LVI load hardening or with any
	// Spectre mitigation; the "+seses" feature is added regardless.
	if (Args.hasFlag(options::OPT_m_seses, options::OPT_mno_seses, false)) {
	if (LVIOpt == options::OPT_mlvi_hardening)
	D.Diag(diag::err_drv_argument_not_allowed_with)
	<< D.getOpts().getOptionName(options::OPT_mlvi_hardening)
	<< D.getOpts().getOptionName(options::OPT_m_seses);

	if (SpectreOpt != clang::driver::options::ID::OPT_INVALID)
	D.Diag(diag::err_drv_argument_not_allowed_with)
	<< D.getOpts().getOptionName(SpectreOpt)
	<< D.getOpts().getOptionName(options::OPT_m_seses);

	Features.push_back("+seses");
	// SESES also turns on LVI CFI unless -mno-lvi-cfi was given.
	if (!Args.hasArg(options::OPT_mno_lvi_cfi)) {
	Features.push_back("+lvi-cfi");
	LVIOpt = options::OPT_mlvi_cfi;
	}
	}

	// A Spectre mitigation and an LVI mitigation may not be combined.
	if (SpectreOpt != clang::driver::options::ID::OPT_INVALID &&
	LVIOpt != clang::driver::options::ID::OPT_INVALID) {
	D.Diag(diag::err_drv_argument_not_allowed_with)
	<< D.getOpts().getOptionName(SpectreOpt)
	<< D.getOpts().getOptionName(LVIOpt);
	}

	// Now add any that the user explicitly requested on the command line,
	// which may override the defaults.
	handleTargetFeaturesGroup(Args, Features, options::OPT_m_x86_Features_Group);
	}
	diff --git a/contrib/llvm-project/clang/lib/Driver/ToolChains/Clang.cpp b/contrib/llvm-project/clang/lib/Driver/ToolChains/Clang.cpp
	index 25fc837e803b..c77ae5a44a0e 100644
	--- a/contrib/llvm-project/clang/lib/Driver/ToolChains/Clang.cpp
	+++ b/contrib/llvm-project/clang/lib/Driver/ToolChains/Clang.cpp
	@@ -1,7170 +1,7170 @@
	//===-- Clang.cpp - Clang+LLVM ToolChain Implementations --------- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "Clang.h"
	#include "AMDGPU.h"
	#include "Arch/AArch64.h"
	#include "Arch/ARM.h"
	#include "Arch/Mips.h"
	#include "Arch/PPC.h"
	#include "Arch/RISCV.h"
	#include "Arch/Sparc.h"
	#include "Arch/SystemZ.h"
	#include "Arch/VE.h"
	#include "Arch/X86.h"
	#include "CommonArgs.h"
	#include "Hexagon.h"
	#include "InputInfo.h"
	#include "MSP430.h"
	#include "PS4CPU.h"
	#include "clang/Basic/CharInfo.h"
	#include "clang/Basic/CodeGenOptions.h"
	#include "clang/Basic/LangOptions.h"
	#include "clang/Basic/ObjCRuntime.h"
	#include "clang/Basic/Version.h"
	#include "clang/Driver/Distro.h"
	#include "clang/Driver/DriverDiagnostic.h"
	#include "clang/Driver/Options.h"
	#include "clang/Driver/SanitizerArgs.h"
	#include "clang/Driver/XRayArgs.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/Config/llvm-config.h"
	#include "llvm/Option/ArgList.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Compression.h"
	#include "llvm/Support/FileSystem.h"
	#include "llvm/Support/Path.h"
	#include "llvm/Support/Process.h"
	#include "llvm/Support/TargetParser.h"
	#include "llvm/Support/YAMLParser.h"

	#ifdef LLVM_ON_UNIX
	#include <unistd.h> // For getuid().
	#endif

	using namespace clang::driver;
	using namespace clang::driver::tools;
	using namespace clang;
	using namespace llvm::opt;

	static void CheckPreprocessingOptions(const Driver &D, const ArgList &Args) {
	if (Arg *A =
	Args.getLastArg(clang::driver::options::OPT_C, options::OPT_CC)) {
	if (!Args.hasArg(options::OPT_E) && !Args.hasArg(options::OPT__SLASH_P) &&
	!Args.hasArg(options::OPT__SLASH_EP) && !D.CCCIsCPP()) {
	D.Diag(clang::diag::err_drv_argument_only_allowed_with)
	<< A->getBaseArg().getAsString(Args)
	<< (D.IsCLMode() ? "/E, /P or /EP" : "-E");
	}
	}
	}

	static void CheckCodeGenerationOptions(const Driver &D, const ArgList &Args) {
	// In gcc, only ARM checks this, but it seems reasonable to check universally.
	if (Args.hasArg(options::OPT_static))
	if (const Arg *A =
	Args.getLastArg(options::OPT_dynamic, options::OPT_mdynamic_no_pic))
	D.Diag(diag::err_drv_argument_not_allowed_with) << A->getAsString(Args)
	<< "-static";
	}

	// Copy the NUL-terminated string \p Arg into \p Res, inserting a backslash
	// in front of every space and every backslash.
	// This is used for the space-separated argument list specified with
	// the -dwarf-debug-flags option.
	static void EscapeSpacesAndBackslashes(const char *Arg,
	                                       SmallVectorImpl<char> &Res) {
	  for (const char *Cursor = Arg; *Cursor != '\0'; ++Cursor) {
	    const char Ch = *Cursor;
	    if (Ch == ' ')
	      Res.push_back('\\');
	    else if (Ch == '\\')
	      Res.push_back('\\');
	    Res.push_back(Ch);
	  }
	}

	// Append \p Target to \p Res, quoted for use in a GNU Make dependency file.
	// Only the characters '$', '#', ' ', '\t' are quoted: '$' is doubled, '#' is
	// backslash-escaped, and a space or tab is backslash-escaped together with
	// every backslash that immediately precedes it.
	static void QuoteTarget(StringRef Target, SmallVectorImpl<char> &Res) {
	  const unsigned Length = Target.size();
	  for (unsigned Idx = 0; Idx != Length; ++Idx) {
	    const char Ch = Target[Idx];
	    switch (Ch) {
	    case '#':
	      Res.push_back('\\');
	      break;
	    case '$':
	      Res.push_back('$');
	      break;
	    case '\t':
	    case ' ': {
	      // One extra backslash for each backslash directly before this blank.
	      unsigned Back = Idx;
	      while (Back != 0 && Target[Back - 1] == '\\') {
	        Res.push_back('\\');
	        --Back;
	      }
	      // Then the backslash that escapes the blank itself.
	      Res.push_back('\\');
	      break;
	    }
	    default:
	      break;
	    }

	    Res.push_back(Ch);
	  }
	}

	/// Run \a Work on the tool chain \a RegularToolChain and then on every
	/// offloading tool chain associated with the action \a JA.
	static void
	forAllAssociatedToolChains(Compilation &C, const JobAction &JA,
	                           const ToolChain &RegularToolChain,
	                           llvm::function_ref<void(const ToolChain &)> Work) {
	  // Device-side actions of every programming model pair with the host tool
	  // chain.
	  const auto WorkOnHost = [&] {
	    Work(*C.getSingleOffloadToolChain<Action::OFK_Host>());
	  };

	  // The current/regular tool chain always comes first.
	  Work(RegularToolChain);

	  // CUDA and HIP: at most one of these four cases applies.
	  if (JA.isHostOffloading(Action::OFK_Cuda)) {
	    Work(*C.getSingleOffloadToolChain<Action::OFK_Cuda>());
	  } else if (JA.isDeviceOffloading(Action::OFK_Cuda)) {
	    WorkOnHost();
	  } else if (JA.isHostOffloading(Action::OFK_HIP)) {
	    Work(*C.getSingleOffloadToolChain<Action::OFK_HIP>());
	  } else if (JA.isDeviceOffloading(Action::OFK_HIP)) {
	    WorkOnHost();
	  }

	  // OpenMP is handled independently and may have several tool chains.
	  if (JA.isHostOffloading(Action::OFK_OpenMP)) {
	    auto OpenMPTCs = C.getOffloadToolChains<Action::OFK_OpenMP>();
	    auto Cur = OpenMPTCs.first;
	    const auto Last = OpenMPTCs.second;
	    while (Cur != Last) {
	      Work(*Cur->second);
	      ++Cur;
	    }
	  } else if (JA.isDeviceOffloading(Action::OFK_OpenMP)) {
	    WorkOnHost();
	  }

	  //
	  // TODO: Add support for other offloading programming models here.
	  //
	}

	/// Helper that validates the optional ":N" refinement-step suffix of a
	/// reciprocal argument string.
	///
	/// \p Position is set to the index of the ':' in \p In, or to
	/// StringRef::npos when there is none. Returns true when there is no suffix
	/// or the suffix is exactly one decimal digit; otherwise an invalid-value
	/// diagnostic is emitted through \p D and false is returned.
	static bool getRefinementStep(StringRef In, const Driver &D,
	                              const Arg &A, size_t &Position) {
	  const char RefinementStepToken = ':';
	  Position = In.find(RefinementStepToken);
	  if (Position == StringRef::npos)
	    return true;

	  const StringRef OptionName = A.getOption().getName();
	  const StringRef StepText = In.substr(Position + 1);

	  // Allow exactly one numeric character for the additional refinement
	  // step parameter. This is reasonable for all currently-supported
	  // operations and architectures because we would expect that a larger value
	  // of refinement steps would cause the estimate "optimization" to
	  // under-perform the native operation. Also, if the estimate does not
	  // converge quickly, it probably will not ever converge, so further
	  // refinement steps will not produce a better answer.
	  const bool IsSingleDigit =
	      StepText.size() == 1 && StepText[0] >= '0' && StepText[0] <= '9';
	  if (IsSingleDigit)
	    return true;

	  D.Diag(diag::err_drv_invalid_value) << OptionName << StepText;
	  return false;
	}

	/// The -mrecip flag requires processing of many optional parameters.
	///
	/// Validates the values of the last -mrecip / -mrecip= option in \p Args and,
	/// when they are valid, appends a single "-mrecip=..." string to
	/// \p OutStrings. Nothing is appended when the option is absent or when a
	/// value is rejected; rejections are diagnosed through \p D.
	static void ParseMRecip(const Driver &D, const ArgList &Args,
	ArgStringList &OutStrings) {
	StringRef DisabledPrefixIn = "!";
	StringRef DisabledPrefixOut = "!";
	StringRef EnabledPrefixOut = "";
	StringRef Out = "-mrecip=";

	Arg *A = Args.getLastArg(options::OPT_mrecip, options::OPT_mrecip_EQ);
	if (!A)
	return;

	unsigned NumOptions = A->getNumValues();
	if (NumOptions == 0) {
	// No option is the same as "all".
	OutStrings.push_back(Args.MakeArgString(Out + "all"));
	return;
	}

	// Pass through "all", "none", or "default" with an optional refinement step.
	if (NumOptions == 1) {
	StringRef Val = A->getValue(0);
	size_t RefStepLoc;
	if (!getRefinementStep(Val, D, *A, RefStepLoc))
	return;
	// ValBase is the value without its ":N" suffix, if there was one.
	StringRef ValBase = Val.slice(0, RefStepLoc);
	if (ValBase == "all" \|\| ValBase == "none" \|\| ValBase == "default") {
	OutStrings.push_back(Args.MakeArgString(Out + Val));
	return;
	}
	}

	// Each reciprocal type may be enabled or disabled individually.
	// Check each input value for validity, concatenate them all back together,
	// and pass through.

	// Known option names; the mapped bool records whether the name has already
	// been seen, which is how duplicates are detected below.
	llvm::StringMap<bool> OptionStrings;
	OptionStrings.insert(std::make_pair("divd", false));
	OptionStrings.insert(std::make_pair("divf", false));
	OptionStrings.insert(std::make_pair("vec-divd", false));
	OptionStrings.insert(std::make_pair("vec-divf", false));
	OptionStrings.insert(std::make_pair("sqrtd", false));
	OptionStrings.insert(std::make_pair("sqrtf", false));
	OptionStrings.insert(std::make_pair("vec-sqrtd", false));
	OptionStrings.insert(std::make_pair("vec-sqrtf", false));

	for (unsigned i = 0; i != NumOptions; ++i) {
	StringRef Val = A->getValue(i);

	bool IsDisabled = Val.startswith(DisabledPrefixIn);
	// Ignore the disablement token for string matching.
	if (IsDisabled)
	Val = Val.substr(1);

	size_t RefStep;
	if (!getRefinementStep(Val, D, *A, RefStep))
	return;

	StringRef ValBase = Val.slice(0, RefStep);
	llvm::StringMap<bool>::iterator OptionIter = OptionStrings.find(ValBase);
	if (OptionIter == OptionStrings.end()) {
	// Try again specifying float suffix.
	OptionIter = OptionStrings.find(ValBase.str() + 'f');
	if (OptionIter == OptionStrings.end()) {
	// The input name did not match any known option string.
	D.Diag(diag::err_drv_unknown_argument) << Val;
	return;
	}
	// The option was specified without a float or double suffix.
	// Make sure that the double entry was not already specified.
	// The float entry will be checked below.
	if (OptionStrings[ValBase.str() + 'd']) {
	D.Diag(diag::err_drv_invalid_value) << A->getOption().getName() << Val;
	return;
	}
	}

	if (OptionIter->second == true) {
	// Duplicate option specified.
	D.Diag(diag::err_drv_invalid_value) << A->getOption().getName() << Val;
	return;
	}

	// Mark the matched option as found. Do not allow duplicate specifiers.
	OptionIter->second = true;

	// If the precision was not specified, also mark the double entry as found.
	if (ValBase.back() != 'f' && ValBase.back() != 'd')
	OptionStrings[ValBase.str() + 'd'] = true;

	// Build the output string.
	StringRef Prefix = IsDisabled ? DisabledPrefixOut : EnabledPrefixOut;
	Out = Args.MakeArgString(Out + Prefix + Val);
	// Values are comma-separated; no separator follows the last one.
	if (i != NumOptions - 1)
	Out = Args.MakeArgString(Out + ",");
	}

	OutStrings.push_back(Args.MakeArgString(Out));
	}

	/// The -mprefer-vector-width option accepts either a positive integer
	/// or the string "none".
	static void ParseMPreferVectorWidth(const Driver &D, const ArgList &Args,
	ArgStringList &CmdArgs) {
	Arg *A = Args.getLastArg(options::OPT_mprefer_vector_width_EQ);
	if (!A)
	return;

	StringRef Value = A->getValue();
	if (Value == "none") {
	CmdArgs.push_back("-mprefer-vector-width=none");
	} else {
	unsigned Width;
	if (Value.getAsInteger(10, Width)) {
	D.Diag(diag::err_drv_invalid_value) << A->getOption().getName() << Value;
	return;
	}
	CmdArgs.push_back(Args.MakeArgString("-mprefer-vector-width=" + Value));
	}
	}

	/// Append to \p Features the target features requested through the
	/// WebAssembly feature option group in \p Args.
	static void getWebAssemblyTargetFeatures(const ArgList &Args,
	                                         std::vector<StringRef> &Features) {
	  const auto WasmFeatureGroup = options::OPT_m_wasm_Features_Group;
	  handleTargetFeaturesGroup(Args, Features, WasmFeatureGroup);
	}

	/// Compute the target features for \p Triple from \p Args and append them to
	/// \p CmdArgs as "-target-feature <feature>" pairs (or
	/// "-aux-target-feature <feature>" pairs when \p IsAux is true).
	///
	/// The per-architecture helpers fill a local feature list, which is passed
	/// through unifyTargetFeatures before being emitted. \p ForAS is forwarded
	/// only to the ARM helper. Architectures without a case below contribute no
	/// features.
	static void getTargetFeatures(const Driver &D, const llvm::Triple &Triple,
	const ArgList &Args, ArgStringList &CmdArgs,
	bool ForAS, bool IsAux = false) {
	std::vector<StringRef> Features;
	switch (Triple.getArch()) {
	default:
	break;
	case llvm::Triple::mips:
	case llvm::Triple::mipsel:
	case llvm::Triple::mips64:
	case llvm::Triple::mips64el:
	mips::getMIPSTargetFeatures(D, Triple, Args, Features);
	break;

	case llvm::Triple::arm:
	case llvm::Triple::armeb:
	case llvm::Triple::thumb:
	case llvm::Triple::thumbeb:
	arm::getARMTargetFeatures(D, Triple, Args, CmdArgs, Features, ForAS);
	break;

	case llvm::Triple::ppc:
	case llvm::Triple::ppc64:
	case llvm::Triple::ppc64le:
	ppc::getPPCTargetFeatures(D, Triple, Args, Features);
	break;
	case llvm::Triple::riscv32:
	case llvm::Triple::riscv64:
	riscv::getRISCVTargetFeatures(D, Triple, Args, Features);
	break;
	case llvm::Triple::systemz:
	systemz::getSystemZTargetFeatures(D, Args, Features);
	break;
	case llvm::Triple::aarch64:
	case llvm::Triple::aarch64_32:
	case llvm::Triple::aarch64_be:
	aarch64::getAArch64TargetFeatures(D, Triple, Args, Features);
	break;
	case llvm::Triple::x86:
	case llvm::Triple::x86_64:
	x86::getX86TargetFeatures(D, Triple, Args, Features);
	break;
	case llvm::Triple::hexagon:
	hexagon::getHexagonTargetFeatures(D, Args, Features);
	break;
	case llvm::Triple::wasm32:
	case llvm::Triple::wasm64:
	getWebAssemblyTargetFeatures(Args, Features);
	break;
	case llvm::Triple::sparc:
	case llvm::Triple::sparcel:
	case llvm::Triple::sparcv9:
	sparc::getSparcTargetFeatures(D, Args, Features);
	break;
	case llvm::Triple::r600:
	case llvm::Triple::amdgcn:
	amdgpu::getAMDGPUTargetFeatures(D, Args, Features);
	break;
	case llvm::Triple::msp430:
	msp430::getMSP430TargetFeatures(D, Args, Features);
	break;
	case llvm::Triple::ve:
	ve::getVETargetFeatures(D, Args, Features);
	// Last case of the switch, so no break is needed.
	}

	// Emit each unified feature as a flag/value pair.
	for (auto Feature : unifyTargetFeatures(Features)) {
	CmdArgs.push_back(IsAux ? "-aux-target-feature" : "-target-feature");
	CmdArgs.push_back(Feature.data());
	}
	}

	/// Decide whether Objective-C exceptions use the zero-cost exception tables.
	///
	/// They do whenever \p runtime is non-fragile. Otherwise they do only on
	/// Mac OS X 10.5 or later, and only for the x86_64 and arm architectures.
	static bool
	shouldUseExceptionTablesForObjCExceptions(const ObjCRuntime &runtime,
	                                          const llvm::Triple &Triple) {
	  if (runtime.isNonFragile())
	    return true;

	  // The remaining rule is specific to Mac OS X.
	  if (!Triple.isMacOSX())
	    return false;

	  if (Triple.isMacOSXVersionLT(10, 5))
	    return false;

	  const auto Arch = Triple.getArch();
	  return Arch == llvm::Triple::x86_64 || Arch == llvm::Triple::arm;
	}

	/// Adds exception related arguments to the driver command arguments. There's a
	/// master flag, -fexceptions and also language specific flags to enable/disable
	/// C++ and Objective-C exceptions. This makes it possible to for example
	/// disable C++ exceptions but enable Objective-C exceptions.
	static void addExceptionArgs(const ArgList &Args, types::ID InputType,
	const ToolChain &TC, bool KernelOrKext,
	const ObjCRuntime &objcRuntime,
	ArgStringList &CmdArgs) {
	const llvm::Triple &Triple = TC.getTriple();

	if (KernelOrKext) {
	// -mkernel and -fapple-kext imply no exceptions, so claim exception related
	// arguments now to avoid warnings about unused arguments.
	Args.ClaimAllArgs(options::OPT_fexceptions);
	Args.ClaimAllArgs(options::OPT_fno_exceptions);
	Args.ClaimAllArgs(options::OPT_fobjc_exceptions);
	Args.ClaimAllArgs(options::OPT_fno_objc_exceptions);
	Args.ClaimAllArgs(options::OPT_fcxx_exceptions);
	Args.ClaimAllArgs(options::OPT_fno_cxx_exceptions);
	return;
	}

	// See if the user explicitly enabled exceptions.
	bool EH = Args.hasFlag(options::OPT_fexceptions, options::OPT_fno_exceptions,
	false);

	// Obj-C exceptions are enabled by default, regardless of -fexceptions. This
	// is not necessarily sensible, but follows GCC.
	if (types::isObjC(InputType) &&
	Args.hasFlag(options::OPT_fobjc_exceptions,
	options::OPT_fno_objc_exceptions, true)) {
	CmdArgs.push_back("-fobjc-exceptions");

	EH \|= shouldUseExceptionTablesForObjCExceptions(objcRuntime, Triple);
	}

	if (types::isCXX(InputType)) {
	// Disable C++ EH by default on XCore and PS4.
	bool CXXExceptionsEnabled =
	Triple.getArch() != llvm::Triple::xcore && !Triple.isPS4CPU();
	Arg *ExceptionArg = Args.getLastArg(
	options::OPT_fcxx_exceptions, options::OPT_fno_cxx_exceptions,
	options::OPT_fexceptions, options::OPT_fno_exceptions);
	if (ExceptionArg)
	CXXExceptionsEnabled =
	ExceptionArg->getOption().matches(options::OPT_fcxx_exceptions) \|\|
	ExceptionArg->getOption().matches(options::OPT_fexceptions);

	if (CXXExceptionsEnabled) {
	CmdArgs.push_back("-fcxx-exceptions");

	EH = true;
	}
	}

	// OPT_fignore_exceptions means exception could still be thrown,
	// but no clean up or catch would happen in current module.
	// So we do not set EH to false.
	Args.AddLastArg(CmdArgs, options::OPT_fignore_exceptions);

	if (EH)
	CmdArgs.push_back("-fexceptions");
	}

	/// Return whether autolinking is enabled for job \p JA.
	///
	/// An explicit -fautolink / -fno-autolink always wins. The default is off for
	/// CUDA and HIP device offloading jobs, follows the toolchain's
	/// integrated-assembler setting on Darwin, and is on everywhere else.
	static bool ShouldEnableAutolink(const ArgList &Args, const ToolChain &TC,
	                                 const JobAction &JA) {
	  // The native darwin assembler doesn't support the linker_option directives,
	  // so on Darwin the default tracks whether the integrated assembler is used.
	  bool Default = !TC.getTriple().isOSDarwin() || TC.useIntegratedAs();

	  // The linker_option directives are intended for host compilation.
	  const bool IsCudaOrHIPDevice = JA.isDeviceOffloading(Action::OFK_Cuda) ||
	                                 JA.isDeviceOffloading(Action::OFK_HIP);
	  if (IsCudaOrHIPDevice)
	    Default = false;

	  return Args.hasFlag(options::OPT_fautolink, options::OPT_fno_autolink,
	                      Default);
	}

	/// Return true when the DWARF directory feature is to be disabled, i.e. when
	/// -fdwarf-directory-asm / -fno-dwarf-directory-asm resolves to "off". If
	/// neither flag is given, the answer follows the toolchain's
	/// integrated-assembler setting.
	static bool ShouldDisableDwarfDirectory(const ArgList &Args,
	                                        const ToolChain &TC) {
	  const bool EnabledByDefault = TC.useIntegratedAs();
	  return !Args.hasFlag(options::OPT_fdwarf_directory_asm,
	                       options::OPT_fno_dwarf_directory_asm, EnabledByDefault);
	}

	// Map an argument of the form "-gN" or "-ggdbN" (or one of their aliases)
	// to its DebugInfoKind. The argument must belong to OPT_gN_Group. Any level
	// not singled out below maps to DebugInfoConstructor.
	static codegenoptions::DebugInfoKind DebugLevelToInfoKind(const Arg &A) {
	  const auto &Opt = A.getOption();
	  assert(Opt.matches(options::OPT_gN_Group) &&
	         "Not a -g option that specifies a debug-info level");

	  const bool IsLevel0 =
	      Opt.matches(options::OPT_g0) || Opt.matches(options::OPT_ggdb0);
	  if (IsLevel0)
	    return codegenoptions::NoDebugInfo;

	  const bool IsLineTablesOnly = Opt.matches(options::OPT_gline_tables_only) ||
	                                Opt.matches(options::OPT_ggdb1);
	  if (IsLineTablesOnly)
	    return codegenoptions::DebugLineTablesOnly;

	  if (Opt.matches(options::OPT_gline_directives_only))
	    return codegenoptions::DebugDirectivesOnly;

	  return codegenoptions::DebugInfoConstructor;
	}

	/// Return true when the target always needs a frame pointer in non-leaf
	/// functions. This holds only for the arm and thumb architectures on Darwin,
	/// where the frame pointer aids offline debugging via backtraces.
	static bool mustUseNonLeafFramePointerForTarget(const llvm::Triple &Triple) {
	  const auto Arch = Triple.getArch();
	  const bool IsArmOrThumb =
	      Arch == llvm::Triple::arm || Arch == llvm::Triple::thumb;
	  return IsArmOrThumb && Triple.isOSDarwin();
	}

	/// Return whether frame pointers are kept by default for \p Triple, before
	/// any explicit -f[no-]omit-frame-pointer option is taken into account.
	///
	/// The checks run in order: -pg without -mfentry, then architecture-wide
	/// rules, then NetBSD, then Linux/CloudABI/Hurd, then Windows. Anything that
	/// matches none of them keeps the frame pointer.
	static bool useFramePointerForTargetByDefault(const ArgList &Args,
	const llvm::Triple &Triple) {
	// -pg keeps the frame pointer unless -mfentry is also given.
	if (Args.hasArg(options::OPT_pg) && !Args.hasArg(options::OPT_mfentry))
	return true;

	// Rules that depend only on the architecture, regardless of OS.
	switch (Triple.getArch()) {
	case llvm::Triple::xcore:
	case llvm::Triple::wasm32:
	case llvm::Triple::wasm64:
	case llvm::Triple::msp430:
	// XCore never wants frame pointers, regardless of OS.
	// WebAssembly never wants frame pointers.
	// MSP430 shares this case and is treated the same way.
	return false;
	case llvm::Triple::ppc:
	case llvm::Triple::ppc64:
	case llvm::Triple::ppc64le:
	case llvm::Triple::riscv32:
	case llvm::Triple::riscv64:
	case llvm::Triple::amdgcn:
	case llvm::Triple::r600:
	// Keep the frame pointer only when not optimizing.
	return !areOptimizationsEnabled(Args);
	default:
	break;
	}

	// NetBSD: keep the frame pointer only when not optimizing, for every
	// architecture that reaches this point.
	if (Triple.isOSNetBSD()) {
	return !areOptimizationsEnabled(Args);
	}

	if (Triple.isOSLinux() \|\| Triple.getOS() == llvm::Triple::CloudABI \|\|
	Triple.isOSHurd()) {
	switch (Triple.getArch()) {
	// Don't use a frame pointer on linux if optimizing for certain targets.
	case llvm::Triple::arm:
	case llvm::Triple::armeb:
	case llvm::Triple::thumb:
	case llvm::Triple::thumbeb:
	// Android on ARM always keeps the frame pointer; other ARM targets fall
	// through to the optimization-dependent rule below.
	if (Triple.isAndroid())
	return true;
	LLVM_FALLTHROUGH;
	case llvm::Triple::mips64:
	case llvm::Triple::mips64el:
	case llvm::Triple::mips:
	case llvm::Triple::mipsel:
	case llvm::Triple::systemz:
	case llvm::Triple::x86:
	case llvm::Triple::x86_64:
	return !areOptimizationsEnabled(Args);
	default:
	return true;
	}
	}

	if (Triple.isOSWindows()) {
	switch (Triple.getArch()) {
	case llvm::Triple::x86:
	return !areOptimizationsEnabled(Args);
	case llvm::Triple::x86_64:
	// On x86_64 the frame pointer is kept only for the Mach-O binary format.
	return Triple.isOSBinFormatMachO();
	case llvm::Triple::arm:
	case llvm::Triple::thumb:
	// Windows on ARM builds with FPO disabled to aid fast stack walking
	return true;
	default:
	// All other supported Windows ISAs use xdata unwind information, so frame
	// pointers are not generally useful.
	return false;
	}
	}

	// Every remaining OS/architecture combination keeps the frame pointer.
	return true;
	}

	/// Compute which frame pointers are retained, from the frame-pointer options
	/// in \p Args and the defaults for \p Triple.
	///
	/// There are four conceivable states:
	///
	///   00) leaf retained, non-leaf retained
	///   01) leaf retained, non-leaf omitted (this is invalid)
	///   10) leaf omitted,  non-leaf retained
	///       (what -momit-leaf-frame-pointer was designed for)
	///   11) leaf omitted,  non-leaf omitted
	///
	/// "omit" options taking precedence over "no-omit" options is the only way
	/// to make the 3 valid states representable.
	static CodeGenOptions::FramePointerKind
	getFramePointerKind(const ArgList &Args, const llvm::Triple &Triple) {
	  // At most one of these is true: it reflects which of the two general
	  // frame-pointer options, if either, appears last.
	  bool OmitFP = false;
	  bool NoOmitFP = false;
	  if (Arg *A = Args.getLastArg(options::OPT_fomit_frame_pointer,
	                               options::OPT_fno_omit_frame_pointer)) {
	    OmitFP = A->getOption().matches(options::OPT_fomit_frame_pointer);
	    NoOmitFP = A->getOption().matches(options::OPT_fno_omit_frame_pointer);
	  }

	  // True when leaf functions drop their frame pointer; this is the default
	  // on AArch64 and PS4.
	  const bool OmitLeafFP =
	      Args.hasFlag(options::OPT_momit_leaf_frame_pointer,
	                   options::OPT_mno_omit_leaf_frame_pointer,
	                   Triple.isAArch64() || Triple.isPS4CPU());

	  const bool RetainNonLeafFP =
	      NoOmitFP || mustUseNonLeafFramePointerForTarget(Triple) ||
	      (!OmitFP && useFramePointerForTargetByDefault(Args, Triple));
	  if (!RetainNonLeafFP)
	    return CodeGenOptions::FramePointerKind::None;

	  return OmitLeafFP ? CodeGenOptions::FramePointerKind::NonLeaf
	                    : CodeGenOptions::FramePointerKind::All;
	}

	/// Add the CC1 option that records the debug compilation directory.
	///
	/// The directory comes from the last -fdebug-compilation-dir argument when
	/// present, otherwise from the current working directory reported by \p VFS.
	/// If neither is available, nothing is added.
	static void addDebugCompDirArg(const ArgList &Args, ArgStringList &CmdArgs,
	                               const llvm::vfs::FileSystem &VFS) {
	  const char *CompDir = nullptr;

	  if (Arg *DirArg = Args.getLastArg(options::OPT_fdebug_compilation_dir))
	    CompDir = DirArg->getValue();
	  else if (llvm::ErrorOr<std::string> CWD = VFS.getCurrentWorkingDirectory())
	    CompDir = Args.MakeArgString(*CWD);

	  if (!CompDir)
	    return;

	  CmdArgs.push_back("-fdebug-compilation-dir");
	  CmdArgs.push_back(CompDir);
	}

	/// Add the CC1 / CC1AS options that carry the debug file path prefix map.
	///
	/// Every -ffile-prefix-map= and -fdebug-prefix-map= value that contains an
	/// '=' is forwarded as -fdebug-prefix-map=<value>; a value without one is
	/// diagnosed instead. Each visited argument is claimed either way.
	static void addDebugPrefixMapArg(const Driver &D, const ArgList &Args,
	                                 ArgStringList &CmdArgs) {
	  for (const Arg *MapArg : Args.filtered(options::OPT_ffile_prefix_map_EQ,
	                                         options::OPT_fdebug_prefix_map_EQ)) {
	    StringRef Mapping = MapArg->getValue();
	    const bool HasSeparator = Mapping.find('=') != StringRef::npos;

	    if (HasSeparator) {
	      CmdArgs.push_back(Args.MakeArgString("-fdebug-prefix-map=" + Mapping));
	    } else {
	      D.Diag(diag::err_drv_invalid_argument_to_option)
	          << Mapping << MapArg->getOption().getName();
	    }

	    MapArg->claim();
	  }
	}

	/// Add the CC1 / CC1AS options that carry the macro file path prefix map.
	///
	/// Every -ffile-prefix-map= and -fmacro-prefix-map= value that contains an
	/// '=' is forwarded as -fmacro-prefix-map=<value>; a value without one is
	/// diagnosed instead. Each visited argument is claimed either way.
	static void addMacroPrefixMapArg(const Driver &D, const ArgList &Args,
	                                 ArgStringList &CmdArgs) {
	  for (const Arg *MapArg : Args.filtered(options::OPT_ffile_prefix_map_EQ,
	                                         options::OPT_fmacro_prefix_map_EQ)) {
	    StringRef Mapping = MapArg->getValue();
	    const bool HasSeparator = Mapping.find('=') != StringRef::npos;

	    if (HasSeparator) {
	      CmdArgs.push_back(Args.MakeArgString("-fmacro-prefix-map=" + Mapping));
	    } else {
	      D.Diag(diag::err_drv_invalid_argument_to_option)
	          << Mapping << MapArg->getOption().getName();
	    }

	    MapArg->claim();
	  }
	}

	/// Vectorize at all optimization levels greater than 1 except for -Oz.
	/// For -Oz the loop vectorizer is disabled, while the slp vectorizer is
	/// enabled.
	static bool shouldEnableVectorizerAtOLevel(const ArgList &Args, bool isSlpVec) {
	if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
	if (A->getOption().matches(options::OPT_O4) \|\|
	A->getOption().matches(options::OPT_Ofast))
	return true;

	if (A->getOption().matches(options::OPT_O0))
	return false;

	assert(A->getOption().matches(options::OPT_O) && "Must have a -O flag");

	// Vectorize -Os.
	StringRef S(A->getValue());
	if (S == "s")
	return true;

	// Don't vectorize -Oz, unless it's the slp vectorizer.
	if (S == "z")
	return isSlpVec;

	unsigned OptLevel = 0;
	if (S.getAsInteger(10, OptLevel))
	return false;

	return OptLevel > 1;
	}

	return false;
	}

	/// Append "-x <lang>" for \p Input to \p CmdArgs.
	///
	/// Nothing is added when -verify-pch is given and the input has type TY_PCH.
	/// With -rewrite-objc the language is always that of TY_PP_ObjCXX. Otherwise
	/// the driver type name is used, except that the two C++ module interface
	/// types are mapped to their plain C++ counterparts.
	static void addDashXForInput(const ArgList &Args, const InputInfo &Input,
	                             ArgStringList &CmdArgs) {
	  // When using -verify-pch, we don't want to provide the type
	  // 'precompiled-header' if it was inferred from the file extension.
	  if (Args.hasArg(options::OPT_verify_pch) && Input.getType() == types::TY_PCH)
	    return;

	  const char *ClangType;
	  if (Args.hasArg(options::OPT_rewrite_objc)) {
	    ClangType = types::getTypeName(types::TY_PP_ObjCXX);
	  } else if (Input.getType() == types::TY_CXXModule) {
	    // The frontend has no separate notion of a module interface unit.
	    ClangType = "c++";
	  } else if (Input.getType() == types::TY_PP_CXXModule) {
	    ClangType = "c++-cpp-output";
	  } else {
	    // Identity mapping from the driver type to the frontend type.
	    ClangType = types::getTypeName(Input.getType());
	  }

	  CmdArgs.push_back("-x");
	  CmdArgs.push_back(ClangType);
	}

	/// Translate the profile-guided-optimization and coverage driver options
	/// into cc1 arguments appended to \p CmdArgs.
	///
	/// The function proceeds in stages: resolve which profile-generation
	/// options are in effect, diagnose conflicting combinations, emit the
	/// -fprofile-instrument* flags, emit the profile-use path, and finally
	/// emit the coverage flags together with the .gcno/.gcda file names.
	/// Diagnostics are reported through \p D; \p Output and \p C are consulted
	/// only to derive the coverage file names.
	static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C,
	const Driver &D, const InputInfo &Output,
	const ArgList &Args,
	ArgStringList &CmdArgs) {

	// For each family below, take the last of the positive and negative
	// spellings; a negative spelling that comes last cancels the option.
	auto *PGOGenerateArg = Args.getLastArg(options::OPT_fprofile_generate,
	options::OPT_fprofile_generate_EQ,
	options::OPT_fno_profile_generate);
	if (PGOGenerateArg &&
	PGOGenerateArg->getOption().matches(options::OPT_fno_profile_generate))
	PGOGenerateArg = nullptr;

	// Note that -fno-profile-generate also cancels -fcs-profile-generate.
	auto *CSPGOGenerateArg = Args.getLastArg(options::OPT_fcs_profile_generate,
	options::OPT_fcs_profile_generate_EQ,
	options::OPT_fno_profile_generate);
	if (CSPGOGenerateArg &&
	CSPGOGenerateArg->getOption().matches(options::OPT_fno_profile_generate))
	CSPGOGenerateArg = nullptr;

	auto *ProfileGenerateArg = Args.getLastArg(
	options::OPT_fprofile_instr_generate,
	options::OPT_fprofile_instr_generate_EQ,
	options::OPT_fno_profile_instr_generate);
	if (ProfileGenerateArg &&
	ProfileGenerateArg->getOption().matches(
	options::OPT_fno_profile_instr_generate))
	ProfileGenerateArg = nullptr;

	// Diagnose mutually exclusive combinations. Processing continues after a
	// diagnostic is emitted.
	if (PGOGenerateArg && ProfileGenerateArg)
	D.Diag(diag::err_drv_argument_not_allowed_with)
	<< PGOGenerateArg->getSpelling() << ProfileGenerateArg->getSpelling();

	auto *ProfileUseArg = getLastProfileUseArg(Args);

	if (PGOGenerateArg && ProfileUseArg)
	D.Diag(diag::err_drv_argument_not_allowed_with)
	<< ProfileUseArg->getSpelling() << PGOGenerateArg->getSpelling();

	if (ProfileGenerateArg && ProfileUseArg)
	D.Diag(diag::err_drv_argument_not_allowed_with)
	<< ProfileGenerateArg->getSpelling() << ProfileUseArg->getSpelling();

	if (CSPGOGenerateArg && PGOGenerateArg)
	D.Diag(diag::err_drv_argument_not_allowed_with)
	<< CSPGOGenerateArg->getSpelling() << PGOGenerateArg->getSpelling();

	// -fprofile-instr-generate[=<path>]: Clang-level instrumentation.
	if (ProfileGenerateArg) {
	if (ProfileGenerateArg->getOption().matches(
	options::OPT_fprofile_instr_generate_EQ))
	CmdArgs.push_back(Args.MakeArgString(Twine("-fprofile-instrument-path=") +
	ProfileGenerateArg->getValue()));
	// The default is to use Clang Instrumentation.
	CmdArgs.push_back("-fprofile-instrument=clang");
	if (TC.getTriple().isWindowsMSVCEnvironment()) {
	// Add dependent lib for clang_rt.profile
	CmdArgs.push_back(Args.MakeArgString(
	"--dependent-lib=" + TC.getCompilerRTBasename(Args, "profile")));
	}
	}

	// -fprofile-generate and -fcs-profile-generate select the "llvm" and
	// "csllvm" instrumentation respectively; PGOGenArg is whichever is active.
	// NOTE(review): when both options are given, a diagnostic is emitted above
	// but the asserts below are still reached; confirm this is acceptable in
	// assertion-enabled builds.
	Arg *PGOGenArg = nullptr;
	if (PGOGenerateArg) {
	assert(!CSPGOGenerateArg);
	PGOGenArg = PGOGenerateArg;
	CmdArgs.push_back("-fprofile-instrument=llvm");
	}
	if (CSPGOGenerateArg) {
	assert(!PGOGenerateArg);
	PGOGenArg = CSPGOGenerateArg;
	CmdArgs.push_back("-fprofile-instrument=csllvm");
	}
	if (PGOGenArg) {
	if (TC.getTriple().isWindowsMSVCEnvironment()) {
	// Add dependent lib for clang_rt.profile
	CmdArgs.push_back(Args.MakeArgString(
	"--dependent-lib=" + TC.getCompilerRTBasename(Args, "profile")));
	}
	// The "=<dir>" form names a directory; the raw profile is written to
	// default_%m.profraw inside it.
	if (PGOGenArg->getOption().matches(
	PGOGenerateArg ? options::OPT_fprofile_generate_EQ
	: options::OPT_fcs_profile_generate_EQ)) {
	SmallString<128> Path(PGOGenArg->getValue());
	llvm::sys::path::append(Path, "default_%m.profraw");
	CmdArgs.push_back(
	Args.MakeArgString(Twine("-fprofile-instrument-path=") + Path));
	}
	}

	// Profile use: -fprofile-instr-use=<file> is passed through verbatim. For
	// -fprofile-use=<path> and -fprofile-instr-use, an empty path or a path
	// that names a directory gets "default.profdata" appended.
	if (ProfileUseArg) {
	if (ProfileUseArg->getOption().matches(options::OPT_fprofile_instr_use_EQ))
	CmdArgs.push_back(Args.MakeArgString(
	Twine("-fprofile-instrument-use-path=") + ProfileUseArg->getValue()));
	else if ((ProfileUseArg->getOption().matches(
	options::OPT_fprofile_use_EQ) \|\|
	ProfileUseArg->getOption().matches(
	options::OPT_fprofile_instr_use))) {
	SmallString<128> Path(
	ProfileUseArg->getNumValues() == 0 ? "" : ProfileUseArg->getValue());
	if (Path.empty() \|\| llvm::sys::fs::is_directory(Path))
	llvm::sys::path::append(Path, "default.profdata");
	CmdArgs.push_back(
	Args.MakeArgString(Twine("-fprofile-instrument-use-path=") + Path));
	}
	}

	// Coverage notes are requested by -ftest-coverage or --coverage; whether
	// coverage data is emitted is decided by the toolchain.
	bool EmitCovNotes = Args.hasFlag(options::OPT_ftest_coverage,
	options::OPT_fno_test_coverage, false) \|\|
	Args.hasArg(options::OPT_coverage);
	bool EmitCovData = TC.needsGCovInstrumentation(Args);
	if (EmitCovNotes)
	CmdArgs.push_back("-femit-coverage-notes");
	if (EmitCovData)
	CmdArgs.push_back("-femit-coverage-data");

	// -fcoverage-mapping requires -fprofile-instr-generate; the flag is still
	// forwarded after the diagnostic.
	if (Args.hasFlag(options::OPT_fcoverage_mapping,
	options::OPT_fno_coverage_mapping, false)) {
	if (!ProfileGenerateArg)
	D.Diag(clang::diag::err_drv_argument_only_allowed_with)
	<< "-fcoverage-mapping"
	<< "-fprofile-instr-generate";

	CmdArgs.push_back("-fcoverage-mapping");
	}

	// -fprofile-exclude-files= and -fprofile-filter-files= are only allowed
	// together with --coverage; both are forwarded even after the diagnostic.
	if (Args.hasArg(options::OPT_fprofile_exclude_files_EQ)) {
	auto *Arg = Args.getLastArg(options::OPT_fprofile_exclude_files_EQ);
	if (!Args.hasArg(options::OPT_coverage))
	D.Diag(clang::diag::err_drv_argument_only_allowed_with)
	<< "-fprofile-exclude-files="
	<< "--coverage";

	StringRef v = Arg->getValue();
	CmdArgs.push_back(
	Args.MakeArgString(Twine("-fprofile-exclude-files=" + v)));
	}

	if (Args.hasArg(options::OPT_fprofile_filter_files_EQ)) {
	auto *Arg = Args.getLastArg(options::OPT_fprofile_filter_files_EQ);
	if (!Args.hasArg(options::OPT_coverage))
	D.Diag(clang::diag::err_drv_argument_only_allowed_with)
	<< "-fprofile-filter-files="
	<< "--coverage";

	StringRef v = Arg->getValue();
	CmdArgs.push_back(Args.MakeArgString(Twine("-fprofile-filter-files=" + v)));
	}

	// Leave -fprofile-dir= an unused argument unless .gcda emission is
	// enabled. To be polite, with '-fprofile-arcs -fno-profile-arcs' consider
	// the flag used. There is no -fno-profile-dir, so the user has no
	// targeted way to suppress the warning.
	Arg *FProfileDir = nullptr;
	if (Args.hasArg(options::OPT_fprofile_arcs) \|\|
	Args.hasArg(options::OPT_coverage))
	FProfileDir = Args.getLastArg(options::OPT_fprofile_dir);

	// Put the .gcno and .gcda files (if needed) next to the object file or
	// bitcode file in the case of LTO.
	// FIXME: There should be a simpler way to find the object file for this
	// input, and this code probably does the wrong thing for commands that
	// compile and link all at once.
	if ((Args.hasArg(options::OPT_c) \|\| Args.hasArg(options::OPT_S)) &&
	(EmitCovNotes \|\| EmitCovData) && Output.isFilename()) {
	// Base name: the /Fo value, else the -o value, else the file name of the
	// base input.
	SmallString<128> OutputFilename;
	if (Arg *FinalOutput = C.getArgs().getLastArg(options::OPT__SLASH_Fo))
	OutputFilename = FinalOutput->getValue();
	else if (Arg *FinalOutput = C.getArgs().getLastArg(options::OPT_o))
	OutputFilename = FinalOutput->getValue();
	else
	OutputFilename = llvm::sys::path::filename(Output.getBaseInput());
	SmallString<128> CoverageFilename = OutputFilename;
	// A relative name is made absolute; the result of makeAbsolute is
	// deliberately ignored.
	if (llvm::sys::path::is_relative(CoverageFilename))
	(void)D.getVFS().makeAbsolute(CoverageFilename);
	llvm::sys::path::replace_extension(CoverageFilename, "gcno");

	// The notes file name is passed whenever this block is entered.
	CmdArgs.push_back("-coverage-notes-file");
	CmdArgs.push_back(Args.MakeArgString(CoverageFilename));

	if (EmitCovData) {
	// With -fprofile-dir the data file is placed under that directory,
	// followed by the output file name.
	if (FProfileDir) {
	CoverageFilename = FProfileDir->getValue();
	llvm::sys::path::append(CoverageFilename, OutputFilename);
	}
	llvm::sys::path::replace_extension(CoverageFilename, "gcda");
	CmdArgs.push_back("-coverage-data-file");
	CmdArgs.push_back(Args.MakeArgString(CoverageFilename));
	}
	}
	}

	/// Check whether the given input tree contains any compilation actions.
	static bool ContainsCompileAction(const Action *A) {
	if (isa<CompileJobAction>(A) \|\| isa<BackendJobAction>(A))
	return true;

	for (const auto &AI : A->inputs())
	if (ContainsCompileAction(AI))
	return true;

	return false;
	}

	/// Check if -relax-all should be passed to the internal assembler.
	///
	/// An explicit -mrelax-all / -mno-relax-all always decides. Otherwise it is
	/// enabled when the last -O option is -O0 (or no -O option was given) and at
	/// least one action of the compilation contains a compile or backend job.
	static bool UseRelaxAll(Compilation &C, const ArgList &Args) {
	  Arg *OptArg = Args.getLastArg(options::OPT_O_Group);
	  const bool Unoptimized =
	      !OptArg || OptArg->getOption().matches(options::OPT_O0);

	  // Only look through the actions when the optimization level allows it.
	  bool HasCompileAction = false;
	  if (Unoptimized) {
	    for (const auto &Act : C.getActions()) {
	      if (!ContainsCompileAction(Act))
	        continue;
	      HasCompileAction = true;
	      break;
	    }
	  }

	  const bool RelaxDefault = Unoptimized && HasCompileAction;
	  return Args.hasFlag(options::OPT_mrelax_all, options::OPT_mno_relax_all,
	                      RelaxDefault);
	}

	// Extract the integer N from a string spelled exactly "-gdwarf-N", for N in
	// 2..5. Any other string yields 0. The input is a StringRef (rather than an
	// Arg) so that the "-Xassembler" option parser can use it as well.
	static unsigned DwarfVersionNum(StringRef ArgValue) {
	  if (ArgValue == "-gdwarf-2")
	    return 2;
	  if (ArgValue == "-gdwarf-3")
	    return 3;
	  if (ArgValue == "-gdwarf-4")
	    return 4;
	  if (ArgValue == "-gdwarf-5")
	    return 5;
	  return 0;
	}

	/// Append the cc1 arguments that enable debug info to \p CmdArgs.
	///
	/// Up to three arguments are produced, in this order: -debug-info-kind=
	/// for the kinds listed below, -dwarf-version= when \p DwarfVersion is
	/// non-zero, and -debugger-tuning= for the GDB, LLDB and SCE tunings.
	/// Unlisted kinds and tunings produce no argument.
	static void RenderDebugEnablingArgs(const ArgList &Args, ArgStringList &CmdArgs,
	                                    codegenoptions::DebugInfoKind DebugInfoKind,
	                                    unsigned DwarfVersion,
	                                    llvm::DebuggerKind DebuggerTuning) {
	  const char *KindFlag = nullptr;
	  switch (DebugInfoKind) {
	  case codegenoptions::DebugDirectivesOnly:
	    KindFlag = "-debug-info-kind=line-directives-only";
	    break;
	  case codegenoptions::DebugLineTablesOnly:
	    KindFlag = "-debug-info-kind=line-tables-only";
	    break;
	  case codegenoptions::DebugInfoConstructor:
	    KindFlag = "-debug-info-kind=constructor";
	    break;
	  case codegenoptions::LimitedDebugInfo:
	    KindFlag = "-debug-info-kind=limited";
	    break;
	  case codegenoptions::FullDebugInfo:
	    KindFlag = "-debug-info-kind=standalone";
	    break;
	  default:
	    break;
	  }
	  if (KindFlag)
	    CmdArgs.push_back(KindFlag);

	  if (DwarfVersion > 0)
	    CmdArgs.push_back(
	        Args.MakeArgString("-dwarf-version=" + Twine(DwarfVersion)));

	  const char *TuningFlag = nullptr;
	  switch (DebuggerTuning) {
	  case llvm::DebuggerKind::GDB:
	    TuningFlag = "-debugger-tuning=gdb";
	    break;
	  case llvm::DebuggerKind::LLDB:
	    TuningFlag = "-debugger-tuning=lldb";
	    break;
	  case llvm::DebuggerKind::SCE:
	    TuningFlag = "-debugger-tuning=sce";
	    break;
	  default:
	    break;
	  }
	  if (TuningFlag)
	    CmdArgs.push_back(TuningFlag);
	}

	/// Return whether toolchain \p TC supports the debug-info option \p A.
	///
	/// When it does not, a warn_drv_unsupported_debug_info_opt_for_target
	/// warning naming the option and the target triple is emitted through \p D.
	/// \p A must not be null.
	static bool checkDebugInfoOption(const Arg *A, const ArgList &Args,
	                                 const Driver &D, const ToolChain &TC) {
	  assert(A && "Expected non-nullptr argument.");

	  const bool Supported = TC.supportsDebugInfoOption(A);
	  if (!Supported)
	    D.Diag(diag::warn_drv_unsupported_debug_info_opt_for_target)
	        << A->getAsString(Args) << TC.getTripleString();
	  return Supported;
	}

	/// Translate the last -gz / -gz=<value> option into a
	/// --compress-debug-sections argument.
	///
	/// Nothing is emitted when the option is absent or the toolchain rejects it.
	/// "none" is always accepted. Plain -gz and the values "zlib" / "zlib-gnu"
	/// need zlib support, and a warning is issued instead when it is unavailable.
	/// Any other value is reported as an unsupported option argument.
	static void RenderDebugInfoCompressionArgs(const ArgList &Args,
	                                           ArgStringList &CmdArgs,
	                                           const Driver &D,
	                                           const ToolChain &TC) {
	  const Arg *A = Args.getLastArg(options::OPT_gz, options::OPT_gz_EQ);
	  if (!A || !checkDebugInfoOption(A, Args, D, TC))
	    return;

	  // Plain -gz carries no value; -gz=<value> has to be validated first.
	  const bool IsBareGz = A->getOption().getID() == options::OPT_gz;
	  StringRef Value;
	  if (!IsBareGz) {
	    Value = A->getValue();

	    if (Value == "none") {
	      CmdArgs.push_back("--compress-debug-sections=none");
	      return;
	    }

	    if (Value != "zlib" && Value != "zlib-gnu") {
	      D.Diag(diag::err_drv_unsupported_option_argument)
	          << A->getOption().getName() << Value;
	      return;
	    }
	  }

	  // Every remaining form asks for zlib compression.
	  if (!llvm::zlib::isAvailable()) {
	    D.Diag(diag::warn_debug_compression_unavailable);
	    return;
	  }

	  if (IsBareGz)
	    CmdArgs.push_back("--compress-debug-sections");
	  else
	    CmdArgs.push_back(
	        Args.MakeArgString("--compress-debug-sections=" + Twine(Value)));
	}

	/// Return the textual name of relocation model \p Model. A value outside
	/// the cases handled below is treated as unreachable.
	static const char *RelocationModelName(llvm::Reloc::Model Model) {
	  const char *Name = nullptr;
	  switch (Model) {
	  case llvm::Reloc::Static:
	    Name = "static";
	    break;
	  case llvm::Reloc::PIC_:
	    Name = "pic";
	    break;
	  case llvm::Reloc::DynamicNoPIC:
	    Name = "dynamic-no-pic";
	    break;
	  case llvm::Reloc::ROPI:
	    Name = "ropi";
	    break;
	  case llvm::Reloc::RWPI:
	    Name = "rwpi";
	    break;
	  case llvm::Reloc::ROPI_RWPI:
	    Name = "ropi-rwpi";
	    break;
	  }

	  if (Name)
	    return Name;
	  llvm_unreachable("Unknown Reloc::Model kind");
	}

	void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
	const Driver &D, const ArgList &Args,
	ArgStringList &CmdArgs,
	const InputInfo &Output,
	const InputInfoList &Inputs) const {
	const bool IsIAMCU = getToolChain().getTriple().isOSIAMCU();

	CheckPreprocessingOptions(D, Args);

	Args.AddLastArg(CmdArgs, options::OPT_C);
	Args.AddLastArg(CmdArgs, options::OPT_CC);

	// Handle dependency file generation.
	Arg *ArgM = Args.getLastArg(options::OPT_MM);
	if (!ArgM)
	ArgM = Args.getLastArg(options::OPT_M);
	Arg *ArgMD = Args.getLastArg(options::OPT_MMD);
	if (!ArgMD)
	ArgMD = Args.getLastArg(options::OPT_MD);

	// -M and -MM imply -w.
	if (ArgM)
	CmdArgs.push_back("-w");
	else
	ArgM = ArgMD;

	if (ArgM) {
	// Determine the output location.
	const char *DepFile;
	if (Arg *MF = Args.getLastArg(options::OPT_MF)) {
	DepFile = MF->getValue();
	C.addFailureResultFile(DepFile, &JA);
	} else if (Output.getType() == types::TY_Dependencies) {
	DepFile = Output.getFilename();
	} else if (!ArgMD) {
	DepFile = "-";
	} else {
	DepFile = getDependencyFileName(Args, Inputs);
	C.addFailureResultFile(DepFile, &JA);
	}
	CmdArgs.push_back("-dependency-file");
	CmdArgs.push_back(DepFile);

	bool HasTarget = false;
	for (const Arg *A : Args.filtered(options::OPT_MT, options::OPT_MQ)) {
	HasTarget = true;
	A->claim();
	if (A->getOption().matches(options::OPT_MT)) {
	A->render(Args, CmdArgs);
	} else {
	CmdArgs.push_back("-MT");
	SmallString<128> Quoted;
	QuoteTarget(A->getValue(), Quoted);
	CmdArgs.push_back(Args.MakeArgString(Quoted));
	}
	}

	// Add a default target if one wasn't specified.
	if (!HasTarget) {
	const char *DepTarget;

	// If user provided -o, that is the dependency target, except
	// when we are only generating a dependency file.
	Arg *OutputOpt = Args.getLastArg(options::OPT_o);
	if (OutputOpt && Output.getType() != types::TY_Dependencies) {
	DepTarget = OutputOpt->getValue();
	} else {
	// Otherwise derive from the base input.
	//
	// FIXME: This should use the computed output file location.
	SmallString<128> P(Inputs[0].getBaseInput());
	llvm::sys::path::replace_extension(P, "o");
	DepTarget = Args.MakeArgString(llvm::sys::path::filename(P));
	}

	CmdArgs.push_back("-MT");
	SmallString<128> Quoted;
	QuoteTarget(DepTarget, Quoted);
	CmdArgs.push_back(Args.MakeArgString(Quoted));
	}

	if (ArgM->getOption().matches(options::OPT_M) \|\|
	ArgM->getOption().matches(options::OPT_MD))
	CmdArgs.push_back("-sys-header-deps");
	if ((isa<PrecompileJobAction>(JA) &&
	!Args.hasArg(options::OPT_fno_module_file_deps)) \|\|
	Args.hasArg(options::OPT_fmodule_file_deps))
	CmdArgs.push_back("-module-file-deps");
	}

	if (Args.hasArg(options::OPT_MG)) {
	if (!ArgM \|\| ArgM->getOption().matches(options::OPT_MD) \|\|
	ArgM->getOption().matches(options::OPT_MMD))
	D.Diag(diag::err_drv_mg_requires_m_or_mm);
	CmdArgs.push_back("-MG");
	}

	Args.AddLastArg(CmdArgs, options::OPT_MP);
	Args.AddLastArg(CmdArgs, options::OPT_MV);

	// Add offload include arguments specific for CUDA/HIP. This must happen
	// before we -I or -include anything else, because we must pick up the
	// CUDA/HIP headers from the particular CUDA/ROCm installation, rather than
	// from e.g. /usr/local/include.
	if (JA.isOffloading(Action::OFK_Cuda))
	getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
	if (JA.isOffloading(Action::OFK_HIP))
	getToolChain().AddHIPIncludeArgs(Args, CmdArgs);

	// If we are offloading to a target via OpenMP we need to include the
	// openmp_wrappers folder which contains alternative system headers.
	if (JA.isDeviceOffloading(Action::OFK_OpenMP) &&
	getToolChain().getTriple().isNVPTX()){
	if (!Args.hasArg(options::OPT_nobuiltininc)) {
	// Add openmp_wrappers/* to our system include path. This lets us wrap
	// standard library headers.
	SmallString<128> P(D.ResourceDir);
	llvm::sys::path::append(P, "include");
	llvm::sys::path::append(P, "openmp_wrappers");
	CmdArgs.push_back("-internal-isystem");
	CmdArgs.push_back(Args.MakeArgString(P));
	}

	CmdArgs.push_back("-include");
	CmdArgs.push_back("__clang_openmp_device_functions.h");
	}

	// Add -i* options, and automatically translate to
	// -include-pch/-include-pth for transparent PCH support. It's
	// wonky, but we include looking for .gch so we can support seamless
	// replacement into a build system already set up to be generating
	// .gch files.

	if (getToolChain().getDriver().IsCLMode()) {
	const Arg *YcArg = Args.getLastArg(options::OPT__SLASH_Yc);
	const Arg *YuArg = Args.getLastArg(options::OPT__SLASH_Yu);
	if (YcArg && JA.getKind() >= Action::PrecompileJobClass &&
	JA.getKind() <= Action::AssembleJobClass) {
	CmdArgs.push_back(Args.MakeArgString("-building-pch-with-obj"));
	CmdArgs.push_back(Args.MakeArgString("-fpch-instantiate-templates"));
	}
	if (YcArg \|\| YuArg) {
	StringRef ThroughHeader = YcArg ? YcArg->getValue() : YuArg->getValue();
	if (!isa<PrecompileJobAction>(JA)) {
	CmdArgs.push_back("-include-pch");
	CmdArgs.push_back(Args.MakeArgString(D.GetClPchPath(
	C, !ThroughHeader.empty()
	? ThroughHeader
	: llvm::sys::path::filename(Inputs[0].getBaseInput()))));
	}

	if (ThroughHeader.empty()) {
	CmdArgs.push_back(Args.MakeArgString(
	Twine("-pch-through-hdrstop-") + (YcArg ? "create" : "use")));
	} else {
	CmdArgs.push_back(
	Args.MakeArgString(Twine("-pch-through-header=") + ThroughHeader));
	}
	}
	}

	bool RenderedImplicitInclude = false;
	for (const Arg *A : Args.filtered(options::OPT_clang_i_Group)) {
	if (A->getOption().matches(options::OPT_include)) {
	// Handling of gcc-style gch precompiled headers.
	bool IsFirstImplicitInclude = !RenderedImplicitInclude;
	RenderedImplicitInclude = true;

	bool FoundPCH = false;
	SmallString<128> P(A->getValue());
	// We want the files to have a name like foo.h.pch. Add a dummy extension
	// so that replace_extension does the right thing.
	P += ".dummy";
	llvm::sys::path::replace_extension(P, "pch");
	if (llvm::sys::fs::exists(P))
	FoundPCH = true;

	if (!FoundPCH) {
	llvm::sys::path::replace_extension(P, "gch");
	if (llvm::sys::fs::exists(P)) {
	FoundPCH = true;
	}
	}

	if (FoundPCH) {
	if (IsFirstImplicitInclude) {
	A->claim();
	CmdArgs.push_back("-include-pch");
	CmdArgs.push_back(Args.MakeArgString(P));
	continue;
	} else {
	// Ignore the PCH if not first on command line and emit warning.
	D.Diag(diag::warn_drv_pch_not_first_include) << P
	<< A->getAsString(Args);
	}
	}
	} else if (A->getOption().matches(options::OPT_isystem_after)) {
	// Handling of paths which must come late. These entries are handled by
	// the toolchain itself after the resource dir is inserted in the right
	// search order.
	// Do not claim the argument so that the use of the argument does not
	// silently go unnoticed on toolchains which do not honour the option.
	continue;
	} else if (A->getOption().matches(options::OPT_stdlibxx_isystem)) {
	// Translated to -internal-isystem by the driver, no need to pass to cc1.
	continue;
	}

	// Not translated, render as usual.
	A->claim();
	A->render(Args, CmdArgs);
	}

	Args.AddAllArgs(CmdArgs,
	{options::OPT_D, options::OPT_U, options::OPT_I_Group,
	options::OPT_F, options::OPT_index_header_map});

	// Add -Wp, and -Xpreprocessor if using the preprocessor.

	// FIXME: There is a very unfortunate problem here, some troubled
	// souls abuse -Wp, to pass preprocessor options in gcc syntax. To
	// really support that we would have to parse and then translate
	// those options. :(
	Args.AddAllArgValues(CmdArgs, options::OPT_Wp_COMMA,
	options::OPT_Xpreprocessor);

	// -I- is a deprecated GCC feature, reject it.
	if (Arg *A = Args.getLastArg(options::OPT_I_))
	D.Diag(diag::err_drv_I_dash_not_supported) << A->getAsString(Args);

	// If we have a --sysroot, and don't have an explicit -isysroot flag, add an
	// -isysroot to the CC1 invocation.
	StringRef sysroot = C.getSysRoot();
	if (sysroot != "") {
	if (!Args.hasArg(options::OPT_isysroot)) {
	CmdArgs.push_back("-isysroot");
	CmdArgs.push_back(C.getArgs().MakeArgString(sysroot));
	}
	}

	// Parse additional include paths from environment variables.
	// FIXME: We should probably sink the logic for handling these from the
	// frontend into the driver. It will allow deleting 4 otherwise unused flags.
	// CPATH - included following the user specified includes (but prior to
	// builtin and standard includes).
	addDirectoryList(Args, CmdArgs, "-I", "CPATH");
	// C_INCLUDE_PATH - system includes enabled when compiling C.
	addDirectoryList(Args, CmdArgs, "-c-isystem", "C_INCLUDE_PATH");
	// CPLUS_INCLUDE_PATH - system includes enabled when compiling C++.
	addDirectoryList(Args, CmdArgs, "-cxx-isystem", "CPLUS_INCLUDE_PATH");
	// OBJC_INCLUDE_PATH - system includes enabled when compiling ObjC.
	addDirectoryList(Args, CmdArgs, "-objc-isystem", "OBJC_INCLUDE_PATH");
	// OBJCPLUS_INCLUDE_PATH - system includes enabled when compiling ObjC++.
	addDirectoryList(Args, CmdArgs, "-objcxx-isystem", "OBJCPLUS_INCLUDE_PATH");

	// While adding the include arguments, we also attempt to retrieve the
	// arguments of related offloading toolchains or arguments that are specific
	// of an offloading programming model.

	// Add C++ include arguments, if needed.
	if (types::isCXX(Inputs[0].getType())) {
	bool HasStdlibxxIsystem = Args.hasArg(options::OPT_stdlibxx_isystem);
	forAllAssociatedToolChains(
	C, JA, getToolChain(),
	[&Args, &CmdArgs, HasStdlibxxIsystem](const ToolChain &TC) {
	HasStdlibxxIsystem ? TC.AddClangCXXStdlibIsystemArgs(Args, CmdArgs)
	: TC.AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
	});
	}

	// Add system include arguments for all targets but IAMCU.
	if (!IsIAMCU)
	forAllAssociatedToolChains(C, JA, getToolChain(),
	[&Args, &CmdArgs](const ToolChain &TC) {
	TC.AddClangSystemIncludeArgs(Args, CmdArgs);
	});
	else {
	// For IAMCU add special include arguments.
	getToolChain().AddIAMCUIncludeArgs(Args, CmdArgs);
	}

	addMacroPrefixMapArg(D, Args, CmdArgs);
	}

	// FIXME: Move to target hook.
	/// Returns true when plain 'char' is signed by default for \p Triple.
	///
	/// Every architecture that is not listed in the switch defaults to signed.
	static bool isSignedCharDefault(const llvm::Triple &Triple) {
	  switch (Triple.getArch()) {
	  default:
	    return true;

	  case llvm::Triple::aarch64:
	  case llvm::Triple::aarch64_32:
	  case llvm::Triple::aarch64_be:
	  case llvm::Triple::arm:
	  case llvm::Triple::armeb:
	  case llvm::Triple::thumb:
	  case llvm::Triple::thumbeb:
	    // ARM-family targets: signed only on Darwin and Windows.
	    return Triple.isOSDarwin() || Triple.isOSWindows();

	  case llvm::Triple::ppc:
	  case llvm::Triple::ppc64:
	    // ppc/ppc64: signed only on Darwin.
	    return Triple.isOSDarwin();

	  case llvm::Triple::hexagon:
	  case llvm::Triple::ppc64le:
	  case llvm::Triple::riscv32:
	  case llvm::Triple::riscv64:
	  case llvm::Triple::systemz:
	  case llvm::Triple::xcore:
	    // Always unsigned, regardless of OS.
	    return false;
	  }
	}

	/// Returns true when \p Args request more than one "-arch <arch>" on a Darwin
	/// target, i.e. when the compiler is run once per architecture and the
	/// results are merged with lipo into a fat binary.
	static bool hasMultipleInvocations(const llvm::Triple &Triple,
	                                   const ArgList &Args) {
	  // Multiple invocations are only a thing on Darwin; the -arch values are not
	  // even queried for other targets.
	  const bool IsDarwin = Triple.isOSDarwin();
	  return IsDarwin && Args.getAllArgValues(options::OPT_arch).size() > 1;
	}

	/// Validates the optimization-remark options.
	///
	/// Emits err_drv_invalid_output_with_multiple_archs and returns false when an
	/// explicit -foptimization-record-file is combined with a multi-architecture
	/// build (each architecture would produce its own remark file). Returns true
	/// otherwise.
	static bool checkRemarksOptions(const Driver &D, const ArgList &Args,
	                                const llvm::Triple &Triple) {
	  const bool MultiArch = ::hasMultipleInvocations(Triple, Args);
	  const bool ExplicitFile =
	      Args.getLastArg(options::OPT_foptimization_record_file_EQ) != nullptr;

	  // Nothing to complain about unless both conditions hold.
	  if (!MultiArch || !ExplicitFile)
	    return true;

	  D.Diag(diag::err_drv_invalid_output_with_multiple_archs)
	      << "-foptimization-record-file";
	  return false;
	}

	/// Appends the cc1 optimization-record options to \p CmdArgs:
	/// "-opt-record-file <file>", optionally "-opt-record-passes <regex>", and
	/// "-opt-record-format <format>".
	///
	/// The format defaults to "yaml" and is overridden by
	/// -fsave-optimization-record=<format>. When no explicit
	/// -foptimization-record-file= is given, the file name is derived from -o,
	/// the job output, or the input file, and ends in ".opt.<format>".
	static void renderRemarksOptions(const ArgList &Args, ArgStringList &CmdArgs,
	                                 const llvm::Triple &Triple,
	                                 const InputInfo &Input,
	                                 const InputInfo &Output, const JobAction &JA) {
	  StringRef Format = "yaml";
	  if (const Arg *A = Args.getLastArg(options::OPT_fsave_optimization_record_EQ))
	    Format = A->getValue();

	  CmdArgs.push_back("-opt-record-file");

	  const Arg *A = Args.getLastArg(options::OPT_foptimization_record_file_EQ);
	  if (A) {
	    // An explicit file name is forwarded untouched.
	    CmdArgs.push_back(A->getValue());
	  } else {
	    // Same predicate as the one used to validate the remark options: Darwin
	    // with more than one "-arch".
	    bool hasMultipleArchs = hasMultipleInvocations(Triple, Args);

	    SmallString<128> F;

	    if (Args.hasArg(options::OPT_c) || Args.hasArg(options::OPT_S)) {
	      if (Arg *FinalOutput = Args.getLastArg(options::OPT_o))
	        F = FinalOutput->getValue();
	    } else {
	      if (Format != "yaml" && // For YAML, keep the original behavior.
	          Triple.isOSDarwin() && // Enable this only on darwin, since it's the
	                                 // only platform supporting .dSYM bundles.
	          Output.isFilename())
	        F = Output.getFilename();
	    }

	    if (F.empty()) {
	      // Use the input filename.
	      F = llvm::sys::path::stem(Input.getBaseInput());

	      // If we're compiling for an offload architecture (i.e. a CUDA device),
	      // we need to make the file name for the device compilation different
	      // from the host compilation.
	      if (!JA.isDeviceOffloading(Action::OFK_None) &&
	          !JA.isDeviceOffloading(Action::OFK_Host)) {
	        llvm::sys::path::replace_extension(F, "");
	        F += Action::GetOffloadingFileNamePrefix(JA.getOffloadingDeviceKind(),
	                                                 Triple.normalize());
	        F += "-";
	        F += JA.getOffloadingArch();
	      }
	    }

	    // If we're having more than one "-arch", we should name the files
	    // differently so that every cc1 invocation writes to a different file.
	    // We're doing that by appending "-<arch>" with "<arch>" being the arch
	    // name from the triple.
	    if (hasMultipleArchs) {
	      // First, remember the extension.
	      SmallString<64> OldExtension = llvm::sys::path::extension(F);
	      // then, remove it.
	      llvm::sys::path::replace_extension(F, "");
	      // attach -<arch> to it.
	      F += "-";
	      F += Triple.getArchName();
	      // put back the extension.
	      llvm::sys::path::replace_extension(F, OldExtension);
	    }

	    // Swap whatever extension is left for "opt.<format>".
	    SmallString<32> Extension;
	    Extension += "opt.";
	    Extension += Format;

	    llvm::sys::path::replace_extension(F, Extension);
	    CmdArgs.push_back(Args.MakeArgString(F));
	  }

	  if (const Arg *A =
	          Args.getLastArg(options::OPT_foptimization_record_passes_EQ)) {
	    CmdArgs.push_back("-opt-record-passes");
	    CmdArgs.push_back(A->getValue());
	  }

	  if (!Format.empty()) {
	    CmdArgs.push_back("-opt-record-format");
	    // Format refers either to the "yaml" literal or to an argument value, so
	    // its data pointer is what gets forwarded.
	    CmdArgs.push_back(Format.data());
	  }
	}

	namespace {
	/// Appends "-target-abi <name>" for ARM targets to \p CmdArgs.
	///
	/// The name is the value of the last -mabi= argument when present; otherwise
	/// it is the default ABI computed from the triple and the selected CPU.
	void RenderARMABI(const llvm::Triple &Triple, const ArgList &Args,
	                  ArgStringList &CmdArgs) {
	  // Select the ABI to use.
	  // FIXME: Support -meabi.
	  // FIXME: Parts of this are duplicated in the backend, unify this somehow.
	  const char *ABIName = nullptr;
	  if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) {
	    ABIName = A->getValue();
	  } else {
	    std::string CPU = getCPUName(Args, Triple, /*FromAs*/ false);
	    ABIName = llvm::ARM::computeDefaultTargetABI(Triple, CPU).data();
	  }

	  CmdArgs.push_back("-target-abi");
	  CmdArgs.push_back(ABIName);
	}
	}

	/// Appends the ARM-specific cc1 arguments: target ABI, float ABI, the
	/// global-merge toggle, -no-implicit-float and -mcmse.
	void Clang::AddARMTargetArgs(const llvm::Triple &Triple, const ArgList &Args,
	                             ArgStringList &CmdArgs, bool KernelOrKext) const {
	  RenderARMABI(Triple, Args, CmdArgs);

	  // Work out the float ABI from the options and the target defaults.
	  const arm::FloatABI FloatABIKind = arm::getARMFloatABI(getToolChain(), Args);

	  // Fully soft: both the operations and the argument passing are soft.
	  // FIXME: This changes CPP defines, we need -target-soft-float.
	  if (FloatABIKind == arm::FloatABI::Soft)
	    CmdArgs.push_back("-msoft-float");

	  // Soft and SoftFP both pass arguments the soft way; only Hard differs.
	  const bool SoftArgPassing = FloatABIKind == arm::FloatABI::Soft ||
	                              FloatABIKind == arm::FloatABI::SoftFP;
	  assert((SoftArgPassing || FloatABIKind == arm::FloatABI::Hard) &&
	         "Invalid float abi!");
	  CmdArgs.push_back("-mfloat-abi");
	  CmdArgs.push_back(SoftArgPassing ? "soft" : "hard");

	  // Give the user explicit control over the global merge pass.
	  if (const Arg *MergeArg = Args.getLastArg(options::OPT_mglobal_merge,
	                                            options::OPT_mno_global_merge)) {
	    const bool MergeDisabled =
	        MergeArg->getOption().matches(options::OPT_mno_global_merge);
	    CmdArgs.push_back("-mllvm");
	    CmdArgs.push_back(MergeDisabled ? "-arm-global-merge=false"
	                                    : "-arm-global-merge=true");
	  }

	  // Implicit float is on unless -mno-implicit-float wins.
	  const bool ImplicitFloat = Args.hasFlag(
	      options::OPT_mimplicit_float, options::OPT_mno_implicit_float, true);
	  if (!ImplicitFloat)
	    CmdArgs.push_back("-no-implicit-float");

	  if (Args.getLastArg(options::OPT_mcmse) != nullptr)
	    CmdArgs.push_back("-mcmse");
	}

	/// Appends the target feature list, then dispatches on the toolchain
	/// architecture to the matching Add<Target>TargetArgs helper. Architectures
	/// without a case get no additional arguments.
	void Clang::RenderTargetOptions(const llvm::Triple &EffectiveTriple,
	                                const ArgList &Args, bool KernelOrKext,
	                                ArgStringList &CmdArgs) const {
	  const ToolChain &ToolChainRef = getToolChain();

	  // Target features come first.
	  getTargetFeatures(ToolChainRef.getDriver(), EffectiveTriple, Args, CmdArgs,
	                    false);

	  // Then the per-architecture flags.
	  switch (ToolChainRef.getArch()) {
	  case llvm::Triple::arm:
	  case llvm::Triple::armeb:
	  case llvm::Triple::thumb:
	  case llvm::Triple::thumbeb:
	    // The effective triple accounts for the deployment target.
	    AddARMTargetArgs(EffectiveTriple, Args, CmdArgs, KernelOrKext);
	    CmdArgs.push_back("-fallow-half-arguments-and-returns");
	    break;

	  case llvm::Triple::aarch64:
	  case llvm::Triple::aarch64_32:
	  case llvm::Triple::aarch64_be:
	    AddAArch64TargetArgs(Args, CmdArgs);
	    CmdArgs.push_back("-fallow-half-arguments-and-returns");
	    break;

	  case llvm::Triple::mips:
	  case llvm::Triple::mipsel:
	  case llvm::Triple::mips64:
	  case llvm::Triple::mips64el:
	    AddMIPSTargetArgs(Args, CmdArgs);
	    break;

	  case llvm::Triple::ppc:
	  case llvm::Triple::ppc64:
	  case llvm::Triple::ppc64le:
	    AddPPCTargetArgs(Args, CmdArgs);
	    break;

	  case llvm::Triple::riscv32:
	  case llvm::Triple::riscv64:
	    AddRISCVTargetArgs(Args, CmdArgs);
	    break;

	  case llvm::Triple::sparc:
	  case llvm::Triple::sparcel:
	  case llvm::Triple::sparcv9:
	    AddSparcTargetArgs(Args, CmdArgs);
	    break;

	  case llvm::Triple::systemz:
	    AddSystemZTargetArgs(Args, CmdArgs);
	    break;

	  case llvm::Triple::x86:
	  case llvm::Triple::x86_64:
	    AddX86TargetArgs(Args, CmdArgs);
	    break;

	  case llvm::Triple::lanai:
	    AddLanaiTargetArgs(Args, CmdArgs);
	    break;

	  case llvm::Triple::hexagon:
	    AddHexagonTargetArgs(Args, CmdArgs);
	    break;

	  case llvm::Triple::wasm32:
	  case llvm::Triple::wasm64:
	    AddWebAssemblyTargetArgs(Args, CmdArgs);
	    break;

	  case llvm::Triple::ve:
	    AddVETargetArgs(Args, CmdArgs);
	    break;

	  default:
	    break;
	  }
	}

	namespace {
	/// Appends "-target-abi <name>" for AArch64 targets to \p CmdArgs.
	///
	/// An explicit -mabi= value takes precedence; without one the ABI is
	/// "darwinpcs" on Darwin and "aapcs" everywhere else.
	void RenderAArch64ABI(const llvm::Triple &Triple, const ArgList &Args,
	                      ArgStringList &CmdArgs) {
	  const Arg *ExplicitABI = Args.getLastArg(options::OPT_mabi_EQ);
	  const char *ABIName =
	      ExplicitABI ? ExplicitABI->getValue()
	                  : (Triple.isOSDarwin() ? "darwinpcs" : "aapcs");

	  CmdArgs.push_back("-target-abi");
	  CmdArgs.push_back(ABIName);
	}
	}

	/// Appends the AArch64-specific cc1 arguments: red-zone and implicit-float
	/// control, target ABI, the Cortex-A53 835769 erratum workaround, the
	/// global-merge toggle, and return-address signing / branch-target
	/// enforcement.
	void Clang::AddAArch64TargetArgs(const ArgList &Args,
	                                 ArgStringList &CmdArgs) const {
	  const llvm::Triple &Triple = getToolChain().getEffectiveTriple();

	  // The red zone is disabled on request and for kernel / kext builds.
	  if (!Args.hasFlag(options::OPT_mred_zone, options::OPT_mno_red_zone, true) ||
	      Args.hasArg(options::OPT_mkernel) ||
	      Args.hasArg(options::OPT_fapple_kext))
	    CmdArgs.push_back("-disable-red-zone");

	  if (!Args.hasFlag(options::OPT_mimplicit_float,
	                    options::OPT_mno_implicit_float, true))
	    CmdArgs.push_back("-no-implicit-float");

	  RenderAArch64ABI(Triple, Args, CmdArgs);

	  if (Arg *A = Args.getLastArg(options::OPT_mfix_cortex_a53_835769,
	                               options::OPT_mno_fix_cortex_a53_835769)) {
	    CmdArgs.push_back("-mllvm");
	    if (A->getOption().matches(options::OPT_mfix_cortex_a53_835769))
	      CmdArgs.push_back("-aarch64-fix-cortex-a53-835769=1");
	    else
	      CmdArgs.push_back("-aarch64-fix-cortex-a53-835769=0");
	  } else if (Triple.isAndroid()) {
	    // Enabled A53 errata (835769) workaround by default on android
	    CmdArgs.push_back("-mllvm");
	    CmdArgs.push_back("-aarch64-fix-cortex-a53-835769=1");
	  }

	  // Forward the -mglobal-merge option for explicit control over the pass.
	  if (Arg *A = Args.getLastArg(options::OPT_mglobal_merge,
	                               options::OPT_mno_global_merge)) {
	    CmdArgs.push_back("-mllvm");
	    if (A->getOption().matches(options::OPT_mno_global_merge))
	      CmdArgs.push_back("-aarch64-enable-global-merge=false");
	    else
	      CmdArgs.push_back("-aarch64-enable-global-merge=true");
	  }

	  // Enable/disable return address signing and indirect branch targets.
	  if (Arg *A = Args.getLastArg(options::OPT_msign_return_address_EQ,
	                               options::OPT_mbranch_protection_EQ)) {

	    const Driver &D = getToolChain().getDriver();

	    StringRef Scope, Key;
	    bool IndirectBranches;

	    if (A->getOption().matches(options::OPT_msign_return_address_EQ)) {
	      // -msign-return-address= only selects a scope; the key is fixed to
	      // "a_key" and branch-target enforcement stays off.
	      Scope = A->getValue();
	      if (!Scope.equals("none") && !Scope.equals("non-leaf") &&
	          !Scope.equals("all"))
	        D.Diag(diag::err_invalid_branch_protection)
	            << Scope << A->getAsString(Args);
	      Key = "a_key";
	      IndirectBranches = false;
	    } else {
	      // -mbranch-protection= carries scope, key and BTI in one value.
	      StringRef Err;
	      llvm::AArch64::ParsedBranchProtection PBP;
	      if (!llvm::AArch64::parseBranchProtection(A->getValue(), PBP, Err))
	        D.Diag(diag::err_invalid_branch_protection)
	            << Err << A->getAsString(Args);
	      Scope = PBP.Scope;
	      Key = PBP.Key;
	      IndirectBranches = PBP.BranchTargetEnforcement;
	    }

	    CmdArgs.push_back(
	        Args.MakeArgString(Twine("-msign-return-address=") + Scope));
	    CmdArgs.push_back(
	        Args.MakeArgString(Twine("-msign-return-address-key=") + Key));
	    if (IndirectBranches)
	      CmdArgs.push_back("-mbranch-target-enforce");
	  }
	}

	void Clang::AddMIPSTargetArgs(const ArgList &Args,
	ArgStringList &CmdArgs) const {
	const Driver &D = getToolChain().getDriver();
	StringRef CPUName;
	StringRef ABIName;
	const llvm::Triple &Triple = getToolChain().getTriple();
	mips::getMipsCPUAndABI(Args, Triple, CPUName, ABIName);

	CmdArgs.push_back("-target-abi");
	CmdArgs.push_back(ABIName.data());

	mips::FloatABI ABI = mips::getMipsFloatABI(D, Args, Triple);
	if (ABI == mips::FloatABI::Soft) {
	// Floating point operations and argument passing are soft.
	CmdArgs.push_back("-msoft-float");
	CmdArgs.push_back("-mfloat-abi");
	CmdArgs.push_back("soft");
	} else {
	// Floating point operations and argument passing are hard.
	assert(ABI == mips::FloatABI::Hard && "Invalid float abi!");
	CmdArgs.push_back("-mfloat-abi");
	CmdArgs.push_back("hard");
	}

	if (Arg *A = Args.getLastArg(options::OPT_mldc1_sdc1,
	options::OPT_mno_ldc1_sdc1)) {
	if (A->getOption().matches(options::OPT_mno_ldc1_sdc1)) {
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back("-mno-ldc1-sdc1");
	}
	}

	if (Arg *A = Args.getLastArg(options::OPT_mcheck_zero_division,
	options::OPT_mno_check_zero_division)) {
	if (A->getOption().matches(options::OPT_mno_check_zero_division)) {
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back("-mno-check-zero-division");
	}
	}

	if (Arg *A = Args.getLastArg(options::OPT_G)) {
	StringRef v = A->getValue();
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back(Args.MakeArgString("-mips-ssection-threshold=" + v));
	A->claim();
	}

	Arg *GPOpt = Args.getLastArg(options::OPT_mgpopt, options::OPT_mno_gpopt);
	Arg *ABICalls =
	Args.getLastArg(options::OPT_mabicalls, options::OPT_mno_abicalls);

	// -mabicalls is the default for many MIPS environments, even with -fno-pic.
	// -mgpopt is the default for static, -fno-pic environments but these two
	// options conflict. We want to be certain that -mno-abicalls -mgpopt is
	// the only case where -mllvm -mgpopt is passed.
	// NOTE: We need a warning here or in the backend to warn when -mgpopt is
	// passed explicitly when compiling something with -mabicalls
	// (implictly) in affect. Currently the warning is in the backend.
	//
	// When the ABI in use is N64, we also need to determine the PIC mode that
	// is in use, as -fno-pic for N64 implies -mno-abicalls.
	bool NoABICalls =
	ABICalls && ABICalls->getOption().matches(options::OPT_mno_abicalls);

	llvm::Reloc::Model RelocationModel;
	unsigned PICLevel;
	bool IsPIE;
	std::tie(RelocationModel, PICLevel, IsPIE) =
	ParsePICArgs(getToolChain(), Args);

	NoABICalls = NoABICalls \|\|
	(RelocationModel == llvm::Reloc::Static && ABIName == "n64");

	bool WantGPOpt = GPOpt && GPOpt->getOption().matches(options::OPT_mgpopt);
	// We quietly ignore -mno-gpopt as the backend defaults to -mno-gpopt.
	if (NoABICalls && (!GPOpt \|\| WantGPOpt)) {
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back("-mgpopt");

	Arg *LocalSData = Args.getLastArg(options::OPT_mlocal_sdata,
	options::OPT_mno_local_sdata);
	Arg *ExternSData = Args.getLastArg(options::OPT_mextern_sdata,
	options::OPT_mno_extern_sdata);
	Arg *EmbeddedData = Args.getLastArg(options::OPT_membedded_data,
	options::OPT_mno_embedded_data);
	if (LocalSData) {
	CmdArgs.push_back("-mllvm");
	if (LocalSData->getOption().matches(options::OPT_mlocal_sdata)) {
	CmdArgs.push_back("-mlocal-sdata=1");
	} else {
	CmdArgs.push_back("-mlocal-sdata=0");
	}
	LocalSData->claim();
	}

	if (ExternSData) {
	CmdArgs.push_back("-mllvm");
	if (ExternSData->getOption().matches(options::OPT_mextern_sdata)) {
	CmdArgs.push_back("-mextern-sdata=1");
	} else {
	CmdArgs.push_back("-mextern-sdata=0");
	}
	ExternSData->claim();
	}

	if (EmbeddedData) {
	CmdArgs.push_back("-mllvm");
	if (EmbeddedData->getOption().matches(options::OPT_membedded_data)) {
	CmdArgs.push_back("-membedded-data=1");
	} else {
	CmdArgs.push_back("-membedded-data=0");
	}
	EmbeddedData->claim();
	}

	} else if ((!ABICalls \|\| (!NoABICalls && ABICalls)) && WantGPOpt)
	D.Diag(diag::warn_drv_unsupported_gpopt) << (ABICalls ? 0 : 1);

	if (GPOpt)
	GPOpt->claim();

	if (Arg *A = Args.getLastArg(options::OPT_mcompact_branches_EQ)) {
	StringRef Val = StringRef(A->getValue());
	if (mips::hasCompactBranches(CPUName)) {
	if (Val == "never" \|\| Val == "always" \|\| Val == "optimal") {
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back(Args.MakeArgString("-mips-compact-branches=" + Val));
	} else
	D.Diag(diag::err_drv_unsupported_option_argument)
	<< A->getOption().getName() << Val;
	} else
	D.Diag(diag::warn_target_unsupported_compact_branches) << CPUName;
	}

	if (Arg *A = Args.getLastArg(options::OPT_mrelax_pic_calls,
	options::OPT_mno_relax_pic_calls)) {
	if (A->getOption().matches(options::OPT_mno_relax_pic_calls)) {
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back("-mips-jalr-reloc=0");
	}
	}
	}

	/// Appends the PowerPC-specific cc1 arguments: -mabi=ieeelongdouble when
	/// requested, the float ABI, and "-target-abi <name>" when an ABI name was
	/// selected.
	///
	/// A default ABI name is only chosen for ELF ppc64 / ppc64le targets; an
	/// -mabi= value other than ieeelongdouble, ibmlongdouble or altivec
	/// overrides it.
	void Clang::AddPPCTargetArgs(const ArgList &Args,
	ArgStringList &CmdArgs) const {
	// Select the ABI to use.
	const char *ABIName = nullptr;
	const llvm::Triple &T = getToolChain().getTriple();
	if (T.isOSBinFormatELF()) {
	switch (getToolChain().getArch()) {
	case llvm::Triple::ppc64: {
	// When targeting a processor that supports QPX, or if QPX is
	// specifically enabled, default to using the ABI that supports QPX (so
	// long as it is not specifically disabled).
	bool HasQPX = false;
	if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
	HasQPX = A->getValue() == StringRef("a2q");
	HasQPX = Args.hasFlag(options::OPT_mqpx, options::OPT_mno_qpx, HasQPX);
	if (HasQPX) {
	ABIName = "elfv1-qpx";
	break;
	}
	-
	- if (T.isMusl() \|\| (T.isOSFreeBSD() && T.getOSMajorVersion() >= 13))
	+ if ((T.isOSFreeBSD() && T.getOSMajorVersion() >= 13) \|\|
	+ T.isOSOpenBSD() \|\| T.isMusl())
	ABIName = "elfv2";
	else
	ABIName = "elfv1";
	break;
	}
	case llvm::Triple::ppc64le:
	// Little-endian ppc64 always uses ELFv2.
	ABIName = "elfv2";
	break;
	default:
	break;
	}
	}

	// Walk every -mabi= in command-line order so that later long-double
	// selections override earlier ones.
	bool IEEELongDouble = false;
	for (const Arg *A : Args.filtered(options::OPT_mabi_EQ)) {
	StringRef V = A->getValue();
	if (V == "ieeelongdouble")
	IEEELongDouble = true;
	else if (V == "ibmlongdouble")
	IEEELongDouble = false;
	else if (V != "altivec")
	// The ppc64 linux abis are all "altivec" abis by default. Accept and ignore
	// the option if given as we don't have backend support for any targets
	// that don't use the altivec abi.
	ABIName = A->getValue();
	}
	if (IEEELongDouble)
	CmdArgs.push_back("-mabi=ieeelongdouble");

	ppc::FloatABI FloatABI =
	ppc::getPPCFloatABI(getToolChain().getDriver(), Args);

	if (FloatABI == ppc::FloatABI::Soft) {
	// Floating point operations and argument passing are soft.
	CmdArgs.push_back("-msoft-float");
	CmdArgs.push_back("-mfloat-abi");
	CmdArgs.push_back("soft");
	} else {
	// Floating point operations and argument passing are hard.
	assert(FloatABI == ppc::FloatABI::Hard && "Invalid float abi!");
	CmdArgs.push_back("-mfloat-abi");
	CmdArgs.push_back("hard");
	}

	// ABIName stays null when neither a default nor an -mabi= override applied.
	if (ABIName) {
	CmdArgs.push_back("-target-abi");
	CmdArgs.push_back(ABIName);
	}
	}

	static void SetRISCVSmallDataLimit(const ToolChain &TC, const ArgList &Args,
	ArgStringList &CmdArgs) {
	const Driver &D = TC.getDriver();
	const llvm::Triple &Triple = TC.getTriple();
	// Default small data limitation is eight.
	const char *SmallDataLimit = "8";
	// Get small data limitation.
	if (Args.getLastArg(options::OPT_shared, options::OPT_fpic,
	options::OPT_fPIC)) {
	// Not support linker relaxation for PIC.
	SmallDataLimit = "0";
	if (Args.hasArg(options::OPT_G)) {
	D.Diag(diag::warn_drv_unsupported_sdata);
	}
	} else if (Args.getLastArgValue(options::OPT_mcmodel_EQ)
	.equals_lower("large") &&
	(Triple.getArch() == llvm::Triple::riscv64)) {
	// Not support linker relaxation for RV64 with large code model.
	SmallDataLimit = "0";
	if (Args.hasArg(options::OPT_G)) {
	D.Diag(diag::warn_drv_unsupported_sdata);
	}
	} else if (Arg *A = Args.getLastArg(options::OPT_G)) {
	SmallDataLimit = A->getValue();
	}
	// Forward the -msmall-data-limit= option.
	CmdArgs.push_back("-msmall-data-limit");
	CmdArgs.push_back(SmallDataLimit);
	}

	void Clang::AddRISCVTargetArgs(const ArgList &Args,
	ArgStringList &CmdArgs) const {
	const llvm::Triple &Triple = getToolChain().getTriple();
	StringRef ABIName = riscv::getRISCVABI(Args, Triple);

	CmdArgs.push_back("-target-abi");
	CmdArgs.push_back(ABIName.data());

	SetRISCVSmallDataLimit(getToolChain(), Args, CmdArgs);
	}

	void Clang::AddSparcTargetArgs(const ArgList &Args,
	ArgStringList &CmdArgs) const {
	sparc::FloatABI FloatABI =
	sparc::getSparcFloatABI(getToolChain().getDriver(), Args);

	if (FloatABI == sparc::FloatABI::Soft) {
	// Floating point operations and argument passing are soft.
	CmdArgs.push_back("-msoft-float");
	CmdArgs.push_back("-mfloat-abi");
	CmdArgs.push_back("soft");
	} else {
	// Floating point operations and argument passing are hard.
	assert(FloatABI == sparc::FloatABI::Hard && "Invalid float abi!");
	CmdArgs.push_back("-mfloat-abi");
	CmdArgs.push_back("hard");
	}
	}

	/// Appends the SystemZ-specific cc1 arguments (-mbackchain, -mpacked-stack and
	/// the soft-float triple) and rejects the combination of packed stack and
	/// backchain with hard float.
	void Clang::AddSystemZTargetArgs(const ArgList &Args,
	                                 ArgStringList &CmdArgs) const {
	  const Driver &D = getToolChain().getDriver();

	  const bool Backchain = Args.hasFlag(options::OPT_mbackchain,
	                                      options::OPT_mno_backchain, false);
	  const bool PackedStack = Args.hasFlag(options::OPT_mpacked_stack,
	                                        options::OPT_mno_packed_stack, false);
	  const bool SoftFloat =
	      systemz::getSystemZFloatABI(D, Args) == systemz::FloatABI::Soft;

	  // Packed stack plus backchain is only accepted together with soft float.
	  if (Backchain && PackedStack && !SoftFloat)
	    D.Diag(diag::err_drv_unsupported_opt)
	        << "-mpacked-stack -mbackchain -mhard-float";

	  if (Backchain)
	    CmdArgs.push_back("-mbackchain");
	  if (PackedStack)
	    CmdArgs.push_back("-mpacked-stack");

	  if (!SoftFloat)
	    return;

	  // Floating point operations and argument passing are soft.
	  CmdArgs.push_back("-msoft-float");
	  CmdArgs.push_back("-mfloat-abi");
	  CmdArgs.push_back("soft");
	}

	void Clang::AddX86TargetArgs(const ArgList &Args,
	ArgStringList &CmdArgs) const {
	const Driver &D = getToolChain().getDriver();
	addX86AlignBranchArgs(D, Args, CmdArgs, /IsLTO=/false);

	if (!Args.hasFlag(options::OPT_mred_zone, options::OPT_mno_red_zone, true) \|\|
	Args.hasArg(options::OPT_mkernel) \|\|
	Args.hasArg(options::OPT_fapple_kext))
	CmdArgs.push_back("-disable-red-zone");

	if (!Args.hasFlag(options::OPT_mtls_direct_seg_refs,
	options::OPT_mno_tls_direct_seg_refs, true))
	CmdArgs.push_back("-mno-tls-direct-seg-refs");

	// Default to avoid implicit floating-point for kernel/kext code, but allow
	// that to be overridden with -mno-soft-float.
	bool NoImplicitFloat = (Args.hasArg(options::OPT_mkernel) \|\|
	Args.hasArg(options::OPT_fapple_kext));
	if (Arg *A = Args.getLastArg(
	options::OPT_msoft_float, options::OPT_mno_soft_float,
	options::OPT_mimplicit_float, options::OPT_mno_implicit_float)) {
	const Option &O = A->getOption();
	NoImplicitFloat = (O.matches(options::OPT_mno_implicit_float) \|\|
	O.matches(options::OPT_msoft_float));
	}
	if (NoImplicitFloat)
	CmdArgs.push_back("-no-implicit-float");

	if (Arg *A = Args.getLastArg(options::OPT_masm_EQ)) {
	StringRef Value = A->getValue();
	if (Value == "intel" \|\| Value == "att") {
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back(Args.MakeArgString("-x86-asm-syntax=" + Value));
	} else {
	D.Diag(diag::err_drv_unsupported_option_argument)
	<< A->getOption().getName() << Value;
	}
	} else if (D.IsCLMode()) {
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back("-x86-asm-syntax=intel");
	}

	// Set flags to support MCU ABI.
	if (Args.hasFlag(options::OPT_miamcu, options::OPT_mno_iamcu, false)) {
	CmdArgs.push_back("-mfloat-abi");
	CmdArgs.push_back("soft");
	CmdArgs.push_back("-mstack-alignment=4");
	}
	}

	/// Appends the Hexagon-specific cc1 arguments. "-mqdsp6-compat",
	/// "-Wreturn-type" and "-mllvm -machine-sink-split=0" are always added; the
	/// small-data threshold, -fshort-enums and IEEE round-to-nearest depend on
	/// \p Args.
	void Clang::AddHexagonTargetArgs(const ArgList &Args,
	                                 ArgStringList &CmdArgs) const {
	  CmdArgs.push_back("-mqdsp6-compat");
	  CmdArgs.push_back("-Wreturn-type");

	  // Forward the small data threshold when the toolchain reports one.
	  auto Threshold = toolchains::HexagonToolChain::getSmallDataThreshold(Args);
	  if (Threshold) {
	    CmdArgs.push_back("-mllvm");
	    CmdArgs.push_back(Args.MakeArgString("-hexagon-small-data-threshold=" +
	                                         Twine(Threshold.getValue())));
	  }

	  // Short enums are on unless explicitly turned off.
	  const bool ShortEnumsDisabled = Args.hasArg(options::OPT_fno_short_enums);
	  if (!ShortEnumsDisabled)
	    CmdArgs.push_back("-fshort-enums");

	  if (Args.getLastArg(options::OPT_mieee_rnd_near) != nullptr) {
	    CmdArgs.push_back("-mllvm");
	    CmdArgs.push_back("-enable-hexagon-ieee-rnd-near");
	  }

	  CmdArgs.push_back("-mllvm");
	  CmdArgs.push_back("-machine-sink-split=0");
	}

	/// Appends the Lanai-specific cc1 arguments: "-target-cpu <cpu>" when -mcpu=
	/// is given, and validates -mregparm= (only the value 4 is accepted).
	void Clang::AddLanaiTargetArgs(const ArgList &Args,
	                               ArgStringList &CmdArgs) const {
	  if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
	    StringRef CPUName = A->getValue();

	    CmdArgs.push_back("-target-cpu");
	    CmdArgs.push_back(Args.MakeArgString(CPUName));
	  }
	  if (Arg *A = Args.getLastArg(options::OPT_mregparm_EQ)) {
	    StringRef Value = A->getValue();
	    // Only support mregparm=4 to support old usage. Report error for all other
	    // cases.
	    //
	    // StringRef::getAsInteger returns true on *failure*, so the value may
	    // only be inspected when it returned false. Both a value that does not
	    // parse and a parsed value other than 4 are rejected.
	    int Mregparm = 0;
	    if (Value.getAsInteger(10, Mregparm) || Mregparm != 4) {
	      getToolChain().getDriver().Diag(
	          diag::err_drv_unsupported_option_argument)
	          << A->getOption().getName() << Value;
	    }
	  }
	}

	/// Appends "-fvisibility hidden" for WebAssembly unless the user selected a
	/// visibility with -fvisibility= or -fvisibility-ms-compat.
	void Clang::AddWebAssemblyTargetArgs(const ArgList &Args,
	                                     ArgStringList &CmdArgs) const {
	  // An explicit visibility choice wins over the default.
	  if (Args.hasArg(options::OPT_fvisibility_EQ,
	                  options::OPT_fvisibility_ms_compat))
	    return;

	  // Otherwise default to "hidden" visibility.
	  CmdArgs.push_back("-fvisibility");
	  CmdArgs.push_back("hidden");
	}

	void Clang::AddVETargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const {
	// Floating point operations and argument passing are hard.
	CmdArgs.push_back("-mfloat-abi");
	CmdArgs.push_back("hard");
	}

	/// Writes one compilation-database entry for this job to the
	/// CompilationDatabase stream, opening \p Filename first if the stream does
	/// not exist yet.
	///
	/// The entry is a JSON object with "directory", "file", "output" and
	/// "arguments" keys, followed by ",\n". Nothing is written for a dry run
	/// (-###). A failure to open the file is reported through
	/// err_drv_compilationdatabase and no entry is written.
	void Clang::DumpCompilationDatabase(Compilation &C, StringRef Filename,
	StringRef Target, const InputInfo &Output,
	const InputInfo &Input, const ArgList &Args) const {
	// If this is a dry run, do not create the compilation database file.
	if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH))
	return;

	using llvm::yaml::escape;
	const Driver &D = getToolChain().getDriver();

	// Open the output stream on first use; later calls reuse it.
	if (!CompilationDatabase) {
	std::error_code EC;
	auto File = std::make_unique<llvm::raw_fd_ostream>(Filename, EC,
	llvm::sys::fs::OF_Text);
	if (EC) {
	D.Diag(clang::diag::err_drv_compilationdatabase) << Filename
	<< EC.message();
	return;
	}
	CompilationDatabase = std::move(File);
	}
	auto &CDB = *CompilationDatabase;
	// Fall back to "." when the VFS cannot report a working directory.
	auto CWD = D.getVFS().getCurrentWorkingDirectory();
	if (!CWD)
	CWD = ".";
	CDB << "{ \"directory\": \"" << escape(*CWD) << "\"";
	CDB << ", \"file\": \"" << escape(Input.getFilename()) << "\"";
	CDB << ", \"output\": \"" << escape(Output.getFilename()) << "\"";
	// The argument list starts with the clang executable and an explicit
	// "-x<type>" for the input.
	CDB << ", \"arguments\": [\"" << escape(D.ClangExecutable) << "\"";
	SmallString<128> Buf;
	Buf = "-x";
	Buf += types::getTypeName(Input.getType());
	CDB << ", \"" << escape(Buf) << "\"";
	// Spell out the driver's sysroot unless the user already passed --sysroot=.
	if (!D.SysRoot.empty() && !Args.hasArg(options::OPT__sysroot_EQ)) {
	Buf = "--sysroot=";
	Buf += D.SysRoot;
	CDB << ", \"" << escape(Buf) << "\"";
	}
	CDB << ", \"" << escape(Input.getFilename()) << "\"";
	for (auto &A: Args) {
	auto &O = A->getOption();
	// Skip language selection, which is positional.
	if (O.getID() == options::OPT_x)
	continue;
	// Skip writing dependency output and the compilation database itself.
	if (O.getGroup().isValid() && O.getGroup().getID() == options::OPT_M_Group)
	continue;
	if (O.getID() == options::OPT_gen_cdb_fragment_path)
	continue;
	// Skip inputs.
	if (O.getKind() == Option::InputClass)
	continue;
	// All other arguments are quoted and appended.
	ArgStringList ASL;
	A->render(Args, ASL);
	for (auto &it: ASL)
	CDB << ", \"" << escape(it) << "\"";
	}
	// The target is always appended last and closes the entry.
	Buf = "--target=";
	Buf += Target;
	CDB << ", \"" << escape(Buf) << "\"]},\n";
	}

	/// Writes a compilation-database fragment for this job into a newly created,
	/// uniquely named "<input filename>.%%%%.json" file inside \p Dir.
	///
	/// \p Dir is made absolute and created if missing. Nothing is written for a
	/// dry run (-###). Failures to create the directory or the file are reported
	/// through err_drv_compilationdatabase.
	void Clang::DumpCompilationDatabaseFragmentToDir(
	    StringRef Dir, Compilation &C, StringRef Target, const InputInfo &Output,
	    const InputInfo &Input, const llvm::opt::ArgList &Args) const {
	  // If this is a dry run, do not create the compilation database file.
	  if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH))
	    return;

	  // NOTE(review): when a stream is already open the entry is written to it
	  // and then written again to the new fragment file below, which also
	  // replaces the stream. A `return` may be missing here -- confirm intent.
	  if (CompilationDatabase)
	    DumpCompilationDatabase(C, "", Target, Output, Input, Args);

	  SmallString<256> Path = Dir;
	  const auto &Driver = C.getDriver();
	  Driver.getVFS().makeAbsolute(Path);
	  auto Err = llvm::sys::fs::create_directory(Path, /*IgnoreExisting=*/true);
	  if (Err) {
	    Driver.Diag(diag::err_drv_compilationdatabase) << Dir << Err.message();
	    return;
	  }

	  // Build the file-name model that createUniqueFile turns into a real path.
	  llvm::sys::path::append(
	      Path,
	      Twine(llvm::sys::path::filename(Input.getFilename())) + ".%%%%.json");
	  int FD;
	  SmallString<256> TempPath;
	  Err = llvm::sys::fs::createUniqueFile(Path, FD, TempPath);
	  if (Err) {
	    Driver.Diag(diag::err_drv_compilationdatabase) << Path << Err.message();
	    return;
	  }
	  // The stream takes ownership of the descriptor; the empty file name is
	  // unused because the stream is already open when the entry is dumped.
	  CompilationDatabase =
	      std::make_unique<llvm::raw_fd_ostream>(FD, /*shouldClose=*/true);
	  DumpCompilationDatabase(C, "", Target, Output, Input, Args);
	}

	/// Translates assembler-related driver options into arguments for the
	/// integrated assembler and appends them to \p CmdArgs.
	///
	/// Handles -mrelax-all, -mincremental-linker-compatible, -mimplicit-it= (ARM
	/// and Thumb targets only) and every value passed through -Wa, or
	/// -Xassembler. Values that are not recognized are reported through \p D as
	/// err_drv_unsupported_option_argument.
	///
	/// \param C       Compilation whose default toolchain supplies the target
	///                architecture, triple and assembler defaults.
	/// \param Args    Driver arguments to inspect; every -Wa, / -Xassembler
	///                argument is claimed.
	/// \param CmdArgs Output argument list that receives the translated flags.
	/// \param D       Driver used to emit diagnostics.
	static void CollectArgsForIntegratedAssembler(Compilation &C,
	                                              const ArgList &Args,
	                                              ArgStringList &CmdArgs,
	                                              const Driver &D) {
	  if (UseRelaxAll(C, Args))
	    CmdArgs.push_back("-mrelax-all");

	  // Only default to -mincremental-linker-compatible if we think we are
	  // targeting the MSVC linker.
	  bool DefaultIncrementalLinkerCompatible =
	      C.getDefaultToolChain().getTriple().isWindowsMSVCEnvironment();
	  if (Args.hasFlag(options::OPT_mincremental_linker_compatible,
	                   options::OPT_mno_incremental_linker_compatible,
	                   DefaultIncrementalLinkerCompatible))
	    CmdArgs.push_back("-mincremental-linker-compatible");

	  switch (C.getDefaultToolChain().getArch()) {
	  case llvm::Triple::arm:
	  case llvm::Triple::armeb:
	  case llvm::Triple::thumb:
	  case llvm::Triple::thumbeb:
	    if (Arg *A = Args.getLastArg(options::OPT_mimplicit_it_EQ)) {
	      StringRef Value = A->getValue();
	      if (Value == "always" || Value == "never" || Value == "arm" ||
	          Value == "thumb") {
	        CmdArgs.push_back("-mllvm");
	        CmdArgs.push_back(Args.MakeArgString("-arm-implicit-it=" + Value));
	      } else {
	        D.Diag(diag::err_drv_unsupported_option_argument)
	            << A->getOption().getName() << Value;
	      }
	    }
	    break;
	  default:
	    break;
	  }

	  // If you add more args here, also add them to the block below that
	  // starts with "// If CollectArgsForIntegratedAssembler() isn't called below".

	  // When passing -I arguments to the assembler we sometimes need to
	  // unconditionally take the next argument.  For example, when parsing
	  // '-Wa,-I -Wa,foo' we need to accept the -Wa,foo arg after seeing the
	  // -Wa,-I arg and when parsing '-Wa,-I,foo' we need to accept the 'foo'
	  // arg after parsing the '-I' arg.
	  bool TakeNextArg = false;

	  // These start from the toolchain defaults and may be overridden by -Wa,
	  // values; the resulting flags are emitted once, after the loop.
	  bool UseRelaxRelocations = C.getDefaultToolChain().useRelaxRelocations();
	  bool UseNoExecStack = C.getDefaultToolChain().isNoExecStackDefault();
	  // Last MIPS ISA level requested through -Wa,-mipsN (nullptr if none).
	  const char *MipsTargetFeature = nullptr;
	  for (const Arg *A :
	       Args.filtered(options::OPT_Wa_COMMA, options::OPT_Xassembler)) {
	    A->claim();

	    for (StringRef Value : A->getValues()) {
	      if (TakeNextArg) {
	        CmdArgs.push_back(Value.data());
	        TakeNextArg = false;
	        continue;
	      }

	      if (C.getDefaultToolChain().getTriple().isOSBinFormatCOFF() &&
	          Value == "-mbig-obj")
	        continue; // LLVM handles bigobj automatically

	      // Target-specific values. A 'continue' means the value was fully
	      // handled here; a 'break' falls through to the generic handling below.
	      switch (C.getDefaultToolChain().getArch()) {
	      default:
	        break;
	      case llvm::Triple::thumb:
	      case llvm::Triple::thumbeb:
	      case llvm::Triple::arm:
	      case llvm::Triple::armeb:
	        if (Value == "-mthumb")
	          // -mthumb has already been processed in ComputeLLVMTriple()
	          // recognize but skip over here.
	          continue;
	        break;
	      case llvm::Triple::mips:
	      case llvm::Triple::mipsel:
	      case llvm::Triple::mips64:
	      case llvm::Triple::mips64el:
	        if (Value == "--trap") {
	          CmdArgs.push_back("-target-feature");
	          CmdArgs.push_back("+use-tcc-in-div");
	          continue;
	        }
	        if (Value == "--break") {
	          CmdArgs.push_back("-target-feature");
	          CmdArgs.push_back("-use-tcc-in-div");
	          continue;
	        }
	        if (Value.startswith("-msoft-float")) {
	          CmdArgs.push_back("-target-feature");
	          CmdArgs.push_back("+soft-float");
	          continue;
	        }
	        if (Value.startswith("-mhard-float")) {
	          CmdArgs.push_back("-target-feature");
	          CmdArgs.push_back("-soft-float");
	          continue;
	        }

	        // Look the value up in a local first: assigning the lookup result
	        // straight to MipsTargetFeature would reset an ISA level chosen by
	        // an earlier value (e.g. '-Wa,-mips32r2,--no-warn') back to nullptr.
	        if (const char *Feature = llvm::StringSwitch<const char *>(Value)
	                                      .Case("-mips1", "+mips1")
	                                      .Case("-mips2", "+mips2")
	                                      .Case("-mips3", "+mips3")
	                                      .Case("-mips4", "+mips4")
	                                      .Case("-mips5", "+mips5")
	                                      .Case("-mips32", "+mips32")
	                                      .Case("-mips32r2", "+mips32r2")
	                                      .Case("-mips32r3", "+mips32r3")
	                                      .Case("-mips32r5", "+mips32r5")
	                                      .Case("-mips32r6", "+mips32r6")
	                                      .Case("-mips64", "+mips64")
	                                      .Case("-mips64r2", "+mips64r2")
	                                      .Case("-mips64r3", "+mips64r3")
	                                      .Case("-mips64r5", "+mips64r5")
	                                      .Case("-mips64r6", "+mips64r6")
	                                      .Default(nullptr)) {
	          MipsTargetFeature = Feature;
	          continue;
	        }
	        break;
	      }

	      if (Value == "-force_cpusubtype_ALL") {
	        // Do nothing, this is the default and we don't support anything else.
	      } else if (Value == "-L") {
	        CmdArgs.push_back("-msave-temp-labels");
	      } else if (Value == "--fatal-warnings") {
	        CmdArgs.push_back("-massembler-fatal-warnings");
	      } else if (Value == "--no-warn" || Value == "-W") {
	        CmdArgs.push_back("-massembler-no-warn");
	      } else if (Value == "--noexecstack") {
	        UseNoExecStack = true;
	      } else if (Value.startswith("-compress-debug-sections") ||
	                 Value.startswith("--compress-debug-sections") ||
	                 Value == "-nocompress-debug-sections" ||
	                 Value == "--nocompress-debug-sections") {
	        CmdArgs.push_back(Value.data());
	      } else if (Value == "-mrelax-relocations=yes" ||
	                 Value == "--mrelax-relocations=yes") {
	        UseRelaxRelocations = true;
	      } else if (Value == "-mrelax-relocations=no" ||
	                 Value == "--mrelax-relocations=no") {
	        UseRelaxRelocations = false;
	      } else if (Value.startswith("-I")) {
	        CmdArgs.push_back(Value.data());
	        // We need to consume the next argument if the current arg is a plain
	        // -I. The next arg will be the include directory.
	        if (Value == "-I")
	          TakeNextArg = true;
	      } else if (Value.startswith("-gdwarf-")) {
	        // "-gdwarf-N" options are not cc1as options.
	        unsigned DwarfVersion = DwarfVersionNum(Value);
	        if (DwarfVersion == 0) { // Send it onward, and let cc1as complain.
	          CmdArgs.push_back(Value.data());
	        } else {
	          RenderDebugEnablingArgs(Args, CmdArgs,
	                                  codegenoptions::DebugInfoConstructor,
	                                  DwarfVersion, llvm::DebuggerKind::Default);
	        }
	      } else if (Value.startswith("-mcpu") || Value.startswith("-mfpu") ||
	                 Value.startswith("-mhwdiv") || Value.startswith("-march")) {
	        // Do nothing, we'll validate it later.
	      } else if (Value == "-defsym") {
	        // -defsym must come as exactly '-defsym,SYM=VALUE' with a non-empty
	        // symbol and an integer value. Each 'break' below abandons the
	        // remaining values of this argument after diagnosing.
	        if (A->getNumValues() != 2) {
	          D.Diag(diag::err_drv_defsym_invalid_format) << Value;
	          break;
	        }
	        const char *S = A->getValue(1);
	        auto Pair = StringRef(S).split('=');
	        auto Sym = Pair.first;
	        auto SVal = Pair.second;

	        if (Sym.empty() || SVal.empty()) {
	          D.Diag(diag::err_drv_defsym_invalid_format) << S;
	          break;
	        }
	        int64_t IVal;
	        if (SVal.getAsInteger(0, IVal)) {
	          D.Diag(diag::err_drv_defsym_invalid_symval) << SVal;
	          break;
	        }
	        CmdArgs.push_back(Value.data());
	        TakeNextArg = true;
	      } else if (Value == "-fdebug-compilation-dir") {
	        CmdArgs.push_back("-fdebug-compilation-dir");
	        TakeNextArg = true;
	      } else if (Value.consume_front("-fdebug-compilation-dir=")) {
	        // The flag is a -Wa / -Xassembler argument and Options doesn't
	        // parse the argument, so this isn't automatically aliased to
	        // -fdebug-compilation-dir (without '=') here.
	        CmdArgs.push_back("-fdebug-compilation-dir");
	        CmdArgs.push_back(Value.data());
	      } else {
	        D.Diag(diag::err_drv_unsupported_option_argument)
	            << A->getOption().getName() << Value;
	      }
	    }
	  }
	  if (UseRelaxRelocations)
	    CmdArgs.push_back("--mrelax-relocations");
	  if (UseNoExecStack)
	    CmdArgs.push_back("-mnoexecstack");
	  if (MipsTargetFeature != nullptr) {
	    CmdArgs.push_back("-target-feature");
	    CmdArgs.push_back(MipsTargetFeature);
	  }

	  // Forward -fembed-bitcode to the assembler.
	  if (C.getDriver().embedBitcodeEnabled() ||
	      C.getDriver().embedBitcodeMarkerOnly())
	    Args.AddLastArg(CmdArgs, options::OPT_fembed_bitcode_EQ);
	}

	/// Computes the floating-point related cc1 arguments from the driver
	/// arguments and appends them to \p CmdArgs.
	///
	/// The driver arguments are visited in command-line order while a set of
	/// local state variables is updated; the cc1 flags are emitted only after the
	/// whole argument list has been processed, based on the final state.
	///
	/// \param TC           Toolchain supplying the math-errno and denormal-mode
	///                     defaults.
	/// \param D            Driver used to emit diagnostics.
	/// \param OFastEnabled Whether -Ofast is the effective optimization level.
	/// \param Args         Driver arguments; handled FP options are claimed.
	/// \param CmdArgs      Output argument list.
	/// \param JA           Job action forwarded to the denormal-mode default
	///                     query.
	static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
	bool OFastEnabled, const ArgList &Args,
	ArgStringList &CmdArgs,
	const JobAction &JA) {
	// Handle various floating point optimization flags, mapping them to the
	// appropriate LLVM code generation flags. This is complicated by several
	// "umbrella" flags, so we do this by stepping through the flags incrementally
	// adjusting what we think is enabled/disabled, then at the end setting the
	// LLVM flags based on the final state.
	bool HonorINFs = true;
	bool HonorNaNs = true;
	// -fmath-errno is the default on some platforms, e.g. BSD-derived OSes.
	bool MathErrno = TC.IsMathErrnoDefault();
	bool AssociativeMath = false;
	bool ReciprocalMath = false;
	bool SignedZeros = true;
	bool TrappingMath = false; // Implemented via -ffp-exception-behavior
	bool TrappingMathPresent = false; // Is trapping-math in args, and not
	// overridden by ffp-exception-behavior?
	bool RoundingFPMath = false;
	bool RoundingMathPresent = false; // Is rounding-math in args?
	// -ffp-model values: strict, fast, precise
	StringRef FPModel = "";
	// -ffp-exception-behavior options: strict, maytrap, ignore
	StringRef FPExceptionBehavior = "";
	const llvm::DenormalMode DefaultDenormalFPMath =
	TC.getDefaultDenormalModeForType(Args, JA);
	const llvm::DenormalMode DefaultDenormalFP32Math =
	TC.getDefaultDenormalModeForType(Args, JA, &llvm::APFloat::IEEEsingle());

	llvm::DenormalMode DenormalFPMath = DefaultDenormalFPMath;
	llvm::DenormalMode DenormalFP32Math = DefaultDenormalFP32Math;
	StringRef FPContract = "";
	// True while -ffp-model=strict is in effect and no later option has
	// conflicted with it.
	bool StrictFPModel = false;


	if (const Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) {
	CmdArgs.push_back("-mlimit-float-precision");
	CmdArgs.push_back(A->getValue());
	}

	for (const Arg *A : Args) {
	auto optID = A->getOption().getID();
	bool PreciseFPModel = false;
	// First pass over this argument: -ffp-model= resets the state to the
	// -fno-fast-math baseline and is then rewritten, via optID, into the
	// granular option handled by the second switch below.
	switch (optID) {
	default:
	break;
	case options::OPT_ffp_model_EQ: {
	// If -ffp-model= is seen, reset to fno-fast-math
	HonorINFs = true;
	HonorNaNs = true;
	// Turning off -ffast-math restores the toolchain default.
	MathErrno = TC.IsMathErrnoDefault();
	AssociativeMath = false;
	ReciprocalMath = false;
	SignedZeros = true;
	// -fno_fast_math restores default denormal and fpcontract handling
	FPContract = "";
	DenormalFPMath = llvm::DenormalMode::getIEEE();

	// FIXME: The target may have picked a non-IEEE default mode here based on
	// -cl-denorms-are-zero. Should the target consider -fp-model interaction?
	DenormalFP32Math = llvm::DenormalMode::getIEEE();

	StringRef Val = A->getValue();
	if (OFastEnabled && !Val.equals("fast")) {
	// Only -ffp-model=fast is compatible with OFast, ignore.
	D.Diag(clang::diag::warn_drv_overriding_flag_option)
	<< Args.MakeArgString("-ffp-model=" + Val)
	<< "-Ofast";
	break;
	}
	StrictFPModel = false;
	PreciseFPModel = true;
	// ffp-model= is a Driver option, it is entirely rewritten into more
	// granular options before being passed into cc1.
	// Use the gcc option in the switch below.
	if (!FPModel.empty() && !FPModel.equals(Val)) {
	D.Diag(clang::diag::warn_drv_overriding_flag_option)
	<< Args.MakeArgString("-ffp-model=" + FPModel)
	<< Args.MakeArgString("-ffp-model=" + Val);
	FPContract = "";
	}
	if (Val.equals("fast")) {
	optID = options::OPT_ffast_math;
	FPModel = Val;
	FPContract = "fast";
	} else if (Val.equals("precise")) {
	optID = options::OPT_ffp_contract;
	FPModel = Val;
	FPContract = "fast";
	PreciseFPModel = true;
	} else if (Val.equals("strict")) {
	StrictFPModel = true;
	optID = options::OPT_frounding_math;
	FPExceptionBehavior = "strict";
	FPModel = Val;
	FPContract = "off";
	TrappingMath = true;
	} else
	D.Diag(diag::err_drv_unsupported_option_argument)
	<< A->getOption().getName() << Val;
	break;
	}
	}

	// Second pass over this argument: apply the (possibly rewritten) option
	// to the tracked state.
	switch (optID) {
	// If this isn't an FP option skip the claim below
	default: continue;

	// Options controlling individual features
	case options::OPT_fhonor_infinities: HonorINFs = true; break;
	case options::OPT_fno_honor_infinities: HonorINFs = false; break;
	case options::OPT_fhonor_nans: HonorNaNs = true; break;
	case options::OPT_fno_honor_nans: HonorNaNs = false; break;
	case options::OPT_fmath_errno: MathErrno = true; break;
	case options::OPT_fno_math_errno: MathErrno = false; break;
	case options::OPT_fassociative_math: AssociativeMath = true; break;
	case options::OPT_fno_associative_math: AssociativeMath = false; break;
	case options::OPT_freciprocal_math: ReciprocalMath = true; break;
	case options::OPT_fno_reciprocal_math: ReciprocalMath = false; break;
	case options::OPT_fsigned_zeros: SignedZeros = true; break;
	case options::OPT_fno_signed_zeros: SignedZeros = false; break;
	case options::OPT_ftrapping_math:
	if (!TrappingMathPresent && !FPExceptionBehavior.empty() &&
	!FPExceptionBehavior.equals("strict"))
	// Warn that previous value of option is overridden.
	D.Diag(clang::diag::warn_drv_overriding_flag_option)
	<< Args.MakeArgString("-ffp-exception-behavior=" + FPExceptionBehavior)
	<< "-ftrapping-math";
	TrappingMath = true;
	TrappingMathPresent = true;
	FPExceptionBehavior = "strict";
	break;
	case options::OPT_fno_trapping_math:
	if (!TrappingMathPresent && !FPExceptionBehavior.empty() &&
	!FPExceptionBehavior.equals("ignore"))
	// Warn that previous value of option is overridden.
	D.Diag(clang::diag::warn_drv_overriding_flag_option)
	<< Args.MakeArgString("-ffp-exception-behavior=" + FPExceptionBehavior)
	<< "-fno-trapping-math";
	TrappingMath = false;
	TrappingMathPresent = true;
	FPExceptionBehavior = "ignore";
	break;

	case options::OPT_frounding_math:
	RoundingFPMath = true;
	RoundingMathPresent = true;
	break;

	case options::OPT_fno_rounding_math:
	RoundingFPMath = false;
	RoundingMathPresent = false;
	break;

	case options::OPT_fdenormal_fp_math_EQ:
	DenormalFPMath = llvm::parseDenormalFPAttribute(A->getValue());
	if (!DenormalFPMath.isValid()) {
	D.Diag(diag::err_drv_invalid_value)
	<< A->getAsString(Args) << A->getValue();
	}
	break;

	case options::OPT_fdenormal_fp_math_f32_EQ:
	DenormalFP32Math = llvm::parseDenormalFPAttribute(A->getValue());
	if (!DenormalFP32Math.isValid()) {
	D.Diag(diag::err_drv_invalid_value)
	<< A->getAsString(Args) << A->getValue();
	}
	break;

	// Validate and pass through -ffp-contract option.
	case options::OPT_ffp_contract: {
	StringRef Val = A->getValue();
	if (PreciseFPModel) {
	// -ffp-model=precise enables ffp-contract=fast as a side effect
	// the FPContract value has already been set to a string literal
	// and the Val string isn't a pertinent value.
	;
	} else if (Val.equals("fast") \|\| Val.equals("on") \|\| Val.equals("off"))
	FPContract = Val;
	else
	D.Diag(diag::err_drv_unsupported_option_argument)
	<< A->getOption().getName() << Val;
	break;
	}

	// Validate and pass through -ffp-model option.
	case options::OPT_ffp_model_EQ:
	// This should only occur in the error case
	// since the optID has been replaced by a more granular
	// floating point option.
	break;

	// Validate and pass through -ffp-exception-behavior option.
	case options::OPT_ffp_exception_behavior_EQ: {
	StringRef Val = A->getValue();
	if (!TrappingMathPresent && !FPExceptionBehavior.empty() &&
	!FPExceptionBehavior.equals(Val))
	// Warn that previous value of option is overridden.
	D.Diag(clang::diag::warn_drv_overriding_flag_option)
	<< Args.MakeArgString("-ffp-exception-behavior=" + FPExceptionBehavior)
	<< Args.MakeArgString("-ffp-exception-behavior=" + Val);
	TrappingMath = TrappingMathPresent = false;
	if (Val.equals("ignore") \|\| Val.equals("maytrap"))
	FPExceptionBehavior = Val;
	else if (Val.equals("strict")) {
	FPExceptionBehavior = Val;
	TrappingMath = TrappingMathPresent = true;
	} else
	D.Diag(diag::err_drv_unsupported_option_argument)
	<< A->getOption().getName() << Val;
	break;
	}

	case options::OPT_ffinite_math_only:
	HonorINFs = false;
	HonorNaNs = false;
	break;
	case options::OPT_fno_finite_math_only:
	HonorINFs = true;
	HonorNaNs = true;
	break;

	case options::OPT_funsafe_math_optimizations:
	AssociativeMath = true;
	ReciprocalMath = true;
	SignedZeros = false;
	TrappingMath = false;
	FPExceptionBehavior = "";
	break;
	case options::OPT_fno_unsafe_math_optimizations:
	AssociativeMath = false;
	ReciprocalMath = false;
	SignedZeros = true;
	TrappingMath = true;
	FPExceptionBehavior = "strict";

	// The target may have opted to flush by default, so force IEEE.
	DenormalFPMath = llvm::DenormalMode::getIEEE();
	DenormalFP32Math = llvm::DenormalMode::getIEEE();
	break;

	case options::OPT_Ofast:
	// If -Ofast is the optimization level, then -ffast-math should be enabled
	if (!OFastEnabled)
	continue;
	LLVM_FALLTHROUGH;
	case options::OPT_ffast_math:
	HonorINFs = false;
	HonorNaNs = false;
	MathErrno = false;
	AssociativeMath = true;
	ReciprocalMath = true;
	SignedZeros = false;
	TrappingMath = false;
	RoundingFPMath = false;
	// If fast-math is set then set the fp-contract mode to fast.
	FPContract = "fast";
	break;
	case options::OPT_fno_fast_math:
	HonorINFs = true;
	HonorNaNs = true;
	// Turning on -ffast-math (with either flag) removes the need for
	// MathErrno. However, turning off -ffast-math merely restores the
	// toolchain default (which may be false).
	MathErrno = TC.IsMathErrnoDefault();
	AssociativeMath = false;
	ReciprocalMath = false;
	SignedZeros = true;
	TrappingMath = false;
	RoundingFPMath = false;
	// -fno_fast_math restores default denormal and fpcontract handling
	DenormalFPMath = DefaultDenormalFPMath;
	// NOTE(review): the f32 mode is reset to IEEE here rather than to
	// DefaultDenormalFP32Math, unlike the line above - confirm this is
	// intended.
	DenormalFP32Math = llvm::DenormalMode::getIEEE();
	FPContract = "";
	break;
	}
	if (StrictFPModel) {
	// If -ffp-model=strict has been specified on command line but
	// subsequent options conflict then emit warning diagnostic.
	if (HonorINFs && HonorNaNs &&
	!AssociativeMath && !ReciprocalMath &&
	SignedZeros && TrappingMath && RoundingFPMath &&
	(FPContract.equals("off") \|\| FPContract.empty()) &&
	DenormalFPMath == llvm::DenormalMode::getIEEE() &&
	DenormalFP32Math == llvm::DenormalMode::getIEEE())
	// OK: Current Arg doesn't conflict with -ffp-model=strict
	;
	else {
	StrictFPModel = false;
	FPModel = "";
	D.Diag(clang::diag::warn_drv_overriding_flag_option)
	<< "-ffp-model=strict" <<
	((A->getNumValues() == 0) ? A->getSpelling()
	: Args.MakeArgString(A->getSpelling() + A->getValue()));
	}
	}

	// If we handled this option claim it
	A->claim();
	}

	// Emit the cc1 flags that correspond to the final state.
	if (!HonorINFs)
	CmdArgs.push_back("-menable-no-infs");

	if (!HonorNaNs)
	CmdArgs.push_back("-menable-no-nans");

	if (MathErrno)
	CmdArgs.push_back("-fmath-errno");

	if (!MathErrno && AssociativeMath && ReciprocalMath && !SignedZeros &&
	!TrappingMath)
	CmdArgs.push_back("-menable-unsafe-fp-math");

	if (!SignedZeros)
	CmdArgs.push_back("-fno-signed-zeros");

	if (AssociativeMath && !SignedZeros && !TrappingMath)
	CmdArgs.push_back("-mreassociate");

	if (ReciprocalMath)
	CmdArgs.push_back("-freciprocal-math");

	if (TrappingMath) {
	// FP Exception Behavior is also set to strict
	assert(FPExceptionBehavior.equals("strict"));
	CmdArgs.push_back("-ftrapping-math");
	} else if (TrappingMathPresent)
	CmdArgs.push_back("-fno-trapping-math");

	// The default is IEEE.
	if (DenormalFPMath != llvm::DenormalMode::getIEEE()) {
	llvm::SmallString<64> DenormFlag;
	llvm::raw_svector_ostream ArgStr(DenormFlag);
	ArgStr << "-fdenormal-fp-math=" << DenormalFPMath;
	CmdArgs.push_back(Args.MakeArgString(ArgStr.str()));
	}

	// Add f32 specific denormal mode flag if it's different.
	if (DenormalFP32Math != DenormalFPMath) {
	llvm::SmallString<64> DenormFlag;
	llvm::raw_svector_ostream ArgStr(DenormFlag);
	ArgStr << "-fdenormal-fp-math-f32=" << DenormalFP32Math;
	CmdArgs.push_back(Args.MakeArgString(ArgStr.str()));
	}

	if (!FPContract.empty())
	CmdArgs.push_back(Args.MakeArgString("-ffp-contract=" + FPContract));

	if (!RoundingFPMath)
	CmdArgs.push_back(Args.MakeArgString("-fno-rounding-math"));

	if (RoundingFPMath && RoundingMathPresent)
	CmdArgs.push_back(Args.MakeArgString("-frounding-math"));

	if (!FPExceptionBehavior.empty())
	CmdArgs.push_back(Args.MakeArgString("-ffp-exception-behavior=" +
	FPExceptionBehavior));

	ParseMRecip(D, Args, CmdArgs);

	// -ffast-math enables the __FAST_MATH__ preprocessor macro, but check for the
	// individual features enabled by -ffast-math instead of the option itself as
	// that's consistent with gcc's behaviour.
	if (!HonorINFs && !HonorNaNs && !MathErrno && AssociativeMath &&
	ReciprocalMath && !SignedZeros && !TrappingMath && !RoundingFPMath) {
	CmdArgs.push_back("-ffast-math");
	if (FPModel.equals("fast")) {
	if (FPContract.equals("fast"))
	// All set, do nothing.
	;
	else if (FPContract.empty())
	// Enable -ffp-contract=fast
	CmdArgs.push_back(Args.MakeArgString("-ffp-contract=fast"));
	else
	D.Diag(clang::diag::warn_drv_overriding_flag_option)
	<< "-ffp-model=fast"
	<< Args.MakeArgString("-ffp-contract=" + FPContract);
	}
	}

	// Handle __FINITE_MATH_ONLY__ similarly.
	if (!HonorINFs && !HonorNaNs)
	CmdArgs.push_back("-ffinite-math-only");

	if (const Arg *A = Args.getLastArg(options::OPT_mfpmath_EQ)) {
	CmdArgs.push_back("-mfpmath");
	CmdArgs.push_back(A->getValue());
	}

	// Disable a codegen optimization for floating-point casts.
	if (Args.hasFlag(options::OPT_fno_strict_float_cast_overflow,
	options::OPT_fstrict_float_cast_overflow, false))
	CmdArgs.push_back("-fno-strict-float-cast-overflow");
	}

	/// Appends the static analyzer arguments to \p CmdArgs: the store model, the
	/// default checker set (unless --analyzer-no-default-checks was given), the
	/// output format, warning suppression and any -Xanalyzer values.
	///
	/// \param Args    Driver arguments to consult.
	/// \param CmdArgs Output argument list.
	/// \param Triple  Target triple; selects the platform-specific checkers.
	/// \param Input   Input being analyzed; C++ inputs get the cplusplus checkers.
	static void RenderAnalyzerOptions(const ArgList &Args, ArgStringList &CmdArgs,
	                                  const llvm::Triple &Triple,
	                                  const InputInfo &Input) {
	  // Region store model is the default, and blocks are treated as analysis
	  // entry points.
	  CmdArgs.push_back("-analyzer-store=region");
	  CmdArgs.push_back("-analyzer-opt-analyze-nested-blocks");

	  // Default checker set.
	  if (!Args.hasArg(options::OPT__analyzer_no_default_checks)) {
	    const bool TargetsPS4 = Triple.isPS4CPU();

	    CmdArgs.push_back("-analyzer-checker=core");
	    CmdArgs.push_back("-analyzer-checker=apiModeling");

	    if (Triple.isWindowsMSVCEnvironment()) {
	      // Only the "unix" checkers that also work on Windows.
	      const char *const PortableUnixCheckers[] = {
	          "-analyzer-checker=unix.API",
	          "-analyzer-checker=unix.Malloc",
	          "-analyzer-checker=unix.MallocSizeof",
	          "-analyzer-checker=unix.MismatchedDeallocator",
	          "-analyzer-checker=unix.cstring.BadSizeArg",
	          "-analyzer-checker=unix.cstring.NullArg",
	      };
	      for (const char *Checker : PortableUnixCheckers)
	        CmdArgs.push_back(Checker);
	    } else {
	      CmdArgs.push_back("-analyzer-checker=unix");
	    }

	    // Some unix checkers are switched off again for PS4.
	    if (TargetsPS4) {
	      CmdArgs.push_back("-analyzer-disable-checker=unix.API");
	      CmdArgs.push_back("-analyzer-disable-checker=unix.Vfork");
	    }

	    // OS-specific checker packages.
	    if (Triple.isOSDarwin()) {
	      CmdArgs.push_back("-analyzer-checker=osx");
	      CmdArgs.push_back(
	          "-analyzer-checker=security.insecureAPI.decodeValueOfObjCType");
	    } else if (Triple.isOSFuchsia()) {
	      CmdArgs.push_back("-analyzer-checker=fuchsia");
	    }

	    CmdArgs.push_back("-analyzer-checker=deadcode");

	    if (types::isCXX(Input.getType()))
	      CmdArgs.push_back("-analyzer-checker=cplusplus");

	    if (!TargetsPS4) {
	      const char *const InsecureAPICheckers[] = {
	          "-analyzer-checker=security.insecureAPI.UncheckedReturn",
	          "-analyzer-checker=security.insecureAPI.getpw",
	          "-analyzer-checker=security.insecureAPI.gets",
	          "-analyzer-checker=security.insecureAPI.mktemp",
	          "-analyzer-checker=security.insecureAPI.mkstemp",
	          "-analyzer-checker=security.insecureAPI.vfork",
	      };
	      for (const char *Checker : InsecureAPICheckers)
	        CmdArgs.push_back(Checker);
	    }

	    // Nullability checks that are on by default.
	    CmdArgs.push_back("-analyzer-checker=nullability.NullPassedToNonnull");
	    CmdArgs.push_back("-analyzer-checker=nullability.NullReturnedFromNonnull");
	  }

	  // Output format: the user's choice if given, otherwise plist (kept for
	  // historical reasons).
	  const Arg *OutputFormat = Args.getLastArg(options::OPT__analyzer_output);
	  CmdArgs.push_back("-analyzer-output");
	  CmdArgs.push_back(OutputFormat ? OutputFormat->getValue() : "plist");

	  // With --analyze only static analyzer diagnostics and frontend errors
	  // should be shown, so regular compiler warnings are silenced.
	  CmdArgs.push_back("-w");

	  // Pass through everything given via -Xanalyzer.
	  Args.AddAllArgValues(CmdArgs, options::OPT_Xanalyzer);
	}

	static void RenderSSPOptions(const ToolChain &TC, const ArgList &Args,
	ArgStringList &CmdArgs, bool KernelOrKext) {
	const llvm::Triple &EffectiveTriple = TC.getEffectiveTriple();

	// NVPTX doesn't support stack protectors; from the compiler's perspective, it
	// doesn't even have a stack!
	if (EffectiveTriple.isNVPTX())
	return;

	// -stack-protector=0 is default.
	unsigned StackProtectorLevel = 0;
	unsigned DefaultStackProtectorLevel =
	TC.GetDefaultStackProtectorLevel(KernelOrKext);

	if (Arg *A = Args.getLastArg(options::OPT_fno_stack_protector,
	options::OPT_fstack_protector_all,
	options::OPT_fstack_protector_strong,
	options::OPT_fstack_protector)) {
	if (A->getOption().matches(options::OPT_fstack_protector))
	StackProtectorLevel =
	std::max<unsigned>(LangOptions::SSPOn, DefaultStackProtectorLevel);
	else if (A->getOption().matches(options::OPT_fstack_protector_strong))
	StackProtectorLevel = LangOptions::SSPStrong;
	else if (A->getOption().matches(options::OPT_fstack_protector_all))
	StackProtectorLevel = LangOptions::SSPReq;
	} else {
	StackProtectorLevel = DefaultStackProtectorLevel;
	}

	if (StackProtectorLevel) {
	CmdArgs.push_back("-stack-protector");
	CmdArgs.push_back(Args.MakeArgString(Twine(StackProtectorLevel)));
	}

	// --param ssp-buffer-size=
	for (const Arg *A : Args.filtered(options::OPT__param)) {
	StringRef Str(A->getValue());
	if (Str.startswith("ssp-buffer-size=")) {
	if (StackProtectorLevel) {
	CmdArgs.push_back("-stack-protector-buffer-size");
	// FIXME: Verify the argument is a valid integer.
	CmdArgs.push_back(Args.MakeArgString(Str.drop_front(16)));
	}
	A->claim();
	}
	}
	}

	/// Appends -fstack-clash-protection to \p CmdArgs when stack clash protection
	/// is requested and the effective target is eligible.
	///
	/// \param TC      Toolchain providing the effective triple.
	/// \param Args    Driver arguments to consult.
	/// \param CmdArgs Output argument list.
	static void RenderSCPOptions(const ToolChain &TC, const ArgList &Args,
	ArgStringList &CmdArgs) {
	const llvm::Triple &EffectiveTriple = TC.getEffectiveTriple();

	// Nothing is emitted for non-Linux targets.
	if (!EffectiveTriple.isOSLinux())
	return;

	// Nothing is emitted unless the target is x86, SystemZ or PPC64.
	if (!EffectiveTriple.isX86() && !EffectiveTriple.isSystemZ() &&
	!EffectiveTriple.isPPC64())
	return;

	// Off by default; the last of the positive/negative flag pair wins.
	if (Args.hasFlag(options::OPT_fstack_clash_protection,
	- options::OPT_fnostack_clash_protection, false))
	+ options::OPT_fno_stack_clash_protection, false))
	CmdArgs.push_back("-fstack-clash-protection");
	}

	/// Appends the -ftrivial-auto-var-init= and
	/// -ftrivial-auto-var-init-stop-after= arguments to \p CmdArgs.
	///
	/// The initialization kind comes from the last valid
	/// -ftrivial-auto-var-init= argument, falling back to the toolchain default.
	/// Invalid values and unmet option dependencies are diagnosed through \p D.
	///
	/// \param D       Driver used to emit diagnostics.
	/// \param TC      Toolchain providing the default initialization kind.
	/// \param Args    Driver arguments; handled options are claimed.
	/// \param CmdArgs Output argument list.
	static void RenderTrivialAutoVarInitOptions(const Driver &D,
	                                            const ToolChain &TC,
	                                            const ArgList &Args,
	                                            ArgStringList &CmdArgs) {
	  auto DefaultTrivialAutoVarInit = TC.GetDefaultTrivialAutoVarInit();
	  StringRef TrivialAutoVarInit = "";

	  // Visit every occurrence so that each one is claimed and validated; the
	  // last valid value wins.
	  for (const Arg *A : Args) {
	    switch (A->getOption().getID()) {
	    default:
	      continue;
	    case options::OPT_ftrivial_auto_var_init: {
	      A->claim();
	      StringRef Val = A->getValue();
	      if (Val == "uninitialized" || Val == "zero" || Val == "pattern")
	        TrivialAutoVarInit = Val;
	      else
	        D.Diag(diag::err_drv_unsupported_option_argument)
	            << A->getOption().getName() << Val;
	      break;
	    }
	    }
	  }

	  // No usable value on the command line: use the toolchain default.
	  if (TrivialAutoVarInit.empty()) {
	    switch (DefaultTrivialAutoVarInit) {
	    case LangOptions::TrivialAutoVarInitKind::Uninitialized:
	      break;
	    case LangOptions::TrivialAutoVarInitKind::Pattern:
	      TrivialAutoVarInit = "pattern";
	      break;
	    case LangOptions::TrivialAutoVarInitKind::Zero:
	      TrivialAutoVarInit = "zero";
	      break;
	    }
	  }

	  if (!TrivialAutoVarInit.empty()) {
	    // Zero initialization additionally requires the explicit enabling flag.
	    if (TrivialAutoVarInit == "zero" &&
	        !Args.hasArg(options::OPT_enable_trivial_var_init_zero))
	      D.Diag(diag::err_drv_trivial_auto_var_init_zero_disabled);
	    CmdArgs.push_back(
	        Args.MakeArgString("-ftrivial-auto-var-init=" + TrivialAutoVarInit));
	  }

	  if (Arg *A =
	          Args.getLastArg(options::OPT_ftrivial_auto_var_init_stop_after)) {
	    // The stop-after limit needs an explicit -ftrivial-auto-var-init= whose
	    // last value is something other than "uninitialized".
	    if (!Args.hasArg(options::OPT_ftrivial_auto_var_init) ||
	        StringRef(
	            Args.getLastArg(options::OPT_ftrivial_auto_var_init)->getValue()) ==
	            "uninitialized")
	      D.Diag(diag::err_drv_trivial_auto_var_init_stop_after_missing_dependency);
	    A->claim();
	    StringRef Val = A->getValue();
	    // Parse without exceptions: std::stoi would throw on non-numeric or
	    // out-of-range input instead of producing a diagnostic. getAsInteger
	    // returns true on failure.
	    int StopAfter = 0;
	    if (Val.getAsInteger(10, StopAfter) || StopAfter <= 0)
	      D.Diag(diag::err_drv_trivial_auto_var_init_stop_after_invalid_value);
	    CmdArgs.push_back(
	        Args.MakeArgString("-ftrivial-auto-var-init-stop-after=" + Val));
	  }
	}

	static void RenderOpenCLOptions(const ArgList &Args, ArgStringList &CmdArgs) {
	// cl-denorms-are-zero is not forwarded. It is translated into a generic flag
	// for denormal flushing handling based on the target.
	const unsigned ForwardedArguments[] = {
	options::OPT_cl_opt_disable,
	options::OPT_cl_strict_aliasing,
	options::OPT_cl_single_precision_constant,
	options::OPT_cl_finite_math_only,
	options::OPT_cl_kernel_arg_info,
	options::OPT_cl_unsafe_math_optimizations,
	options::OPT_cl_fast_relaxed_math,
	options::OPT_cl_mad_enable,
	options::OPT_cl_no_signed_zeros,
	options::OPT_cl_fp32_correctly_rounded_divide_sqrt,
	options::OPT_cl_uniform_work_group_size
	};

	if (Arg *A = Args.getLastArg(options::OPT_cl_std_EQ)) {
	std::string CLStdStr = std::string("-cl-std=") + A->getValue();
	CmdArgs.push_back(Args.MakeArgString(CLStdStr));
	}

	for (const auto &Arg : ForwardedArguments)
	if (const auto *A = Args.getLastArg(Arg))
	CmdArgs.push_back(Args.MakeArgString(A->getOption().getPrefixedName()));
	}

	static void RenderARCMigrateToolOptions(const Driver &D, const ArgList &Args,
	ArgStringList &CmdArgs) {
	bool ARCMTEnabled = false;
	if (!Args.hasArg(options::OPT_fno_objc_arc, options::OPT_fobjc_arc)) {
	if (const Arg *A = Args.getLastArg(options::OPT_ccc_arcmt_check,
	options::OPT_ccc_arcmt_modify,
	options::OPT_ccc_arcmt_migrate)) {
	ARCMTEnabled = true;
	switch (A->getOption().getID()) {
	default: llvm_unreachable("missed a case");
	case options::OPT_ccc_arcmt_check:
	CmdArgs.push_back("-arcmt-check");
	break;
	case options::OPT_ccc_arcmt_modify:
	CmdArgs.push_back("-arcmt-modify");
	break;
	case options::OPT_ccc_arcmt_migrate:
	CmdArgs.push_back("-arcmt-migrate");
	CmdArgs.push_back("-mt-migrate-directory");
	CmdArgs.push_back(A->getValue());

	Args.AddLastArg(CmdArgs, options::OPT_arcmt_migrate_report_output);
	Args.AddLastArg(CmdArgs, options::OPT_arcmt_migrate_emit_arc_errors);
	break;
	}
	}
	} else {
	Args.ClaimAllArgs(options::OPT_ccc_arcmt_check);
	Args.ClaimAllArgs(options::OPT_ccc_arcmt_modify);
	Args.ClaimAllArgs(options::OPT_ccc_arcmt_migrate);
	}

	if (const Arg *A = Args.getLastArg(options::OPT_ccc_objcmt_migrate)) {
	if (ARCMTEnabled)
	D.Diag(diag::err_drv_argument_not_allowed_with)
	<< A->getAsString(Args) << "-ccc-arcmt-migrate";

	CmdArgs.push_back("-mt-migrate-directory");
	CmdArgs.push_back(A->getValue());

	if (!Args.hasArg(options::OPT_objcmt_migrate_literals,
	options::OPT_objcmt_migrate_subscripting,
	options::OPT_objcmt_migrate_property)) {
	// None specified, means enable them all.
	CmdArgs.push_back("-objcmt-migrate-literals");
	CmdArgs.push_back("-objcmt-migrate-subscripting");
	CmdArgs.push_back("-objcmt-migrate-property");
	} else {
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_literals);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_subscripting);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_property);
	}
	} else {
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_literals);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_subscripting);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_property);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_all);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_readonly_property);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_readwrite_property);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_property_dot_syntax);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_annotation);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_instancetype);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_nsmacros);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_protocol_conformance);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_atomic_property);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_returns_innerpointer_property);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_ns_nonatomic_iosonly);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_designated_init);
	Args.AddLastArg(CmdArgs, options::OPT_objcmt_whitelist_dir_path);
	}
	}

	/// Appends the builtin-related arguments (-fno-builtin, -fno-builtin-<name>
	/// and -fno-math-builtin) to \p CmdArgs.
	///
	/// \param TC      Toolchain whose architecture is checked for le32.
	/// \param T       Target triple (not consulted by this function).
	/// \param Args    Driver arguments; -fno-builtin-* arguments are claimed.
	/// \param CmdArgs Output argument list.
	static void RenderBuiltinOptions(const ToolChain &TC, const llvm::Triple &T,
	                                 const ArgList &Args, ArgStringList &CmdArgs) {
	  // Builtins are on by default, except when -mkernel is in effect.
	  const bool BuiltinsOnByDefault = !Args.hasArg(options::OPT_mkernel);
	  bool UseBuiltins = Args.hasFlag(options::OPT_fbuiltin,
	                                  options::OPT_fno_builtin,
	                                  BuiltinsOnByDefault);
	  if (!UseBuiltins)
	    CmdArgs.push_back("-fno-builtin");

	  // -ffreestanding implies -fno-builtin.
	  if (Args.hasArg(options::OPT_ffreestanding))
	    UseBuiltins = false;

	  // Walk the -fno-builtin-<name> options. Each one is claimed, but it is
	  // only passed to the frontend while builtins are still enabled as a
	  // whole; otherwise it would be redundant.
	  for (const Arg *A : Args) {
	    if (!A->getOption().matches(options::OPT_fno_builtin_))
	      continue;

	    A->claim();

	    if (UseBuiltins)
	      CmdArgs.push_back(
	          Args.MakeArgString("-fno-builtin-" + StringRef(A->getValue())));
	  }

	  // le32-specific flags:
	  //  -fno-math-builtin: clang should not convert math builtins to intrinsics
	  //                     by default.
	  if (TC.getArch() == llvm::Triple::le32)
	    CmdArgs.push_back("-fno-math-builtin");
	}

	/// Builds the default module cache location, "<cache dir>/clang/ModuleCache",
	/// in \p Result.
	///
	/// \param Result Receives the path.
	/// \returns true if llvm::sys::path::cache_directory() succeeded and the path
	///          was completed, false otherwise.
	bool Driver::getDefaultModuleCachePath(SmallVectorImpl<char> &Result) {
	  if (!llvm::sys::path::cache_directory(Result))
	    return false;

	  llvm::sys::path::append(Result, "clang", "ModuleCache");
	  return true;
	}

	/// Translate the driver's module-related options into cc1 arguments.
	///
	/// \param C            Compilation; queried for crash-diagnostics mode and
	///                     handed the crash-report cache directory as a temp file.
	/// \param D            Driver, used for diagnostics and for ResourceDir.
	/// \param Args         Parsed driver arguments.
	/// \param Input        Primary input; its type decides whether -fmodules is
	///                     honoured for C++ inputs.
	/// \param Output       Job output; its filename anchors the module cache and
	///                     VFS directories when generating crash reports.
	/// \param CmdArgs      cc1 argument list being built; only appended to.
	/// \param HaveModules  In/out flag, set when -fmodules (Clang modules) or
	///                     -fmodules-ts is in effect.
	static void RenderModulesOptions(Compilation &C, const Driver &D,
	                                 const ArgList &Args, const InputInfo &Input,
	                                 const InputInfo &Output,
	                                 ArgStringList &CmdArgs, bool &HaveModules) {
	  // -fmodules enables the use of precompiled modules (off by default).
	  // Users can pass -fno-cxx-modules to turn off modules support for
	  // C++/Objective-C++ programs.
	  bool HaveClangModules = false;
	  if (Args.hasFlag(options::OPT_fmodules, options::OPT_fno_modules, false)) {
	    bool AllowedInCXX = Args.hasFlag(options::OPT_fcxx_modules,
	                                     options::OPT_fno_cxx_modules, true);
	    if (AllowedInCXX || !types::isCXX(Input.getType())) {
	      CmdArgs.push_back("-fmodules");
	      HaveClangModules = true;
	    }
	  }

	  HaveModules |= HaveClangModules;
	  if (Args.hasArg(options::OPT_fmodules_ts)) {
	    CmdArgs.push_back("-fmodules-ts");
	    HaveModules = true;
	  }

	  // -fmodule-maps enables implicit reading of module map files. By default,
	  // this is enabled if we are using Clang's flavor of precompiled modules.
	  if (Args.hasFlag(options::OPT_fimplicit_module_maps,
	                   options::OPT_fno_implicit_module_maps, HaveClangModules))
	    CmdArgs.push_back("-fimplicit-module-maps");

	  // -fmodules-decluse checks that modules used are declared so (off by default)
	  if (Args.hasFlag(options::OPT_fmodules_decluse,
	                   options::OPT_fno_modules_decluse, false))
	    CmdArgs.push_back("-fmodules-decluse");

	  // -fmodules-strict-decluse is like -fmodule-decluse, but also checks that
	  // all #included headers are part of modules.
	  if (Args.hasFlag(options::OPT_fmodules_strict_decluse,
	                   options::OPT_fno_modules_strict_decluse, false))
	    CmdArgs.push_back("-fmodules-strict-decluse");

	  // -fno-implicit-modules turns off implicitly compiling modules on demand.
	  bool ImplicitModules = false;
	  if (!Args.hasFlag(options::OPT_fimplicit_modules,
	                    options::OPT_fno_implicit_modules, HaveClangModules)) {
	    if (HaveModules)
	      CmdArgs.push_back("-fno-implicit-modules");
	  } else if (HaveModules) {
	    ImplicitModules = true;
	    // -fmodule-cache-path specifies where our implicitly-built module files
	    // should be written.
	    SmallString<128> Path;
	    if (Arg *A = Args.getLastArg(options::OPT_fmodules_cache_path))
	      Path = A->getValue();

	    bool HasPath = true;
	    if (C.isForDiagnostics()) {
	      // When generating crash reports, we want to emit the modules along with
	      // the reproduction sources, so we ignore any provided module path.
	      Path = Output.getFilename();
	      llvm::sys::path::replace_extension(Path, ".cache");
	      llvm::sys::path::append(Path, "modules");
	    } else if (Path.empty()) {
	      // No module path was provided: use the default.
	      HasPath = Driver::getDefaultModuleCachePath(Path);
	    }

	    // `HasPath` will only be false if getDefaultModuleCachePath() fails.
	    // That being said, that failure is unlikely and not caching is harmless.
	    if (HasPath) {
	      // Turn the bare path into a complete "-fmodules-cache-path=<path>"
	      // argument by prepending the option spelling in place.
	      const char Arg[] = "-fmodules-cache-path=";
	      Path.insert(Path.begin(), Arg, Arg + strlen(Arg));
	      CmdArgs.push_back(Args.MakeArgString(Path));
	    }
	  }

	  if (HaveModules) {
	    // -fprebuilt-module-path specifies where to load the prebuilt module files.
	    for (const Arg *A : Args.filtered(options::OPT_fprebuilt_module_path)) {
	      CmdArgs.push_back(Args.MakeArgString(
	          std::string("-fprebuilt-module-path=") + A->getValue()));
	      A->claim();
	    }
	    if (Args.hasFlag(options::OPT_fmodules_validate_input_files_content,
	                     options::OPT_fno_modules_validate_input_files_content,
	                     false))
	      CmdArgs.push_back("-fvalidate-ast-input-files-content");
	  }

	  // -fmodule-name specifies the module that is currently being built (or
	  // used for header checking by -fmodule-maps).
	  Args.AddLastArg(CmdArgs, options::OPT_fmodule_name_EQ);

	  // -fmodule-map-file can be used to specify files containing module
	  // definitions.
	  Args.AddAllArgs(CmdArgs, options::OPT_fmodule_map_file);

	  // -fbuiltin-module-map can be used to load the clang
	  // builtin headers modulemap file.
	  if (Args.hasArg(options::OPT_fbuiltin_module_map)) {
	    SmallString<128> BuiltinModuleMap(D.ResourceDir);
	    llvm::sys::path::append(BuiltinModuleMap, "include");
	    llvm::sys::path::append(BuiltinModuleMap, "module.modulemap");
	    if (llvm::sys::fs::exists(BuiltinModuleMap))
	      CmdArgs.push_back(
	          Args.MakeArgString("-fmodule-map-file=" + BuiltinModuleMap));
	  }

	  // The -fmodule-file=<name>=<file> form specifies the mapping of module
	  // names to precompiled module files (the module is loaded only if used).
	  // The -fmodule-file=<file> form can be used to unconditionally load
	  // precompiled module files (whether used or not).
	  if (HaveModules)
	    Args.AddAllArgs(CmdArgs, options::OPT_fmodule_file);
	  else
	    Args.ClaimAllArgs(options::OPT_fmodule_file);

	  // When building modules and generating crashdumps, we need to dump a module
	  // dependency VFS alongside the output.
	  if (HaveClangModules && C.isForDiagnostics()) {
	    SmallString<128> VFSDir(Output.getFilename());
	    llvm::sys::path::replace_extension(VFSDir, ".cache");
	    // Add the cache directory as a temp so the crash diagnostics pick it up.
	    C.addTempFile(Args.MakeArgString(VFSDir));

	    llvm::sys::path::append(VFSDir, "vfs");
	    CmdArgs.push_back("-module-dependency-dir");
	    CmdArgs.push_back(Args.MakeArgString(VFSDir));
	  }

	  if (HaveClangModules)
	    Args.AddLastArg(CmdArgs, options::OPT_fmodules_user_build_path);

	  // Pass through all -fmodules-ignore-macro arguments.
	  Args.AddAllArgs(CmdArgs, options::OPT_fmodules_ignore_macro);
	  Args.AddLastArg(CmdArgs, options::OPT_fmodules_prune_interval);
	  Args.AddLastArg(CmdArgs, options::OPT_fmodules_prune_after);

	  Args.AddLastArg(CmdArgs, options::OPT_fbuild_session_timestamp);

	  // -fbuild-session-file derives the build session timestamp from a file's
	  // modification time; it is mutually exclusive with an explicit timestamp.
	  if (Arg *A = Args.getLastArg(options::OPT_fbuild_session_file)) {
	    if (Args.hasArg(options::OPT_fbuild_session_timestamp))
	      D.Diag(diag::err_drv_argument_not_allowed_with)
	          << A->getAsString(Args) << "-fbuild-session-timestamp";

	    llvm::sys::fs::file_status Status;
	    if (llvm::sys::fs::status(A->getValue(), Status))
	      D.Diag(diag::err_drv_no_such_file) << A->getValue();
	    // -fbuild-session-timestamp is expressed in seconds since the Epoch, so
	    // convert the modification time to whole seconds explicitly instead of
	    // forwarding the time point's raw tick count.
	    // NOTE(review): this assumes the time point's native resolution is finer
	    // than seconds (LLVM's TimePoint is believed to be nanoseconds) - confirm.
	    CmdArgs.push_back(Args.MakeArgString(
	        "-fbuild-session-timestamp=" +
	        Twine((uint64_t)std::chrono::duration_cast<std::chrono::seconds>(
	                  Status.getLastModificationTime().time_since_epoch())
	                  .count())));
	  }

	  // Validating once per build session only makes sense when the session has a
	  // timestamp, supplied either directly or through a session file.
	  if (Args.getLastArg(options::OPT_fmodules_validate_once_per_build_session)) {
	    if (!Args.getLastArg(options::OPT_fbuild_session_timestamp,
	                         options::OPT_fbuild_session_file))
	      D.Diag(diag::err_drv_modules_validate_once_requires_timestamp);

	    Args.AddLastArg(CmdArgs,
	                    options::OPT_fmodules_validate_once_per_build_session);
	  }

	  // System header validation defaults to on exactly when implicit modules are
	  // being built.
	  if (Args.hasFlag(options::OPT_fmodules_validate_system_headers,
	                   options::OPT_fno_modules_validate_system_headers,
	                   ImplicitModules))
	    CmdArgs.push_back("-fmodules-validate-system-headers");

	  Args.AddLastArg(CmdArgs, options::OPT_fmodules_disable_diagnostic_validation);
	}

	/// Emit the cc1 options that control the signedness of char and the
	/// representation of wchar_t.
	///
	/// \param Args     Parsed driver arguments.
	/// \param T        Target triple; selects the default char signedness and the
	///                 signedness used for wchar_t under -fno-short-wchar.
	/// \param CmdArgs  cc1 argument list being built; only appended to.
	static void RenderCharacterOptions(const ArgList &Args, const llvm::Triple &T,
	                                   ArgStringList &CmdArgs) {
	  // -fsigned-char is default.
	  if (const Arg *A = Args.getLastArg(options::OPT_fsigned_char,
	                                     options::OPT_fno_signed_char,
	                                     options::OPT_funsigned_char,
	                                     options::OPT_fno_unsigned_char)) {
	    // Only the two spellings that request an unsigned char need forwarding.
	    if (A->getOption().matches(options::OPT_funsigned_char) ||
	        A->getOption().matches(options::OPT_fno_signed_char)) {
	      CmdArgs.push_back("-fno-signed-char");
	    }
	  } else if (!isSignedCharDefault(T)) {
	    CmdArgs.push_back("-fno-signed-char");
	  }

	  // The default depends on the language standard.
	  Args.AddLastArg(CmdArgs, options::OPT_fchar8__t, options::OPT_fno_char8__t);

	  if (const Arg *A = Args.getLastArg(options::OPT_fshort_wchar,
	                                     options::OPT_fno_short_wchar)) {
	    if (A->getOption().matches(options::OPT_fshort_wchar)) {
	      CmdArgs.push_back("-fwchar-type=short");
	      CmdArgs.push_back("-fno-signed-wchar");
	    } else {
	      // -fno-short-wchar: wchar_t is int-sized. It is unsigned on ARM-family
	      // targets, except on Windows, NetBSD and OpenBSD where it is signed.
	      bool IsARM = T.isARM() || T.isThumb() || T.isAArch64();
	      CmdArgs.push_back("-fwchar-type=int");
	      if (IsARM && !(T.isOSWindows() || T.isOSNetBSD() ||
	                     T.isOSOpenBSD()))
	        CmdArgs.push_back("-fno-signed-wchar");
	      else
	        CmdArgs.push_back("-fsigned-wchar");
	    }
	  }
	}

	/// Emit the cc1 options that depend on the Objective-C runtime and on
	/// Objective-C specific driver flags.
	///
	/// \param TC       Tool chain; supplies the architecture, the dispatch
	///                 preference, the ARC check and the C++ standard library.
	/// \param D        Driver, used for diagnostics.
	/// \param T        Target triple.
	/// \param Args     Parsed driver arguments.
	/// \param Runtime  Objective-C runtime in effect.
	/// \param InferCovariantReturns  When true, "-fno-objc-infer-related-result-type"
	///                 is emitted. NOTE(review): despite its name a true value
	///                 *disables* inference; presumably callers pass "is the
	///                 Objective-C rewriter active" - confirm against the caller.
	/// \param Input    Primary input; its type gates the Objective-C-only options.
	/// \param CmdArgs  cc1 argument list being built; only appended to.
	static void RenderObjCOptions(const ToolChain &TC, const Driver &D,
	                              const llvm::Triple &T, const ArgList &Args,
	                              ObjCRuntime &Runtime, bool InferCovariantReturns,
	                              const InputInfo &Input, ArgStringList &CmdArgs) {
	  const llvm::Triple::ArchType Arch = TC.getArch();

	  // -fobjc-dispatch-method is only relevant with the nonfragile-abi, and legacy
	  // is the default. Except for deployment target of 10.5, next runtime is
	  // always legacy dispatch and -fno-objc-legacy-dispatch gets ignored silently.
	  if (Runtime.isNonFragile()) {
	    if (!Args.hasFlag(options::OPT_fobjc_legacy_dispatch,
	                      options::OPT_fno_objc_legacy_dispatch,
	                      Runtime.isLegacyDispatchDefaultForArch(Arch))) {
	      if (TC.UseObjCMixedDispatch())
	        CmdArgs.push_back("-fobjc-dispatch-method=mixed");
	      else
	        CmdArgs.push_back("-fobjc-dispatch-method=non-legacy");
	    }
	  }

	  // When ObjectiveC legacy runtime is in effect on MacOSX, turn on the option
	  // to do Array/Dictionary subscripting by default.
	  if (Arch == llvm::Triple::x86 && T.isMacOSX() &&
	      Runtime.getKind() == ObjCRuntime::FragileMacOSX && Runtime.isNeXTFamily())
	    CmdArgs.push_back("-fobjc-subscripting-legacy-runtime");

	  // Allow -fno-objc-arr to trump -fobjc-arr/-fobjc-arc.
	  // NOTE: This logic is duplicated in ToolChains.cpp.
	  if (isObjCAutoRefCount(Args)) {
	    TC.CheckObjCARC();

	    CmdArgs.push_back("-fobjc-arc");

	    // FIXME: It seems like this entire block, and several around it should be
	    // wrapped in isObjC, but for now we just use it here as this is where it
	    // was being used previously.
	    if (types::isCXX(Input.getType()) && types::isObjC(Input.getType())) {
	      if (TC.GetCXXStdlibType(Args) == ToolChain::CST_Libcxx)
	        CmdArgs.push_back("-fobjc-arc-cxxlib=libc++");
	      else
	        CmdArgs.push_back("-fobjc-arc-cxxlib=libstdc++");
	    }

	    // Allow the user to enable full exceptions code emission.
	    // We default off for Objective-C, on for Objective-C++.
	    if (Args.hasFlag(options::OPT_fobjc_arc_exceptions,
	                     options::OPT_fno_objc_arc_exceptions,
	                     /*Default=*/types::isCXX(Input.getType())))
	      CmdArgs.push_back("-fobjc-arc-exceptions");
	  }

	  // Silence warning for full exception code emission options when explicitly
	  // set to use no ARC.
	  if (Args.hasArg(options::OPT_fno_objc_arc)) {
	    Args.ClaimAllArgs(options::OPT_fobjc_arc_exceptions);
	    Args.ClaimAllArgs(options::OPT_fno_objc_arc_exceptions);
	  }

	  // Allow the user to control whether messages can be converted to runtime
	  // functions. Only the negative form is forwarded.
	  if (types::isObjC(Input.getType())) {
	    auto *Arg = Args.getLastArg(
	        options::OPT_fobjc_convert_messages_to_runtime_calls,
	        options::OPT_fno_objc_convert_messages_to_runtime_calls);
	    if (Arg &&
	        Arg->getOption().matches(
	            options::OPT_fno_objc_convert_messages_to_runtime_calls))
	      CmdArgs.push_back("-fno-objc-convert-messages-to-runtime-calls");
	  }

	  // -fobjc-infer-related-result-type is the default, except in the Objective-C
	  // rewriter.
	  if (InferCovariantReturns)
	    CmdArgs.push_back("-fno-objc-infer-related-result-type");

	  // Pass down -fobjc-weak or -fno-objc-weak if present.
	  if (types::isObjC(Input.getType())) {
	    auto WeakArg =
	        Args.getLastArg(options::OPT_fobjc_weak, options::OPT_fno_objc_weak);
	    if (!WeakArg) {
	      // nothing to do
	    } else if (!Runtime.allowsWeak()) {
	      // The runtime cannot do weak references: asking for them is an error,
	      // while -fno-objc-weak is simply dropped.
	      if (WeakArg->getOption().matches(options::OPT_fobjc_weak))
	        D.Diag(diag::err_objc_weak_unsupported);
	    } else {
	      WeakArg->render(Args, CmdArgs);
	    }
	  }
	}

	static void RenderDiagnosticsOptions(const Driver &D, const ArgList &Args,
	ArgStringList &CmdArgs) {
	bool CaretDefault = true;
	bool ColumnDefault = true;

	if (const Arg *A = Args.getLastArg(options::OPT__SLASH_diagnostics_classic,
	options::OPT__SLASH_diagnostics_column,
	options::OPT__SLASH_diagnostics_caret)) {
	switch (A->getOption().getID()) {
	case options::OPT__SLASH_diagnostics_caret:
	CaretDefault = true;
	ColumnDefault = true;
	break;
	case options::OPT__SLASH_diagnostics_column:
	CaretDefault = false;
	ColumnDefault = true;
	break;
	case options::OPT__SLASH_diagnostics_classic:
	CaretDefault = false;
	ColumnDefault = false;
	break;
	}
	}

	// -fcaret-diagnostics is default.
	if (!Args.hasFlag(options::OPT_fcaret_diagnostics,
	options::OPT_fno_caret_diagnostics, CaretDefault))
	CmdArgs.push_back("-fno-caret-diagnostics");

	// -fdiagnostics-fixit-info is default, only pass non-default.
	if (!Args.hasFlag(options::OPT_fdiagnostics_fixit_info,
	options::OPT_fno_diagnostics_fixit_info))
	CmdArgs.push_back("-fno-diagnostics-fixit-info");

	// Enable -fdiagnostics-show-option by default.
	if (!Args.hasFlag(options::OPT_fdiagnostics_show_option,
	options::OPT_fno_diagnostics_show_option, true))
	CmdArgs.push_back("-fno-diagnostics-show-option");

	if (const Arg *A =
	Args.getLastArg(options::OPT_fdiagnostics_show_category_EQ)) {
	CmdArgs.push_back("-fdiagnostics-show-category");
	CmdArgs.push_back(A->getValue());
	}

	if (Args.hasFlag(options::OPT_fdiagnostics_show_hotness,
	options::OPT_fno_diagnostics_show_hotness, false))
	CmdArgs.push_back("-fdiagnostics-show-hotness");

	if (const Arg *A =
	Args.getLastArg(options::OPT_fdiagnostics_hotness_threshold_EQ)) {
	std::string Opt =
	std::string("-fdiagnostics-hotness-threshold=") + A->getValue();
	CmdArgs.push_back(Args.MakeArgString(Opt));
	}

	if (const Arg *A = Args.getLastArg(options::OPT_fdiagnostics_format_EQ)) {
	CmdArgs.push_back("-fdiagnostics-format");
	CmdArgs.push_back(A->getValue());
	}

	if (const Arg *A = Args.getLastArg(
	options::OPT_fdiagnostics_show_note_include_stack,
	options::OPT_fno_diagnostics_show_note_include_stack)) {
	const Option &O = A->getOption();
	if (O.matches(options::OPT_fdiagnostics_show_note_include_stack))
	CmdArgs.push_back("-fdiagnostics-show-note-include-stack");
	else
	CmdArgs.push_back("-fno-diagnostics-show-note-include-stack");
	}

	// Color diagnostics are parsed by the driver directly from argv and later
	// re-parsed to construct this job; claim any possible color diagnostic here
	// to avoid warn_drv_unused_argument and diagnose bad
	// OPT_fdiagnostics_color_EQ values.
	for (const Arg *A : Args) {
	const Option &O = A->getOption();
	if (!O.matches(options::OPT_fcolor_diagnostics) &&
	!O.matches(options::OPT_fdiagnostics_color) &&
	!O.matches(options::OPT_fno_color_diagnostics) &&
	!O.matches(options::OPT_fno_diagnostics_color) &&
	!O.matches(options::OPT_fdiagnostics_color_EQ))
	continue;

	if (O.matches(options::OPT_fdiagnostics_color_EQ)) {
	StringRef Value(A->getValue());
	if (Value != "always" && Value != "never" && Value != "auto")
	D.Diag(diag::err_drv_clang_unsupported)
	<< ("-fdiagnostics-color=" + Value).str();
	}
	A->claim();
	}

	if (D.getDiags().getDiagnosticOptions().ShowColors)
	CmdArgs.push_back("-fcolor-diagnostics");

	if (Args.hasArg(options::OPT_fansi_escape_codes))
	CmdArgs.push_back("-fansi-escape-codes");

	if (!Args.hasFlag(options::OPT_fshow_source_location,
	options::OPT_fno_show_source_location))
	CmdArgs.push_back("-fno-show-source-location");

	if (Args.hasArg(options::OPT_fdiagnostics_absolute_paths))
	CmdArgs.push_back("-fdiagnostics-absolute-paths");

	if (!Args.hasFlag(options::OPT_fshow_column, options::OPT_fno_show_column,
	ColumnDefault))
	CmdArgs.push_back("-fno-show-column");

	if (!Args.hasFlag(options::OPT_fspell_checking,
	options::OPT_fno_spell_checking))
	CmdArgs.push_back("-fno-spell-checking");
	}

	/// Result of interpreting the -gsplit-dwarf / -gsplit-dwarf=<mode> options.
	enum class DwarfFissionKind {
	  /// No split-DWARF option in effect (or its value was rejected).
	  None,
	  /// Selected by plain -gsplit-dwarf or by -gsplit-dwarf=split.
	  Split,
	  /// Selected by -gsplit-dwarf=single.
	  Single
	};

	/// Work out which split-DWARF mode the command line asks for.
	///
	/// \param D     Driver, used to diagnose an unsupported -gsplit-dwarf= value.
	/// \param Args  Parsed driver arguments.
	/// \param Arg   Out: the last -gsplit-dwarf / -gsplit-dwarf= argument, or null
	///              when neither was given.
	/// \returns Split for -gsplit-dwarf and -gsplit-dwarf=split, Single for
	///          -gsplit-dwarf=single, and None when the option is absent or its
	///          value is not recognised (the latter is also diagnosed).
	static DwarfFissionKind getDebugFissionKind(const Driver &D,
	                                            const ArgList &Args, Arg *&Arg) {
	  Arg =
	      Args.getLastArg(options::OPT_gsplit_dwarf, options::OPT_gsplit_dwarf_EQ);

	  DwarfFissionKind Kind = DwarfFissionKind::None;
	  if (Arg) {
	    if (Arg->getOption().matches(options::OPT_gsplit_dwarf)) {
	      // The bare flag carries no value and always means "split".
	      Kind = DwarfFissionKind::Split;
	    } else {
	      const StringRef Mode = Arg->getValue();
	      if (Mode == "split") {
	        Kind = DwarfFissionKind::Split;
	      } else if (Mode == "single") {
	        Kind = DwarfFissionKind::Single;
	      } else {
	        D.Diag(diag::err_drv_unsupported_option_argument)
	            << Arg->getOption().getName() << Arg->getValue();
	      }
	    }
	  }
	  return Kind;
	}

	/// Decide the debug-info configuration for a cc1 job and render it.
	///
	/// The many -g* driver options are reduced to a debug-info level, a DWARF
	/// version and a debugger tuning, which are then rendered through
	/// RenderDebugEnablingArgs() together with a number of secondary options.
	///
	/// \param TC             Tool chain; supplies defaults (debugger tuning, debug
	///                       format, DWARF version, standalone debug) and may
	///                       adjust the final debug-info kind.
	/// \param D              Driver, used for diagnostics.
	/// \param T              Target triple.
	/// \param Args           Parsed driver arguments.
	/// \param EmitCodeView   Whether CodeView was already requested by the caller;
	///                       taken by value and updated locally.
	/// \param CmdArgs        cc1 argument list being built; only appended to.
	/// \param DebugInfoKind  In/out: debug-info level; refined by this function.
	/// \param DwarfFission   Out: split-DWARF mode selected for this job.
	static void RenderDebugOptions(const ToolChain &TC, const Driver &D,
	                               const llvm::Triple &T, const ArgList &Args,
	                               bool EmitCodeView, ArgStringList &CmdArgs,
	                               codegenoptions::DebugInfoKind &DebugInfoKind,
	                               DwarfFissionKind &DwarfFission) {
	  if (Args.hasFlag(options::OPT_fdebug_info_for_profiling,
	                   options::OPT_fno_debug_info_for_profiling, false) &&
	      checkDebugInfoOption(
	          Args.getLastArg(options::OPT_fdebug_info_for_profiling), Args, D, TC))
	    CmdArgs.push_back("-fdebug-info-for-profiling");

	  // The 'g' groups options involve a somewhat intricate sequence of decisions
	  // about what to pass from the driver to the frontend, but by the time they
	  // reach cc1 they've been factored into three well-defined orthogonal choices:
	  //  * what level of debug info to generate
	  //  * what dwarf version to write
	  //  * what debugger tuning to use
	  // This avoids having to monkey around further in cc1 other than to disable
	  // codeview if not running in a Windows environment. Perhaps even that
	  // decision should be made in the driver as well though.
	  llvm::DebuggerKind DebuggerTuning = TC.getDefaultDebuggerTuning();

	  bool SplitDWARFInlining =
	      Args.hasFlag(options::OPT_fsplit_dwarf_inlining,
	                   options::OPT_fno_split_dwarf_inlining, false);

	  Args.ClaimAllArgs(options::OPT_g_Group);

	  // getDebugFissionKind() always assigns this; initialise it anyway so the
	  // variable never holds an indeterminate value.
	  Arg *SplitDWARFArg = nullptr;
	  DwarfFission = getDebugFissionKind(D, Args, SplitDWARFArg);

	  // A split-DWARF request rejected by checkDebugInfoOption() is dropped,
	  // together with the inlining setting that only matters alongside it.
	  if (DwarfFission != DwarfFissionKind::None &&
	      !checkDebugInfoOption(SplitDWARFArg, Args, D, TC)) {
	    DwarfFission = DwarfFissionKind::None;
	    SplitDWARFInlining = false;
	  }

	  if (const Arg *A =
	          Args.getLastArg(options::OPT_g_Group, options::OPT_gsplit_dwarf,
	                          options::OPT_gsplit_dwarf_EQ)) {
	    DebugInfoKind = codegenoptions::DebugInfoConstructor;

	    // If the last option explicitly specified a debug-info level, use it.
	    if (checkDebugInfoOption(A, Args, D, TC) &&
	        A->getOption().matches(options::OPT_gN_Group)) {
	      DebugInfoKind = DebugLevelToInfoKind(*A);
	      // For -g0 or -gline-tables-only, drop -gsplit-dwarf. This gets a bit more
	      // complicated if you've disabled inline info in the skeleton CUs
	      // (SplitDWARFInlining) - then there's value in composing split-dwarf and
	      // line-tables-only, so let those compose naturally in that case.
	      if (DebugInfoKind == codegenoptions::NoDebugInfo ||
	          DebugInfoKind == codegenoptions::DebugDirectivesOnly ||
	          (DebugInfoKind == codegenoptions::DebugLineTablesOnly &&
	           SplitDWARFInlining))
	        DwarfFission = DwarfFissionKind::None;
	    }
	  }

	  // If a debugger tuning argument appeared, remember it.
	  if (const Arg *A =
	          Args.getLastArg(options::OPT_gTune_Group, options::OPT_ggdbN_Group)) {
	    if (checkDebugInfoOption(A, Args, D, TC)) {
	      if (A->getOption().matches(options::OPT_glldb))
	        DebuggerTuning = llvm::DebuggerKind::LLDB;
	      else if (A->getOption().matches(options::OPT_gsce))
	        DebuggerTuning = llvm::DebuggerKind::SCE;
	      else
	        DebuggerTuning = llvm::DebuggerKind::GDB;
	    }
	  }

	  // If a -gdwarf argument appeared, remember it.
	  const Arg *GDwarfN = Args.getLastArg(
	      options::OPT_gdwarf_2, options::OPT_gdwarf_3, options::OPT_gdwarf_4,
	      options::OPT_gdwarf_5, options::OPT_gdwarf);
	  bool EmitDwarf = false;
	  if (GDwarfN) {
	    if (checkDebugInfoOption(GDwarfN, Args, D, TC))
	      EmitDwarf = true;
	    else
	      GDwarfN = nullptr;
	  }

	  if (const Arg *A = Args.getLastArg(options::OPT_gcodeview)) {
	    if (checkDebugInfoOption(A, Args, D, TC))
	      EmitCodeView = true;
	  }

	  // If the user asked for debug info but did not explicitly specify -gcodeview
	  // or -gdwarf, ask the toolchain for the default format.
	  if (!EmitCodeView && !EmitDwarf &&
	      DebugInfoKind != codegenoptions::NoDebugInfo) {
	    switch (TC.getDefaultDebugFormat()) {
	    case codegenoptions::DIF_CodeView:
	      EmitCodeView = true;
	      break;
	    case codegenoptions::DIF_DWARF:
	      EmitDwarf = true;
	      break;
	    }
	  }

	  // DWARFVersion stays 0 when DWARF is not being emitted at all.
	  unsigned DWARFVersion = 0;
	  unsigned DefaultDWARFVersion = ParseDebugDefaultVersion(TC, Args);
	  if (EmitDwarf) {
	    // Start with the platform default DWARF version
	    DWARFVersion = TC.GetDefaultDwarfVersion();
	    assert(DWARFVersion && "toolchain default DWARF version must be nonzero");

	    // If the user specified a default DWARF version, that takes precedence
	    // over the platform default.
	    if (DefaultDWARFVersion)
	      DWARFVersion = DefaultDWARFVersion;

	    // Override with a user-specified DWARF version
	    if (GDwarfN)
	      if (auto ExplicitVersion = DwarfVersionNum(GDwarfN->getSpelling()))
	        DWARFVersion = ExplicitVersion;
	  }

	  // -gline-directives-only supported only for the DWARF debug info.
	  if (DWARFVersion == 0 && DebugInfoKind == codegenoptions::DebugDirectivesOnly)
	    DebugInfoKind = codegenoptions::NoDebugInfo;

	  // We ignore flag -gstrict-dwarf for now.
	  // And we handle flag -grecord-gcc-switches later with DWARFDebugFlags.
	  Args.ClaimAllArgs(options::OPT_g_flags_Group);

	  // Column info is included by default for everything except SCE and
	  // CodeView. Clang doesn't track end columns, just starting columns, which,
	  // in theory, is fine for CodeView (and PDB). In practice, however, the
	  // Microsoft debuggers don't handle missing end columns well, so it's better
	  // not to include any column info.
	  if (const Arg *A = Args.getLastArg(options::OPT_gcolumn_info))
	    (void)checkDebugInfoOption(A, Args, D, TC);
	  if (!Args.hasFlag(options::OPT_gcolumn_info, options::OPT_gno_column_info,
	                    !EmitCodeView && DebuggerTuning != llvm::DebuggerKind::SCE))
	    CmdArgs.push_back("-gno-column-info");

	  // FIXME: Move backend command line options to the module.
	  // If -gline-tables-only or -gline-directives-only is the last option it wins.
	  if (const Arg *A = Args.getLastArg(options::OPT_gmodules))
	    if (checkDebugInfoOption(A, Args, D, TC)) {
	      if (DebugInfoKind != codegenoptions::DebugLineTablesOnly &&
	          DebugInfoKind != codegenoptions::DebugDirectivesOnly) {
	        DebugInfoKind = codegenoptions::DebugInfoConstructor;
	        CmdArgs.push_back("-dwarf-ext-refs");
	        CmdArgs.push_back("-fmodule-format=obj");
	      }
	    }

	  if (T.isOSBinFormatELF() && !SplitDWARFInlining)
	    CmdArgs.push_back("-fno-split-dwarf-inlining");

	  // After we've dealt with all combinations of things that could
	  // make DebugInfoKind be other than None or DebugLineTablesOnly,
	  // figure out if we need to "upgrade" it to standalone debug info.
	  // We parse these two '-f' options whether or not they will be used,
	  // to claim them even if you wrote "-fstandalone-debug -gline-tables-only"
	  bool NeedFullDebug = Args.hasFlag(
	      options::OPT_fstandalone_debug, options::OPT_fno_standalone_debug,
	      DebuggerTuning == llvm::DebuggerKind::LLDB ||
	          TC.GetDefaultStandaloneDebug());
	  if (const Arg *A = Args.getLastArg(options::OPT_fstandalone_debug))
	    (void)checkDebugInfoOption(A, Args, D, TC);
	  if ((DebugInfoKind == codegenoptions::LimitedDebugInfo ||
	       DebugInfoKind == codegenoptions::DebugInfoConstructor) &&
	      NeedFullDebug)
	    DebugInfoKind = codegenoptions::FullDebugInfo;

	  if (Args.hasFlag(options::OPT_gembed_source, options::OPT_gno_embed_source,
	                   false)) {
	    // Source embedding is a vendor extension to DWARF v5. By now we have
	    // checked if a DWARF version was stated explicitly, and have otherwise
	    // fallen back to the target default, so if this is still not at least 5
	    // we emit an error.
	    const Arg *A = Args.getLastArg(options::OPT_gembed_source);
	    if (DWARFVersion < 5)
	      D.Diag(diag::err_drv_argument_only_allowed_with)
	          << A->getAsString(Args) << "-gdwarf-5";
	    else if (checkDebugInfoOption(A, Args, D, TC))
	      CmdArgs.push_back("-gembed-source");
	  }

	  if (EmitCodeView) {
	    CmdArgs.push_back("-gcodeview");

	    // Emit codeview type hashes if requested.
	    if (Args.hasFlag(options::OPT_gcodeview_ghash,
	                     options::OPT_gno_codeview_ghash, false)) {
	      CmdArgs.push_back("-gcodeview-ghash");
	    }
	  }

	  // Omit inline line tables if requested.
	  if (Args.hasFlag(options::OPT_gno_inline_line_tables,
	                   options::OPT_ginline_line_tables, false)) {
	    CmdArgs.push_back("-gno-inline-line-tables");
	  }

	  // Adjust the debug info kind for the given toolchain.
	  TC.adjustDebugInfoKind(DebugInfoKind, Args);

	  // When emitting remarks, we need at least debug lines in the output.
	  if (willEmitRemarks(Args) &&
	      DebugInfoKind <= codegenoptions::DebugDirectivesOnly)
	    DebugInfoKind = codegenoptions::DebugLineTablesOnly;

	  RenderDebugEnablingArgs(Args, CmdArgs, DebugInfoKind, DWARFVersion,
	                          DebuggerTuning);

	  // -fdebug-macro turns on macro debug info generation.
	  if (Args.hasFlag(options::OPT_fdebug_macro, options::OPT_fno_debug_macro,
	                   false))
	    if (checkDebugInfoOption(Args.getLastArg(options::OPT_fdebug_macro), Args,
	                             D, TC))
	      CmdArgs.push_back("-debug-info-macro");

	  // -ggnu-pubnames turns on gnu style pubnames in the backend.
	  // Pubnames are emitted when split DWARF is active or a pubnames option was
	  // accepted, unless the last such option is one of the negative forms.
	  // -gpubnames selects the plain style, everything else the GNU style.
	  const auto *PubnamesArg =
	      Args.getLastArg(options::OPT_ggnu_pubnames, options::OPT_gno_gnu_pubnames,
	                      options::OPT_gpubnames, options::OPT_gno_pubnames);
	  if (DwarfFission != DwarfFissionKind::None ||
	      (PubnamesArg && checkDebugInfoOption(PubnamesArg, Args, D, TC)))
	    if (!PubnamesArg ||
	        (!PubnamesArg->getOption().matches(options::OPT_gno_gnu_pubnames) &&
	         !PubnamesArg->getOption().matches(options::OPT_gno_pubnames)))
	      CmdArgs.push_back(PubnamesArg && PubnamesArg->getOption().matches(
	                                           options::OPT_gpubnames)
	                            ? "-gpubnames"
	                            : "-ggnu-pubnames");

	  if (Args.hasFlag(options::OPT_fdebug_ranges_base_address,
	                   options::OPT_fno_debug_ranges_base_address, false)) {
	    CmdArgs.push_back("-fdebug-ranges-base-address");
	  }

	  // -gdwarf-aranges turns on the emission of the aranges section in the
	  // backend.
	  // Always enabled for SCE tuning.
	  bool NeedAranges = DebuggerTuning == llvm::DebuggerKind::SCE;
	  if (const Arg *A = Args.getLastArg(options::OPT_gdwarf_aranges))
	    NeedAranges = checkDebugInfoOption(A, Args, D, TC) || NeedAranges;
	  if (NeedAranges) {
	    CmdArgs.push_back("-mllvm");
	    CmdArgs.push_back("-generate-arange-section");
	  }

	  if (Args.hasFlag(options::OPT_fforce_dwarf_frame,
	                   options::OPT_fno_force_dwarf_frame, false))
	    CmdArgs.push_back("-fforce-dwarf-frame");

	  // Type units are only rendered for ELF targets; elsewhere the request is
	  // diagnosed as unsupported.
	  if (Args.hasFlag(options::OPT_fdebug_types_section,
	                   options::OPT_fno_debug_types_section, false)) {
	    if (!T.isOSBinFormatELF()) {
	      D.Diag(diag::err_drv_unsupported_opt_for_target)
	          << Args.getLastArg(options::OPT_fdebug_types_section)
	                 ->getAsString(Args)
	          << T.getTriple();
	    } else if (checkDebugInfoOption(
	                   Args.getLastArg(options::OPT_fdebug_types_section), Args, D,
	                   TC)) {
	      CmdArgs.push_back("-mllvm");
	      CmdArgs.push_back("-generate-type-units");
	    }
	  }

	  // Decide how to render forward declarations of template instantiations.
	  // SCE wants full descriptions, others just get them in the name.
	  if (DebuggerTuning == llvm::DebuggerKind::SCE)
	    CmdArgs.push_back("-debug-forward-template-params");

	  // Do we need to explicitly import anonymous namespaces into the parent
	  // scope?
	  if (DebuggerTuning == llvm::DebuggerKind::SCE)
	    CmdArgs.push_back("-dwarf-explicit-import");

	  RenderDebugInfoCompressionArgs(Args, CmdArgs, D, TC);
	}

	void Clang::ConstructJob(Compilation &C, const JobAction &JA,
	const InputInfo &Output, const InputInfoList &Inputs,
	const ArgList &Args, const char *LinkingOutput) const {
	const auto &TC = getToolChain();
	const llvm::Triple &RawTriple = TC.getTriple();
	const llvm::Triple &Triple = TC.getEffectiveTriple();
	const std::string &TripleStr = Triple.getTriple();

	bool KernelOrKext =
	Args.hasArg(options::OPT_mkernel, options::OPT_fapple_kext);
	const Driver &D = TC.getDriver();
	ArgStringList CmdArgs;

	// Check number of inputs for sanity. We need at least one input.
	assert(Inputs.size() >= 1 && "Must have at least one input.");
	// CUDA/HIP compilation may have multiple inputs (source file + results of
	// device-side compilations). OpenMP device jobs also take the host IR as a
	// second input. Module precompilation accepts a list of header files to
	// include as part of the module. All other jobs are expected to have exactly
	// one input.
	bool IsCuda = JA.isOffloading(Action::OFK_Cuda);
	bool IsHIP = JA.isOffloading(Action::OFK_HIP);
	bool IsOpenMPDevice = JA.isDeviceOffloading(Action::OFK_OpenMP);
	bool IsHeaderModulePrecompile = isa<HeaderModulePrecompileJobAction>(JA);

	// A header module compilation doesn't have a main input file, so invent a
	// fake one as a placeholder.
	const char *ModuleName = [&]{
	auto *ModuleNameArg = Args.getLastArg(options::OPT_fmodule_name_EQ);
	return ModuleNameArg ? ModuleNameArg->getValue() : "";
	}();
	InputInfo HeaderModuleInput(Inputs[0].getType(), ModuleName, ModuleName);

	const InputInfo &Input =
	IsHeaderModulePrecompile ? HeaderModuleInput : Inputs[0];

	InputInfoList ModuleHeaderInputs;
	const InputInfo *CudaDeviceInput = nullptr;
	const InputInfo *OpenMPDeviceInput = nullptr;
	for (const InputInfo &I : Inputs) {
	if (&I == &Input) {
	// This is the primary input.
	} else if (IsHeaderModulePrecompile &&
	types::getPrecompiledType(I.getType()) == types::TY_PCH) {
	types::ID Expected = HeaderModuleInput.getType();
	if (I.getType() != Expected) {
	D.Diag(diag::err_drv_module_header_wrong_kind)
	<< I.getFilename() << types::getTypeName(I.getType())
	<< types::getTypeName(Expected);
	}
	ModuleHeaderInputs.push_back(I);
	} else if ((IsCuda \|\| IsHIP) && !CudaDeviceInput) {
	CudaDeviceInput = &I;
	} else if (IsOpenMPDevice && !OpenMPDeviceInput) {
	OpenMPDeviceInput = &I;
	} else {
	llvm_unreachable("unexpectedly given multiple inputs");
	}
	}

	const llvm::Triple *AuxTriple =
	(IsCuda \|\| IsHIP) ? TC.getAuxTriple() : nullptr;
	bool IsWindowsMSVC = RawTriple.isWindowsMSVCEnvironment();
	bool IsIAMCU = RawTriple.isOSIAMCU();

	// Adjust IsWindowsXYZ for CUDA/HIP compilations. Even when compiling in
	// device mode (i.e., getToolchain().getTriple() is NVPTX/AMDGCN, not
	// Windows), we need to pass Windows-specific flags to cc1.
	if (IsCuda \|\| IsHIP)
	IsWindowsMSVC \|= AuxTriple && AuxTriple->isWindowsMSVCEnvironment();

	// C++ is not supported for IAMCU.
	if (IsIAMCU && types::isCXX(Input.getType()))
	D.Diag(diag::err_drv_clang_unsupported) << "C++ for IAMCU";

	// Invoke ourselves in -cc1 mode.
	//
	// FIXME: Implement custom jobs for internal actions.
	CmdArgs.push_back("-cc1");

	// Add the "effective" target triple.
	CmdArgs.push_back("-triple");
	CmdArgs.push_back(Args.MakeArgString(TripleStr));

	if (const Arg *MJ = Args.getLastArg(options::OPT_MJ)) {
	DumpCompilationDatabase(C, MJ->getValue(), TripleStr, Output, Input, Args);
	Args.ClaimAllArgs(options::OPT_MJ);
	} else if (const Arg *GenCDBFragment =
	Args.getLastArg(options::OPT_gen_cdb_fragment_path)) {
	DumpCompilationDatabaseFragmentToDir(GenCDBFragment->getValue(), C,
	TripleStr, Output, Input, Args);
	Args.ClaimAllArgs(options::OPT_gen_cdb_fragment_path);
	}

	if (IsCuda \|\| IsHIP) {
	// We have to pass the triple of the host if compiling for a CUDA/HIP device
	// and vice-versa.
	std::string NormalizedTriple;
	if (JA.isDeviceOffloading(Action::OFK_Cuda) \|\|
	JA.isDeviceOffloading(Action::OFK_HIP))
	NormalizedTriple = C.getSingleOffloadToolChain<Action::OFK_Host>()
	->getTriple()
	.normalize();
	else {
	// Host-side compilation.
	NormalizedTriple =
	(IsCuda ? C.getSingleOffloadToolChain<Action::OFK_Cuda>()
	: C.getSingleOffloadToolChain<Action::OFK_HIP>())
	->getTriple()
	.normalize();
	if (IsCuda) {
	// We need to figure out which CUDA version we're compiling for, as that
	// determines how we load and launch GPU kernels.
	auto CTC = static_cast<const toolchains::CudaToolChain >(
	C.getSingleOffloadToolChain<Action::OFK_Cuda>());
	assert(CTC && "Expected valid CUDA Toolchain.");
	if (CTC && CTC->CudaInstallation.version() != CudaVersion::UNKNOWN)
	CmdArgs.push_back(Args.MakeArgString(
	Twine("-target-sdk-version=") +
	CudaVersionToString(CTC->CudaInstallation.version())));
	}
	}
	CmdArgs.push_back("-aux-triple");
	CmdArgs.push_back(Args.MakeArgString(NormalizedTriple));
	}

	if (Args.hasFlag(options::OPT_fsycl, options::OPT_fno_sycl, false)) {
	CmdArgs.push_back("-fsycl");
	CmdArgs.push_back("-fsycl-is-device");

	if (Arg *A = Args.getLastArg(options::OPT_sycl_std_EQ)) {
	A->render(Args, CmdArgs);
	} else {
	// Ensure the default version in SYCL mode is 1.2.1 (aka 2017)
	CmdArgs.push_back("-sycl-std=2017");
	}
	}

	if (IsOpenMPDevice) {
	// We have to pass the triple of the host if compiling for an OpenMP device.
	std::string NormalizedTriple =
	C.getSingleOffloadToolChain<Action::OFK_Host>()
	->getTriple()
	.normalize();
	CmdArgs.push_back("-aux-triple");
	CmdArgs.push_back(Args.MakeArgString(NormalizedTriple));
	}

	if (Triple.isOSWindows() && (Triple.getArch() == llvm::Triple::arm \|\|
	Triple.getArch() == llvm::Triple::thumb)) {
	unsigned Offset = Triple.getArch() == llvm::Triple::arm ? 4 : 6;
	unsigned Version = 0;
	bool Failure =
	Triple.getArchName().substr(Offset).consumeInteger(10, Version);
	if (Failure \|\| Version < 7)
	D.Diag(diag::err_target_unsupported_arch) << Triple.getArchName()
	<< TripleStr;
	}

	// Push all default warning arguments that are specific to
	// the given target. These come before user provided warning options
	// are provided.
	TC.addClangWarningOptions(CmdArgs);

	// Select the appropriate action.
	RewriteKind rewriteKind = RK_None;

	// If CollectArgsForIntegratedAssembler() isn't called below, claim the args
	// it claims when not running an assembler. Otherwise, clang would emit
	// "argument unused" warnings for assembler flags when e.g. adding "-E" to
	// flags while debugging something. That'd be somewhat inconvenient, and it's
	// also inconsistent with most other flags -- we don't warn on
	// -ffunction-sections not being used in -E mode either for example, even
	// though it's not really used either.
	if (!isa<AssembleJobAction>(JA)) {
	// The args claimed here should match the args used in
	// CollectArgsForIntegratedAssembler().
	if (TC.useIntegratedAs()) {
	Args.ClaimAllArgs(options::OPT_mrelax_all);
	Args.ClaimAllArgs(options::OPT_mno_relax_all);
	Args.ClaimAllArgs(options::OPT_mincremental_linker_compatible);
	Args.ClaimAllArgs(options::OPT_mno_incremental_linker_compatible);
	switch (C.getDefaultToolChain().getArch()) {
	case llvm::Triple::arm:
	case llvm::Triple::armeb:
	case llvm::Triple::thumb:
	case llvm::Triple::thumbeb:
	Args.ClaimAllArgs(options::OPT_mimplicit_it_EQ);
	break;
	default:
	break;
	}
	}
	Args.ClaimAllArgs(options::OPT_Wa_COMMA);
	Args.ClaimAllArgs(options::OPT_Xassembler);
	}

	if (isa<AnalyzeJobAction>(JA)) {
	assert(JA.getType() == types::TY_Plist && "Invalid output type.");
	CmdArgs.push_back("-analyze");
	} else if (isa<MigrateJobAction>(JA)) {
	CmdArgs.push_back("-migrate");
	} else if (isa<PreprocessJobAction>(JA)) {
	if (Output.getType() == types::TY_Dependencies)
	CmdArgs.push_back("-Eonly");
	else {
	CmdArgs.push_back("-E");
	if (Args.hasArg(options::OPT_rewrite_objc) &&
	!Args.hasArg(options::OPT_g_Group))
	CmdArgs.push_back("-P");
	}
	} else if (isa<AssembleJobAction>(JA)) {
	CmdArgs.push_back("-emit-obj");

	CollectArgsForIntegratedAssembler(C, Args, CmdArgs, D);

	// Also ignore explicit -force_cpusubtype_ALL option.
	(void)Args.hasArg(options::OPT_force__cpusubtype__ALL);
	} else if (isa<PrecompileJobAction>(JA)) {
	if (JA.getType() == types::TY_Nothing)
	CmdArgs.push_back("-fsyntax-only");
	else if (JA.getType() == types::TY_ModuleFile)
	CmdArgs.push_back(IsHeaderModulePrecompile
	? "-emit-header-module"
	: "-emit-module-interface");
	else
	CmdArgs.push_back("-emit-pch");
	} else if (isa<VerifyPCHJobAction>(JA)) {
	CmdArgs.push_back("-verify-pch");
	} else {
	assert((isa<CompileJobAction>(JA) \|\| isa<BackendJobAction>(JA)) &&
	"Invalid action for clang tool.");
	if (JA.getType() == types::TY_Nothing) {
	CmdArgs.push_back("-fsyntax-only");
	} else if (JA.getType() == types::TY_LLVM_IR \|\|
	JA.getType() == types::TY_LTO_IR) {
	CmdArgs.push_back("-emit-llvm");
	} else if (JA.getType() == types::TY_LLVM_BC \|\|
	JA.getType() == types::TY_LTO_BC) {
	CmdArgs.push_back("-emit-llvm-bc");
	} else if (JA.getType() == types::TY_IFS \|\|
	JA.getType() == types::TY_IFS_CPP) {
	StringRef ArgStr =
	Args.hasArg(options::OPT_interface_stub_version_EQ)
	? Args.getLastArgValue(options::OPT_interface_stub_version_EQ)
	: "experimental-ifs-v2";
	CmdArgs.push_back("-emit-interface-stubs");
	CmdArgs.push_back(
	Args.MakeArgString(Twine("-interface-stub-version=") + ArgStr.str()));
	} else if (JA.getType() == types::TY_PP_Asm) {
	CmdArgs.push_back("-S");
	} else if (JA.getType() == types::TY_AST) {
	CmdArgs.push_back("-emit-pch");
	} else if (JA.getType() == types::TY_ModuleFile) {
	CmdArgs.push_back("-module-file-info");
	} else if (JA.getType() == types::TY_RewrittenObjC) {
	CmdArgs.push_back("-rewrite-objc");
	rewriteKind = RK_NonFragile;
	} else if (JA.getType() == types::TY_RewrittenLegacyObjC) {
	CmdArgs.push_back("-rewrite-objc");
	rewriteKind = RK_Fragile;
	} else {
	assert(JA.getType() == types::TY_PP_Asm && "Unexpected output type!");
	}

	// Preserve use-list order by default when emitting bitcode, so that
	// loading the bitcode up in 'opt' or 'llc' and running passes gives the
	// same result as running passes here. For LTO, we don't need to preserve
	// the use-list order, since serialization to bitcode is part of the flow.
	if (JA.getType() == types::TY_LLVM_BC)
	CmdArgs.push_back("-emit-llvm-uselists");

	// Device-side jobs do not support LTO.
	bool isDeviceOffloadAction = !(JA.isDeviceOffloading(Action::OFK_None) \|\|
	JA.isDeviceOffloading(Action::OFK_Host));

	if (D.isUsingLTO() && !isDeviceOffloadAction) {
	Args.AddLastArg(CmdArgs, options::OPT_flto, options::OPT_flto_EQ);
	CmdArgs.push_back("-flto-unit");
	}
	}

	if (const Arg *A = Args.getLastArg(options::OPT_fthinlto_index_EQ)) {
	if (!types::isLLVMIR(Input.getType()))
	D.Diag(diag::err_drv_arg_requires_bitcode_input) << A->getAsString(Args);
	Args.AddLastArg(CmdArgs, options::OPT_fthinlto_index_EQ);
	}

	if (Args.getLastArg(options::OPT_fthin_link_bitcode_EQ))
	Args.AddLastArg(CmdArgs, options::OPT_fthin_link_bitcode_EQ);

	if (Args.getLastArg(options::OPT_save_temps_EQ))
	Args.AddLastArg(CmdArgs, options::OPT_save_temps_EQ);

	// Embed-bitcode option.
	// Only white-listed flags below are allowed to be embedded.
	if (C.getDriver().embedBitcodeInObject() && !C.getDriver().isUsingLTO() &&
	(isa<BackendJobAction>(JA) \|\| isa<AssembleJobAction>(JA))) {
	// Add flags implied by -fembed-bitcode.
	Args.AddLastArg(CmdArgs, options::OPT_fembed_bitcode_EQ);
	// Disable all llvm IR level optimizations.
	CmdArgs.push_back("-disable-llvm-passes");

	// Render target options.
	TC.addClangTargetOptions(Args, CmdArgs, JA.getOffloadingDeviceKind());

	// reject options that shouldn't be supported in bitcode
	// also reject kernel/kext
	static const constexpr unsigned kBitcodeOptionBlacklist[] = {
	options::OPT_mkernel,
	options::OPT_fapple_kext,
	options::OPT_ffunction_sections,
	options::OPT_fno_function_sections,
	options::OPT_fdata_sections,
	options::OPT_fno_data_sections,
	options::OPT_fbasic_block_sections_EQ,
	options::OPT_funique_internal_linkage_names,
	options::OPT_fno_unique_internal_linkage_names,
	options::OPT_funique_section_names,
	options::OPT_fno_unique_section_names,
	options::OPT_funique_basic_block_section_names,
	options::OPT_fno_unique_basic_block_section_names,
	options::OPT_mrestrict_it,
	options::OPT_mno_restrict_it,
	options::OPT_mstackrealign,
	options::OPT_mno_stackrealign,
	options::OPT_mstack_alignment,
	options::OPT_mcmodel_EQ,
	options::OPT_mlong_calls,
	options::OPT_mno_long_calls,
	options::OPT_ggnu_pubnames,
	options::OPT_gdwarf_aranges,
	options::OPT_fdebug_types_section,
	options::OPT_fno_debug_types_section,
	options::OPT_fdwarf_directory_asm,
	options::OPT_fno_dwarf_directory_asm,
	options::OPT_mrelax_all,
	options::OPT_mno_relax_all,
	options::OPT_ftrap_function_EQ,
	options::OPT_ffixed_r9,
	options::OPT_mfix_cortex_a53_835769,
	options::OPT_mno_fix_cortex_a53_835769,
	options::OPT_ffixed_x18,
	options::OPT_mglobal_merge,
	options::OPT_mno_global_merge,
	options::OPT_mred_zone,
	options::OPT_mno_red_zone,
	options::OPT_Wa_COMMA,
	options::OPT_Xassembler,
	options::OPT_mllvm,
	};
	for (const auto &A : Args)
	if (llvm::find(kBitcodeOptionBlacklist, A->getOption().getID()) !=
	std::end(kBitcodeOptionBlacklist))
	D.Diag(diag::err_drv_unsupported_embed_bitcode) << A->getSpelling();

	// Render the CodeGen options that need to be passed.
	if (!Args.hasFlag(options::OPT_foptimize_sibling_calls,
	options::OPT_fno_optimize_sibling_calls))
	CmdArgs.push_back("-mdisable-tail-calls");

	RenderFloatingPointOptions(TC, D, isOptimizationLevelFast(Args), Args,
	CmdArgs, JA);

	// Render ABI arguments
	switch (TC.getArch()) {
	default: break;
	case llvm::Triple::arm:
	case llvm::Triple::armeb:
	case llvm::Triple::thumbeb:
	RenderARMABI(Triple, Args, CmdArgs);
	break;
	case llvm::Triple::aarch64:
	case llvm::Triple::aarch64_32:
	case llvm::Triple::aarch64_be:
	RenderAArch64ABI(Triple, Args, CmdArgs);
	break;
	}

	// Optimization level for CodeGen.
	if (const Arg *A = Args.getLastArg(options::OPT_O_Group)) {
	if (A->getOption().matches(options::OPT_O4)) {
	CmdArgs.push_back("-O3");
	D.Diag(diag::warn_O4_is_O3);
	} else {
	A->render(Args, CmdArgs);
	}
	}

	// Input/Output file.
	if (Output.getType() == types::TY_Dependencies) {
	// Handled with other dependency code.
	} else if (Output.isFilename()) {
	CmdArgs.push_back("-o");
	CmdArgs.push_back(Output.getFilename());
	} else {
	assert(Output.isNothing() && "Input output.");
	}

	for (const auto &II : Inputs) {
	addDashXForInput(Args, II, CmdArgs);
	if (II.isFilename())
	CmdArgs.push_back(II.getFilename());
	else
	II.getInputArg().renderAsInput(Args, CmdArgs);
	}

	C.addCommand(
	std::make_unique<Command>(JA, *this, ResponseFileSupport::AtFileUTF8(),
	D.getClangProgramPath(), CmdArgs, Inputs));
	return;
	}

	if (C.getDriver().embedBitcodeMarkerOnly() && !C.getDriver().isUsingLTO())
	CmdArgs.push_back("-fembed-bitcode=marker");

	// We normally speed up the clang process a bit by skipping destructors at
	// exit, but when we're generating diagnostics we can rely on some of the
	// cleanup.
	if (!C.isForDiagnostics())
	CmdArgs.push_back("-disable-free");

	#ifdef NDEBUG
	const bool IsAssertBuild = false;
	#else
	const bool IsAssertBuild = true;
	#endif

	// Disable the verification pass in -asserts builds.
	if (!IsAssertBuild)
	CmdArgs.push_back("-disable-llvm-verifier");

	// Discard value names by default in builds without assertions (NDEBUG),
	// unless otherwise specified.
	if (Args.hasFlag(options::OPT_fdiscard_value_names,
	options::OPT_fno_discard_value_names, !IsAssertBuild)) {
	if (Args.hasArg(options::OPT_fdiscard_value_names) &&
	(std::any_of(Inputs.begin(), Inputs.end(),
	[](const clang::driver::InputInfo &II) {
	return types::isLLVMIR(II.getType());
	}))) {
	D.Diag(diag::warn_ignoring_fdiscard_for_bitcode);
	}
	CmdArgs.push_back("-discard-value-names");
	}

	// Set the main file name, so that debug info works even with
	// -save-temps.
	CmdArgs.push_back("-main-file-name");
	CmdArgs.push_back(getBaseInputName(Args, Input));

	// Some flags which affect the language (via preprocessor
	// defines).
	if (Args.hasArg(options::OPT_static))
	CmdArgs.push_back("-static-define");

	if (Args.hasArg(options::OPT_municode))
	CmdArgs.push_back("-DUNICODE");

	if (isa<AnalyzeJobAction>(JA))
	RenderAnalyzerOptions(Args, CmdArgs, Triple, Input);

	if (isa<AnalyzeJobAction>(JA) \|\|
	(isa<PreprocessJobAction>(JA) && Args.hasArg(options::OPT__analyze)))
	CmdArgs.push_back("-setup-static-analyzer");

	// Enable compatibility mode to avoid analyzer-config related errors.
	// Since we can't access frontend flags through hasArg, let's manually iterate
	// through them.
	bool FoundAnalyzerConfig = false;
	for (auto Arg : Args.filtered(options::OPT_Xclang))
	if (StringRef(Arg->getValue()) == "-analyzer-config") {
	FoundAnalyzerConfig = true;
	break;
	}
	if (!FoundAnalyzerConfig)
	for (auto Arg : Args.filtered(options::OPT_Xanalyzer))
	if (StringRef(Arg->getValue()) == "-analyzer-config") {
	FoundAnalyzerConfig = true;
	break;
	}
	if (FoundAnalyzerConfig)
	CmdArgs.push_back("-analyzer-config-compatibility-mode=true");

	CheckCodeGenerationOptions(D, Args);

	unsigned FunctionAlignment = ParseFunctionAlignment(TC, Args);
	assert(FunctionAlignment <= 31 && "function alignment will be truncated!");
	if (FunctionAlignment) {
	CmdArgs.push_back("-function-alignment");
	CmdArgs.push_back(Args.MakeArgString(std::to_string(FunctionAlignment)));
	}

	llvm::Reloc::Model RelocationModel;
	unsigned PICLevel;
	bool IsPIE;
	std::tie(RelocationModel, PICLevel, IsPIE) = ParsePICArgs(TC, Args);

	bool IsROPI = RelocationModel == llvm::Reloc::ROPI \|\|
	RelocationModel == llvm::Reloc::ROPI_RWPI;
	bool IsRWPI = RelocationModel == llvm::Reloc::RWPI \|\|
	RelocationModel == llvm::Reloc::ROPI_RWPI;

	if (Args.hasArg(options::OPT_mcmse) &&
	!Args.hasArg(options::OPT_fallow_unsupported)) {
	if (IsROPI)
	D.Diag(diag::err_cmse_pi_are_incompatible) << IsROPI;
	if (IsRWPI)
	D.Diag(diag::err_cmse_pi_are_incompatible) << !IsRWPI;
	}

	if (IsROPI && types::isCXX(Input.getType()) &&
	!Args.hasArg(options::OPT_fallow_unsupported))
	D.Diag(diag::err_drv_ropi_incompatible_with_cxx);

	const char *RMName = RelocationModelName(RelocationModel);
	if (RMName) {
	CmdArgs.push_back("-mrelocation-model");
	CmdArgs.push_back(RMName);
	}
	if (PICLevel > 0) {
	CmdArgs.push_back("-pic-level");
	CmdArgs.push_back(PICLevel == 1 ? "1" : "2");
	if (IsPIE)
	CmdArgs.push_back("-pic-is-pie");
	}

	if (RelocationModel == llvm::Reloc::ROPI \|\|
	RelocationModel == llvm::Reloc::ROPI_RWPI)
	CmdArgs.push_back("-fropi");
	if (RelocationModel == llvm::Reloc::RWPI \|\|
	RelocationModel == llvm::Reloc::ROPI_RWPI)
	CmdArgs.push_back("-frwpi");

	if (Arg *A = Args.getLastArg(options::OPT_meabi)) {
	CmdArgs.push_back("-meabi");
	CmdArgs.push_back(A->getValue());
	}

	// The default is -fno-semantic-interposition. We render it just because we
	// require explicit -fno-semantic-interposition to infer dso_local.
	if (Arg *A = Args.getLastArg(options::OPT_fsemantic_interposition,
	options::OPT_fno_semantic_interposition))
	if (RelocationModel != llvm::Reloc::Static && !IsPIE)
	A->render(Args, CmdArgs);

	{
	std::string Model;
	if (Arg *A = Args.getLastArg(options::OPT_mthread_model)) {
	if (!TC.isThreadModelSupported(A->getValue()))
	D.Diag(diag::err_drv_invalid_thread_model_for_target)
	<< A->getValue() << A->getAsString(Args);
	Model = A->getValue();
	} else
	Model = TC.getThreadModel();
	if (Model != "posix") {
	CmdArgs.push_back("-mthread-model");
	CmdArgs.push_back(Args.MakeArgString(Model));
	}
	}

	Args.AddLastArg(CmdArgs, options::OPT_fveclib);

	if (Args.hasFlag(options::OPT_fmerge_all_constants,
	options::OPT_fno_merge_all_constants, false))
	CmdArgs.push_back("-fmerge-all-constants");

	if (Args.hasFlag(options::OPT_fno_delete_null_pointer_checks,
	options::OPT_fdelete_null_pointer_checks, false))
	CmdArgs.push_back("-fno-delete-null-pointer-checks");

	// LLVM Code Generator Options.

	if (Args.hasArg(options::OPT_frewrite_map_file) \|\|
	Args.hasArg(options::OPT_frewrite_map_file_EQ)) {
	for (const Arg *A : Args.filtered(options::OPT_frewrite_map_file,
	options::OPT_frewrite_map_file_EQ)) {
	StringRef Map = A->getValue();
	if (!llvm::sys::fs::exists(Map)) {
	D.Diag(diag::err_drv_no_such_file) << Map;
	} else {
	CmdArgs.push_back("-frewrite-map-file");
	CmdArgs.push_back(A->getValue());
	A->claim();
	}
	}
	}

	if (Arg *A = Args.getLastArg(options::OPT_Wframe_larger_than_EQ)) {
	StringRef v = A->getValue();
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back(Args.MakeArgString("-warn-stack-size=" + v));
	A->claim();
	}

	if (!Args.hasFlag(options::OPT_fjump_tables, options::OPT_fno_jump_tables,
	true))
	CmdArgs.push_back("-fno-jump-tables");

	if (Args.hasFlag(options::OPT_fprofile_sample_accurate,
	options::OPT_fno_profile_sample_accurate, false))
	CmdArgs.push_back("-fprofile-sample-accurate");

	if (!Args.hasFlag(options::OPT_fpreserve_as_comments,
	options::OPT_fno_preserve_as_comments, true))
	CmdArgs.push_back("-fno-preserve-as-comments");

	if (Arg *A = Args.getLastArg(options::OPT_mregparm_EQ)) {
	CmdArgs.push_back("-mregparm");
	CmdArgs.push_back(A->getValue());
	}

	if (Arg *A = Args.getLastArg(options::OPT_maix_struct_return,
	options::OPT_msvr4_struct_return)) {
	if (TC.getArch() != llvm::Triple::ppc) {
	D.Diag(diag::err_drv_unsupported_opt_for_target)
	<< A->getSpelling() << RawTriple.str();
	} else if (A->getOption().matches(options::OPT_maix_struct_return)) {
	CmdArgs.push_back("-maix-struct-return");
	} else {
	assert(A->getOption().matches(options::OPT_msvr4_struct_return));
	CmdArgs.push_back("-msvr4-struct-return");
	}
	}

	if (Arg *A = Args.getLastArg(options::OPT_fpcc_struct_return,
	options::OPT_freg_struct_return)) {
	if (TC.getArch() != llvm::Triple::x86) {
	D.Diag(diag::err_drv_unsupported_opt_for_target)
	<< A->getSpelling() << RawTriple.str();
	} else if (A->getOption().matches(options::OPT_fpcc_struct_return)) {
	CmdArgs.push_back("-fpcc-struct-return");
	} else {
	assert(A->getOption().matches(options::OPT_freg_struct_return));
	CmdArgs.push_back("-freg-struct-return");
	}
	}

	if (Args.hasFlag(options::OPT_mrtd, options::OPT_mno_rtd, false))
	CmdArgs.push_back("-fdefault-calling-conv=stdcall");

	if (Args.hasArg(options::OPT_fenable_matrix)) {
	// enable-matrix is needed by both the LangOpts and by LLVM.
	CmdArgs.push_back("-fenable-matrix");
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back("-enable-matrix");
	}

	CodeGenOptions::FramePointerKind FPKeepKind =
	getFramePointerKind(Args, RawTriple);
	const char *FPKeepKindStr = nullptr;
	switch (FPKeepKind) {
	case CodeGenOptions::FramePointerKind::None:
	FPKeepKindStr = "-mframe-pointer=none";
	break;
	case CodeGenOptions::FramePointerKind::NonLeaf:
	FPKeepKindStr = "-mframe-pointer=non-leaf";
	break;
	case CodeGenOptions::FramePointerKind::All:
	FPKeepKindStr = "-mframe-pointer=all";
	break;
	}
	assert(FPKeepKindStr && "unknown FramePointerKind");
	CmdArgs.push_back(FPKeepKindStr);

	if (!Args.hasFlag(options::OPT_fzero_initialized_in_bss,
	options::OPT_fno_zero_initialized_in_bss, true))
	CmdArgs.push_back("-fno-zero-initialized-in-bss");

	bool OFastEnabled = isOptimizationLevelFast(Args);
	// If -Ofast is the optimization level, then -fstrict-aliasing should be
	// enabled. This alias option is being used to simplify the hasFlag logic.
	OptSpecifier StrictAliasingAliasOption =
	OFastEnabled ? options::OPT_Ofast : options::OPT_fstrict_aliasing;
	// We turn strict aliasing off by default if we're in CL mode, since MSVC
	// doesn't do any TBAA.
	bool TBAAOnByDefault = !D.IsCLMode();
	if (!Args.hasFlag(options::OPT_fstrict_aliasing, StrictAliasingAliasOption,
	options::OPT_fno_strict_aliasing, TBAAOnByDefault))
	CmdArgs.push_back("-relaxed-aliasing");
	if (!Args.hasFlag(options::OPT_fstruct_path_tbaa,
	options::OPT_fno_struct_path_tbaa))
	CmdArgs.push_back("-no-struct-path-tbaa");
	if (Args.hasFlag(options::OPT_fstrict_enums, options::OPT_fno_strict_enums,
	false))
	CmdArgs.push_back("-fstrict-enums");
	if (!Args.hasFlag(options::OPT_fstrict_return, options::OPT_fno_strict_return,
	true))
	CmdArgs.push_back("-fno-strict-return");
	if (Args.hasFlag(options::OPT_fallow_editor_placeholders,
	options::OPT_fno_allow_editor_placeholders, false))
	CmdArgs.push_back("-fallow-editor-placeholders");
	if (Args.hasFlag(options::OPT_fstrict_vtable_pointers,
	options::OPT_fno_strict_vtable_pointers,
	false))
	CmdArgs.push_back("-fstrict-vtable-pointers");
	if (Args.hasFlag(options::OPT_fforce_emit_vtables,
	options::OPT_fno_force_emit_vtables,
	false))
	CmdArgs.push_back("-fforce-emit-vtables");
	if (!Args.hasFlag(options::OPT_foptimize_sibling_calls,
	options::OPT_fno_optimize_sibling_calls))
	CmdArgs.push_back("-mdisable-tail-calls");
	if (Args.hasFlag(options::OPT_fno_escaping_block_tail_calls,
	options::OPT_fescaping_block_tail_calls, false))
	CmdArgs.push_back("-fno-escaping-block-tail-calls");

	Args.AddLastArg(CmdArgs, options::OPT_ffine_grained_bitfield_accesses,
	options::OPT_fno_fine_grained_bitfield_accesses);

	// Handle segmented stacks.
	if (Args.hasArg(options::OPT_fsplit_stack))
	CmdArgs.push_back("-split-stacks");

	RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs, JA);

	if (Arg *A = Args.getLastArg(options::OPT_mdouble_EQ)) {
	if (TC.getArch() == llvm::Triple::avr)
	A->render(Args, CmdArgs);
	else
	D.Diag(diag::err_drv_unsupported_opt_for_target)
	<< A->getAsString(Args) << TripleStr;
	}

	if (Arg *A = Args.getLastArg(options::OPT_LongDouble_Group)) {
	if (TC.getTriple().isX86())
	A->render(Args, CmdArgs);
	else if ((TC.getArch() == llvm::Triple::ppc \|\| TC.getTriple().isPPC64()) &&
	(A->getOption().getID() != options::OPT_mlong_double_80))
	A->render(Args, CmdArgs);
	else
	D.Diag(diag::err_drv_unsupported_opt_for_target)
	<< A->getAsString(Args) << TripleStr;
	}

	// Decide whether to use verbose asm. Verbose assembly is the default on
	// toolchains which have the integrated assembler on by default.
	bool IsIntegratedAssemblerDefault = TC.IsIntegratedAssemblerDefault();
	if (!Args.hasFlag(options::OPT_fverbose_asm, options::OPT_fno_verbose_asm,
	IsIntegratedAssemblerDefault))
	CmdArgs.push_back("-fno-verbose-asm");

	if (!TC.useIntegratedAs())
	CmdArgs.push_back("-no-integrated-as");

	if (Args.hasArg(options::OPT_fdebug_pass_structure)) {
	CmdArgs.push_back("-mdebug-pass");
	CmdArgs.push_back("Structure");
	}
	if (Args.hasArg(options::OPT_fdebug_pass_arguments)) {
	CmdArgs.push_back("-mdebug-pass");
	CmdArgs.push_back("Arguments");
	}

	// Enable -mconstructor-aliases except on darwin, where we have to work around
	// a linker bug (see <rdar://problem/7651567>), and CUDA device code, where
	// aliases aren't supported. Similarly, aliases aren't yet supported for AIX.
	if (!RawTriple.isOSDarwin() && !RawTriple.isNVPTX() && !RawTriple.isOSAIX())
	CmdArgs.push_back("-mconstructor-aliases");

	// Darwin's kernel doesn't support guard variables; just die if we
	// try to use them.
	if (KernelOrKext && RawTriple.isOSDarwin())
	CmdArgs.push_back("-fforbid-guard-variables");

	if (Args.hasFlag(options::OPT_mms_bitfields, options::OPT_mno_ms_bitfields,
	Triple.isWindowsGNUEnvironment())) {
	CmdArgs.push_back("-mms-bitfields");
	}

	if (Args.hasFlag(options::OPT_mpie_copy_relocations,
	options::OPT_mno_pie_copy_relocations,
	false)) {
	CmdArgs.push_back("-mpie-copy-relocations");
	}

	if (Args.hasFlag(options::OPT_fno_plt, options::OPT_fplt, false)) {
	CmdArgs.push_back("-fno-plt");
	}

	// -fhosted is default.
	// TODO: Audit uses of KernelOrKext and see where it'd be more appropriate to
	// use Freestanding.
	bool Freestanding =
	Args.hasFlag(options::OPT_ffreestanding, options::OPT_fhosted, false) \|\|
	KernelOrKext;
	if (Freestanding)
	CmdArgs.push_back("-ffreestanding");

	// This is a coarse approximation of what llvm-gcc actually does, both
	// -fasynchronous-unwind-tables and -fnon-call-exceptions interact in more
	// complicated ways.
	bool AsynchronousUnwindTables =
	Args.hasFlag(options::OPT_fasynchronous_unwind_tables,
	options::OPT_fno_asynchronous_unwind_tables,
	(TC.IsUnwindTablesDefault(Args) \|\|
	TC.getSanitizerArgs().needsUnwindTables()) &&
	!Freestanding);
	if (Args.hasFlag(options::OPT_funwind_tables, options::OPT_fno_unwind_tables,
	AsynchronousUnwindTables))
	CmdArgs.push_back("-munwind-tables");

	// Prepare `-aux-target-cpu` and `-aux-target-feature` unless
	// `--gpu-use-aux-triple-only` is specified.
	if (!Args.getLastArg(options::OPT_gpu_use_aux_triple_only) &&
	((IsCuda && JA.isDeviceOffloading(Action::OFK_Cuda)) \|\|
	(IsHIP && JA.isDeviceOffloading(Action::OFK_HIP)))) {
	const ArgList &HostArgs =
	C.getArgsForToolChain(nullptr, StringRef(), Action::OFK_None);
	std::string HostCPU =
	getCPUName(HostArgs, TC.getAuxTriple(), /*FromAs*/ false);
	if (!HostCPU.empty()) {
	CmdArgs.push_back("-aux-target-cpu");
	CmdArgs.push_back(Args.MakeArgString(HostCPU));
	}
	getTargetFeatures(D, *TC.getAuxTriple(), HostArgs, CmdArgs,
	/*ForAS*/ false, /*IsAux*/ true);
	}

	TC.addClangTargetOptions(Args, CmdArgs, JA.getOffloadingDeviceKind());

	// FIXME: Handle -mtune=.
	(void)Args.hasArg(options::OPT_mtune_EQ);

	if (Arg *A = Args.getLastArg(options::OPT_mcmodel_EQ)) {
	StringRef CM = A->getValue();
	if (CM == "small" \|\| CM == "kernel" \|\| CM == "medium" \|\| CM == "large" \|\|
	CM == "tiny")
	A->render(Args, CmdArgs);
	else
	D.Diag(diag::err_drv_invalid_argument_to_option)
	<< CM << A->getOption().getName();
	}

	if (Arg *A = Args.getLastArg(options::OPT_mtls_size_EQ)) {
	StringRef Value = A->getValue();
	unsigned TLSSize = 0;
	Value.getAsInteger(10, TLSSize);
	if (!Triple.isAArch64() \|\| !Triple.isOSBinFormatELF())
	D.Diag(diag::err_drv_unsupported_opt_for_target)
	<< A->getOption().getName() << TripleStr;
	if (TLSSize != 12 && TLSSize != 24 && TLSSize != 32 && TLSSize != 48)
	D.Diag(diag::err_drv_invalid_int_value)
	<< A->getOption().getName() << Value;
	Args.AddLastArg(CmdArgs, options::OPT_mtls_size_EQ);
	}

	// Add the target cpu
	std::string CPU = getCPUName(Args, Triple, /*FromAs*/ false);
	if (!CPU.empty()) {
	CmdArgs.push_back("-target-cpu");
	CmdArgs.push_back(Args.MakeArgString(CPU));
	}

	RenderTargetOptions(Triple, Args, KernelOrKext, CmdArgs);

	// These two are potentially updated by AddClangCLArgs.
	codegenoptions::DebugInfoKind DebugInfoKind = codegenoptions::NoDebugInfo;
	bool EmitCodeView = false;

	// Add clang-cl arguments.
	types::ID InputType = Input.getType();
	if (D.IsCLMode())
	AddClangCLArgs(Args, InputType, CmdArgs, &DebugInfoKind, &EmitCodeView);

	DwarfFissionKind DwarfFission;
	RenderDebugOptions(TC, D, RawTriple, Args, EmitCodeView, CmdArgs,
	DebugInfoKind, DwarfFission);

	// Add the split debug info name to the command lines here so we
	// can propagate it to the backend.
	bool SplitDWARF = (DwarfFission != DwarfFissionKind::None) &&
	TC.getTriple().isOSBinFormatELF() &&
	(isa<AssembleJobAction>(JA) \|\| isa<CompileJobAction>(JA) \|\|
	isa<BackendJobAction>(JA));
	if (SplitDWARF) {
	const char *SplitDWARFOut = SplitDebugName(Args, Input, Output);
	CmdArgs.push_back("-split-dwarf-file");
	CmdArgs.push_back(SplitDWARFOut);
	if (DwarfFission == DwarfFissionKind::Split) {
	CmdArgs.push_back("-split-dwarf-output");
	CmdArgs.push_back(SplitDWARFOut);
	}
	}

	// Pass the linker version in use.
	if (Arg *A = Args.getLastArg(options::OPT_mlinker_version_EQ)) {
	CmdArgs.push_back("-target-linker-version");
	CmdArgs.push_back(A->getValue());
	}

	// Explicitly error on some things we know we don't support and can't just
	// ignore.
	if (!Args.hasArg(options::OPT_fallow_unsupported)) {
	Arg *Unsupported;
	if (types::isCXX(InputType) && RawTriple.isOSDarwin() &&
	TC.getArch() == llvm::Triple::x86) {
	if ((Unsupported = Args.getLastArg(options::OPT_fapple_kext)) \|\|
	(Unsupported = Args.getLastArg(options::OPT_mkernel)))
	D.Diag(diag::err_drv_clang_unsupported_opt_cxx_darwin_i386)
	<< Unsupported->getOption().getName();
	}
	// The faltivec option has been superseded by the maltivec option.
	if ((Unsupported = Args.getLastArg(options::OPT_faltivec)))
	D.Diag(diag::err_drv_clang_unsupported_opt_faltivec)
	<< Unsupported->getOption().getName()
	<< "please use -maltivec and include altivec.h explicitly";
	if ((Unsupported = Args.getLastArg(options::OPT_fno_altivec)))
	D.Diag(diag::err_drv_clang_unsupported_opt_faltivec)
	<< Unsupported->getOption().getName() << "please use -mno-altivec";
	}

	Args.AddAllArgs(CmdArgs, options::OPT_v);

	if (Args.getLastArg(options::OPT_H)) {
	CmdArgs.push_back("-H");
	CmdArgs.push_back("-sys-header-deps");
	}

	if (D.CCPrintHeaders && !D.CCGenDiagnostics) {
	CmdArgs.push_back("-header-include-file");
	CmdArgs.push_back(D.CCPrintHeadersFilename ? D.CCPrintHeadersFilename
	: "-");
	CmdArgs.push_back("-sys-header-deps");
	}
	Args.AddLastArg(CmdArgs, options::OPT_P);
	Args.AddLastArg(CmdArgs, options::OPT_print_ivar_layout);

	if (D.CCLogDiagnostics && !D.CCGenDiagnostics) {
	CmdArgs.push_back("-diagnostic-log-file");
	CmdArgs.push_back(D.CCLogDiagnosticsFilename ? D.CCLogDiagnosticsFilename
	: "-");
	}

	// Give the gen diagnostics more chances to succeed, by avoiding intentional
	// crashes.
	if (D.CCGenDiagnostics)
	CmdArgs.push_back("-disable-pragma-debug-crash");

	bool UseSeparateSections = isUseSeparateSections(Triple);

	if (Args.hasFlag(options::OPT_ffunction_sections,
	options::OPT_fno_function_sections, UseSeparateSections)) {
	CmdArgs.push_back("-ffunction-sections");
	}

	if (Arg *A = Args.getLastArg(options::OPT_fbasic_block_sections_EQ)) {
	StringRef Val = A->getValue();
	if (Val != "all" && Val != "labels" && Val != "none" &&
	!(Val.startswith("list=") && llvm::sys::fs::exists(Val.substr(5))))
	D.Diag(diag::err_drv_invalid_value)
	<< A->getAsString(Args) << A->getValue();
	else
	A->render(Args, CmdArgs);
	}

	if (Args.hasFlag(options::OPT_fdata_sections, options::OPT_fno_data_sections,
	UseSeparateSections)) {
	CmdArgs.push_back("-fdata-sections");
	}

	if (!Args.hasFlag(options::OPT_funique_section_names,
	options::OPT_fno_unique_section_names, true))
	CmdArgs.push_back("-fno-unique-section-names");

	if (Args.hasFlag(options::OPT_funique_internal_linkage_names,
	options::OPT_fno_unique_internal_linkage_names, false))
	CmdArgs.push_back("-funique-internal-linkage-names");

	if (Args.hasFlag(options::OPT_funique_basic_block_section_names,
	options::OPT_fno_unique_basic_block_section_names, false))
	CmdArgs.push_back("-funique-basic-block-section-names");

	Args.AddLastArg(CmdArgs, options::OPT_finstrument_functions,
	options::OPT_finstrument_functions_after_inlining,
	options::OPT_finstrument_function_entry_bare);

	// NVPTX/AMDGCN doesn't support PGO or coverage. There's no runtime support
	// for sampling, overhead of call arc collection is way too high and there's
	// no way to collect the output.
	if (!Triple.isNVPTX() && !Triple.isAMDGCN())
	addPGOAndCoverageFlags(TC, C, D, Output, Args, CmdArgs);

	Args.AddLastArg(CmdArgs, options::OPT_fclang_abi_compat_EQ);

	// Add runtime flag for PS4 when PGO, coverage, or sanitizers are enabled.
	if (RawTriple.isPS4CPU() &&
	!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) {
	PS4cpu::addProfileRTArgs(TC, Args, CmdArgs);
	PS4cpu::addSanitizerArgs(TC, CmdArgs);
	}

	// Pass options for controlling the default header search paths.
	if (Args.hasArg(options::OPT_nostdinc)) {
	CmdArgs.push_back("-nostdsysteminc");
	CmdArgs.push_back("-nobuiltininc");
	} else {
	if (Args.hasArg(options::OPT_nostdlibinc))
	CmdArgs.push_back("-nostdsysteminc");
	Args.AddLastArg(CmdArgs, options::OPT_nostdincxx);
	Args.AddLastArg(CmdArgs, options::OPT_nobuiltininc);
	}

	// Pass the path to compiler resource files.
	CmdArgs.push_back("-resource-dir");
	CmdArgs.push_back(D.ResourceDir.c_str());

	Args.AddLastArg(CmdArgs, options::OPT_working_directory);

	RenderARCMigrateToolOptions(D, Args, CmdArgs);

	// Add preprocessing options like -I, -D, etc. if we are using the
	// preprocessor.
	//
	// FIXME: Support -fpreprocessed
	if (types::getPreprocessedType(InputType) != types::TY_INVALID)
	AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs);

	// Don't warn about "clang -c -DPIC -fPIC test.i" because libtool.m4 assumes
	// that "The compiler can only warn and ignore the option if not recognized".
	// When building with ccache, it will pass -D options to clang even on
	// preprocessed inputs and configure concludes that -fPIC is not supported.
	Args.ClaimAllArgs(options::OPT_D);

	// Manually translate -O4 to -O3; let clang reject others.
	if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
	if (A->getOption().matches(options::OPT_O4)) {
	CmdArgs.push_back("-O3");
	D.Diag(diag::warn_O4_is_O3);
	} else {
	A->render(Args, CmdArgs);
	}
	}

	// Warn about ignored options to clang.
	for (const Arg *A :
	Args.filtered(options::OPT_clang_ignored_gcc_optimization_f_Group)) {
	D.Diag(diag::warn_ignored_gcc_optimization) << A->getAsString(Args);
	A->claim();
	}

	for (const Arg *A :
	Args.filtered(options::OPT_clang_ignored_legacy_options_Group)) {
	D.Diag(diag::warn_ignored_clang_option) << A->getAsString(Args);
	A->claim();
	}

	claimNoWarnArgs(Args);

	Args.AddAllArgs(CmdArgs, options::OPT_R_Group);

	Args.AddAllArgs(CmdArgs, options::OPT_W_Group);
	if (Args.hasFlag(options::OPT_pedantic, options::OPT_no_pedantic, false))
	CmdArgs.push_back("-pedantic");
	Args.AddLastArg(CmdArgs, options::OPT_pedantic_errors);
	Args.AddLastArg(CmdArgs, options::OPT_w);

	// Fixed point flags
	if (Args.hasFlag(options::OPT_ffixed_point, options::OPT_fno_fixed_point,
	/*Default=*/false))
	Args.AddLastArg(CmdArgs, options::OPT_ffixed_point);

	// Handle -{std, ansi, trigraphs} -- take the last of -{std, ansi}
	// (-ansi is equivalent to -std=c89 or -std=c++98).
	//
	// If a std is supplied, only add -trigraphs if it follows the
	// option.
	bool ImplyVCPPCXXVer = false;
	const Arg *Std = Args.getLastArg(options::OPT_std_EQ, options::OPT_ansi);
	if (Std) {
	if (Std->getOption().matches(options::OPT_ansi))
	if (types::isCXX(InputType))
	CmdArgs.push_back("-std=c++98");
	else
	CmdArgs.push_back("-std=c89");
	else
	Std->render(Args, CmdArgs);

	// If -f(no-)trigraphs appears after the language standard flag, honor it.
	if (Arg *A = Args.getLastArg(options::OPT_std_EQ, options::OPT_ansi,
	options::OPT_ftrigraphs,
	options::OPT_fno_trigraphs))
	if (A != Std)
	A->render(Args, CmdArgs);
	} else {
	// Honor -std-default.
	//
	// FIXME: Clang doesn't correctly handle -std= when the input language
	// doesn't match. For the time being just ignore this for C++ inputs;
	// eventually we want to do all the standard defaulting here instead of
	// splitting it between the driver and clang -cc1.
	if (!types::isCXX(InputType))
	Args.AddAllArgsTranslated(CmdArgs, options::OPT_std_default_EQ, "-std=",
	/*Joined=*/true);
	else if (IsWindowsMSVC)
	ImplyVCPPCXXVer = true;

	Args.AddLastArg(CmdArgs, options::OPT_ftrigraphs,
	options::OPT_fno_trigraphs);

	// HIP headers have minimum C++ standard requirements. Therefore set the
	// default language standard.
	if (IsHIP)
	CmdArgs.push_back(IsWindowsMSVC ? "-std=c++14" : "-std=c++11");
	}

	// GCC's behavior for -Wwrite-strings is a bit strange:
	// * In C, this "warning flag" changes the types of string literals from
	// 'char[N]' to 'const char[N]', and thus triggers an unrelated warning
	// for the discarded qualifier.
	// * In C++, this is just a normal warning flag.
	//
	// Implementing this warning correctly in C is hard, so we follow GCC's
	// behavior for now. FIXME: Directly diagnose uses of a string literal as
	// a non-const char* in C, rather than using this crude hack.
	if (!types::isCXX(InputType)) {
	// FIXME: This should behave just like a warning flag, and thus should also
	// respect -Weverything, -Wno-everything, -Werror=write-strings, and so on.
	Arg *WriteStrings =
	Args.getLastArg(options::OPT_Wwrite_strings,
	options::OPT_Wno_write_strings, options::OPT_w);
	if (WriteStrings &&
	WriteStrings->getOption().matches(options::OPT_Wwrite_strings))
	CmdArgs.push_back("-fconst-strings");
	}

	// GCC provides a macro definition '__DEPRECATED' when -Wdeprecated is active
	// during C++ compilation, which it is by default. GCC keeps this define even
	// in the presence of '-w', match this behavior bug-for-bug.
	if (types::isCXX(InputType) &&
	Args.hasFlag(options::OPT_Wdeprecated, options::OPT_Wno_deprecated,
	true)) {
	CmdArgs.push_back("-fdeprecated-macro");
	}

	// Translate GCC's misnamed '-fasm' arguments to '-fgnu-keywords'.
	if (Arg *Asm = Args.getLastArg(options::OPT_fasm, options::OPT_fno_asm)) {
	if (Asm->getOption().matches(options::OPT_fasm))
	CmdArgs.push_back("-fgnu-keywords");
	else
	CmdArgs.push_back("-fno-gnu-keywords");
	}

	if (ShouldDisableDwarfDirectory(Args, TC))
	CmdArgs.push_back("-fno-dwarf-directory-asm");

	if (!ShouldEnableAutolink(Args, TC, JA))
	CmdArgs.push_back("-fno-autolink");

	// Add in -fdebug-compilation-dir if necessary.
	addDebugCompDirArg(Args, CmdArgs, D.getVFS());

	addDebugPrefixMapArg(D, Args, CmdArgs);

	if (Arg *A = Args.getLastArg(options::OPT_ftemplate_depth_,
	options::OPT_ftemplate_depth_EQ)) {
	CmdArgs.push_back("-ftemplate-depth");
	CmdArgs.push_back(A->getValue());
	}

	if (Arg *A = Args.getLastArg(options::OPT_foperator_arrow_depth_EQ)) {
	CmdArgs.push_back("-foperator-arrow-depth");
	CmdArgs.push_back(A->getValue());
	}

	if (Arg *A = Args.getLastArg(options::OPT_fconstexpr_depth_EQ)) {
	CmdArgs.push_back("-fconstexpr-depth");
	CmdArgs.push_back(A->getValue());
	}

	if (Arg *A = Args.getLastArg(options::OPT_fconstexpr_steps_EQ)) {
	CmdArgs.push_back("-fconstexpr-steps");
	CmdArgs.push_back(A->getValue());
	}

	if (Args.hasArg(options::OPT_fexperimental_new_constant_interpreter))
	CmdArgs.push_back("-fexperimental-new-constant-interpreter");

	if (Arg *A = Args.getLastArg(options::OPT_fbracket_depth_EQ)) {
	CmdArgs.push_back("-fbracket-depth");
	CmdArgs.push_back(A->getValue());
	}

	if (Arg *A = Args.getLastArg(options::OPT_Wlarge_by_value_copy_EQ,
	options::OPT_Wlarge_by_value_copy_def)) {
	if (A->getNumValues()) {
	StringRef bytes = A->getValue();
	CmdArgs.push_back(Args.MakeArgString("-Wlarge-by-value-copy=" + bytes));
	} else
	CmdArgs.push_back("-Wlarge-by-value-copy=64"); // default value
	}

	if (Args.hasArg(options::OPT_relocatable_pch))
	CmdArgs.push_back("-relocatable-pch");

	if (const Arg *A = Args.getLastArg(options::OPT_fcf_runtime_abi_EQ)) {
	static const char *kCFABIs[] = {
	"standalone", "objc", "swift", "swift-5.0", "swift-4.2", "swift-4.1",
	};

	if (find(kCFABIs, StringRef(A->getValue())) == std::end(kCFABIs))
	D.Diag(diag::err_drv_invalid_cf_runtime_abi) << A->getValue();
	else
	A->render(Args, CmdArgs);
	}

	if (Arg *A = Args.getLastArg(options::OPT_fconstant_string_class_EQ)) {
	CmdArgs.push_back("-fconstant-string-class");
	CmdArgs.push_back(A->getValue());
	}

	if (Arg *A = Args.getLastArg(options::OPT_ftabstop_EQ)) {
	CmdArgs.push_back("-ftabstop");
	CmdArgs.push_back(A->getValue());
	}

	if (Args.hasFlag(options::OPT_fstack_size_section,
	options::OPT_fno_stack_size_section, RawTriple.isPS4()))
	CmdArgs.push_back("-fstack-size-section");

	CmdArgs.push_back("-ferror-limit");
	if (Arg *A = Args.getLastArg(options::OPT_ferror_limit_EQ))
	CmdArgs.push_back(A->getValue());
	else
	CmdArgs.push_back("19");

	if (Arg *A = Args.getLastArg(options::OPT_fmacro_backtrace_limit_EQ)) {
	CmdArgs.push_back("-fmacro-backtrace-limit");
	CmdArgs.push_back(A->getValue());
	}

	if (Arg *A = Args.getLastArg(options::OPT_ftemplate_backtrace_limit_EQ)) {
	CmdArgs.push_back("-ftemplate-backtrace-limit");
	CmdArgs.push_back(A->getValue());
	}

	if (Arg *A = Args.getLastArg(options::OPT_fconstexpr_backtrace_limit_EQ)) {
	CmdArgs.push_back("-fconstexpr-backtrace-limit");
	CmdArgs.push_back(A->getValue());
	}

	if (Arg *A = Args.getLastArg(options::OPT_fspell_checking_limit_EQ)) {
	CmdArgs.push_back("-fspell-checking-limit");
	CmdArgs.push_back(A->getValue());
	}

	// Pass -fmessage-length=.
	unsigned MessageLength = 0;
	if (Arg *A = Args.getLastArg(options::OPT_fmessage_length_EQ)) {
	StringRef V(A->getValue());
	if (V.getAsInteger(0, MessageLength))
	D.Diag(diag::err_drv_invalid_argument_to_option)
	<< V << A->getOption().getName();
	} else {
	// If -fmessage-length=N was not specified, determine whether this is a
	// terminal and, if so, implicitly define -fmessage-length appropriately.
	MessageLength = llvm::sys::Process::StandardErrColumns();
	}
	if (MessageLength != 0)
	CmdArgs.push_back(
	Args.MakeArgString("-fmessage-length=" + Twine(MessageLength)));

	// -fvisibility= and -fvisibility-ms-compat are of a piece.
	if (const Arg *A = Args.getLastArg(options::OPT_fvisibility_EQ,
	options::OPT_fvisibility_ms_compat)) {
	if (A->getOption().matches(options::OPT_fvisibility_EQ)) {
	CmdArgs.push_back("-fvisibility");
	CmdArgs.push_back(A->getValue());
	} else {
	assert(A->getOption().matches(options::OPT_fvisibility_ms_compat));
	CmdArgs.push_back("-fvisibility");
	CmdArgs.push_back("hidden");
	CmdArgs.push_back("-ftype-visibility");
	CmdArgs.push_back("default");
	}
	}

	Args.AddLastArg(CmdArgs, options::OPT_fvisibility_inlines_hidden);
	Args.AddLastArg(CmdArgs, options::OPT_fvisibility_global_new_delete_hidden);

	Args.AddLastArg(CmdArgs, options::OPT_ftlsmodel_EQ);

	// Forward -f (flag) options which we can pass directly.
	Args.AddLastArg(CmdArgs, options::OPT_femit_all_decls);
	Args.AddLastArg(CmdArgs, options::OPT_fheinous_gnu_extensions);
	Args.AddLastArg(CmdArgs, options::OPT_fdigraphs, options::OPT_fno_digraphs);
	Args.AddLastArg(CmdArgs, options::OPT_fno_operator_names);
	Args.AddLastArg(CmdArgs, options::OPT_femulated_tls,
	options::OPT_fno_emulated_tls);

	// AltiVec-like language extensions aren't relevant for assembling.
	if (!isa<PreprocessJobAction>(JA) \|\| Output.getType() != types::TY_PP_Asm)
	Args.AddLastArg(CmdArgs, options::OPT_fzvector);

	Args.AddLastArg(CmdArgs, options::OPT_fdiagnostics_show_template_tree);
	Args.AddLastArg(CmdArgs, options::OPT_fno_elide_type);

	// Forward flags for OpenMP. We don't do this if the current action is a
	// device offloading action other than OpenMP.
	if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
	options::OPT_fno_openmp, false) &&
	(JA.isDeviceOffloading(Action::OFK_None) \|\|
	JA.isDeviceOffloading(Action::OFK_OpenMP))) {
	switch (D.getOpenMPRuntime(Args)) {
	case Driver::OMPRT_OMP:
	case Driver::OMPRT_IOMP5:
	// Clang can generate useful OpenMP code for these two runtime libraries.
	CmdArgs.push_back("-fopenmp");

	// If no option regarding the use of TLS in OpenMP codegeneration is
	// given, decide a default based on the target. Otherwise rely on the
	// options and pass the right information to the frontend.
	if (!Args.hasFlag(options::OPT_fopenmp_use_tls,
	options::OPT_fnoopenmp_use_tls, /Default=/true))
	CmdArgs.push_back("-fnoopenmp-use-tls");
	Args.AddLastArg(CmdArgs, options::OPT_fopenmp_simd,
	options::OPT_fno_openmp_simd);
	Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_enable_irbuilder);
	Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_version_EQ);
	Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_number_of_sm_EQ);
	Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_blocks_per_sm_EQ);
	Args.AddAllArgs(CmdArgs,
	options::OPT_fopenmp_cuda_teams_reduction_recs_num_EQ);
	if (Args.hasFlag(options::OPT_fopenmp_optimistic_collapse,
	options::OPT_fno_openmp_optimistic_collapse,
	/Default=/false))
	CmdArgs.push_back("-fopenmp-optimistic-collapse");

	// When in OpenMP offloading mode with NVPTX target, forward
	// cuda-mode flag
	if (Args.hasFlag(options::OPT_fopenmp_cuda_mode,
	options::OPT_fno_openmp_cuda_mode, /Default=/false))
	CmdArgs.push_back("-fopenmp-cuda-mode");

	// When in OpenMP offloading mode with NVPTX target, forward
	// cuda-parallel-target-regions flag
	if (Args.hasFlag(options::OPT_fopenmp_cuda_parallel_target_regions,
	options::OPT_fno_openmp_cuda_parallel_target_regions,
	/Default=/true))
	CmdArgs.push_back("-fopenmp-cuda-parallel-target-regions");

	// When in OpenMP offloading mode with NVPTX target, check if full runtime
	// is required.
	if (Args.hasFlag(options::OPT_fopenmp_cuda_force_full_runtime,
	options::OPT_fno_openmp_cuda_force_full_runtime,
	/Default=/false))
	CmdArgs.push_back("-fopenmp-cuda-force-full-runtime");
	break;
	default:
	// By default, if Clang doesn't know how to generate useful OpenMP code
	// for a specific runtime library, we just don't pass the '-fopenmp' flag
	// down to the actual compilation.
	// FIXME: It would be better to have a mode which only omits IR
	// generation based on the OpenMP support so that we get consistent
	// semantic analysis, etc.
	break;
	}
	} else {
	Args.AddLastArg(CmdArgs, options::OPT_fopenmp_simd,
	options::OPT_fno_openmp_simd);
	Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_version_EQ);
	}

	const SanitizerArgs &Sanitize = TC.getSanitizerArgs();
	Sanitize.addArgs(TC, Args, CmdArgs, InputType);

	const XRayArgs &XRay = TC.getXRayArgs();
	XRay.addArgs(TC, Args, CmdArgs, InputType);

	if (Arg *A = Args.getLastArg(options::OPT_fpatchable_function_entry_EQ)) {
	StringRef S0 = A->getValue(), S = S0;
	unsigned Size, Offset = 0;
	if (!Triple.isAArch64() && Triple.getArch() != llvm::Triple::x86 &&
	Triple.getArch() != llvm::Triple::x86_64)
	D.Diag(diag::err_drv_unsupported_opt_for_target)
	<< A->getAsString(Args) << TripleStr;
	else if (S.consumeInteger(10, Size) \|\|
	(!S.empty() && (!S.consume_front(",") \|\|
	S.consumeInteger(10, Offset) \|\| !S.empty())))
	D.Diag(diag::err_drv_invalid_argument_to_option)
	<< S0 << A->getOption().getName();
	else if (Size < Offset)
	D.Diag(diag::err_drv_unsupported_fpatchable_function_entry_argument);
	else {
	CmdArgs.push_back(Args.MakeArgString(A->getSpelling() + Twine(Size)));
	CmdArgs.push_back(Args.MakeArgString(
	"-fpatchable-function-entry-offset=" + Twine(Offset)));
	}
	}

	if (TC.SupportsProfiling()) {
	Args.AddLastArg(CmdArgs, options::OPT_pg);

	llvm::Triple::ArchType Arch = TC.getArch();
	if (Arg *A = Args.getLastArg(options::OPT_mfentry)) {
	if (Arch == llvm::Triple::systemz \|\| TC.getTriple().isX86())
	A->render(Args, CmdArgs);
	else
	D.Diag(diag::err_drv_unsupported_opt_for_target)
	<< A->getAsString(Args) << TripleStr;
	}
	if (Arg *A = Args.getLastArg(options::OPT_mnop_mcount)) {
	if (Arch == llvm::Triple::systemz)
	A->render(Args, CmdArgs);
	else
	D.Diag(diag::err_drv_unsupported_opt_for_target)
	<< A->getAsString(Args) << TripleStr;
	}
	if (Arg *A = Args.getLastArg(options::OPT_mrecord_mcount)) {
	if (Arch == llvm::Triple::systemz)
	A->render(Args, CmdArgs);
	else
	D.Diag(diag::err_drv_unsupported_opt_for_target)
	<< A->getAsString(Args) << TripleStr;
	}
	}

	if (Args.getLastArg(options::OPT_fapple_kext) \|\|
	(Args.hasArg(options::OPT_mkernel) && types::isCXX(InputType)))
	CmdArgs.push_back("-fapple-kext");

	Args.AddLastArg(CmdArgs, options::OPT_flax_vector_conversions_EQ);
	Args.AddLastArg(CmdArgs, options::OPT_fobjc_sender_dependent_dispatch);
	Args.AddLastArg(CmdArgs, options::OPT_fdiagnostics_print_source_range_info);
	Args.AddLastArg(CmdArgs, options::OPT_fdiagnostics_parseable_fixits);
	Args.AddLastArg(CmdArgs, options::OPT_ftime_report);
	Args.AddLastArg(CmdArgs, options::OPT_ftime_trace);
	Args.AddLastArg(CmdArgs, options::OPT_ftime_trace_granularity_EQ);
	Args.AddLastArg(CmdArgs, options::OPT_ftrapv);
	Args.AddLastArg(CmdArgs, options::OPT_malign_double);
	Args.AddLastArg(CmdArgs, options::OPT_fno_temp_file);

	if (Arg *A = Args.getLastArg(options::OPT_ftrapv_handler_EQ)) {
	CmdArgs.push_back("-ftrapv-handler");
	CmdArgs.push_back(A->getValue());
	}

	Args.AddLastArg(CmdArgs, options::OPT_ftrap_function_EQ);

	// -fno-strict-overflow implies -fwrapv if it isn't disabled, but
	// -fstrict-overflow won't turn off an explicitly enabled -fwrapv.
	if (Arg *A = Args.getLastArg(options::OPT_fwrapv, options::OPT_fno_wrapv)) {
	if (A->getOption().matches(options::OPT_fwrapv))
	CmdArgs.push_back("-fwrapv");
	} else if (Arg *A = Args.getLastArg(options::OPT_fstrict_overflow,
	options::OPT_fno_strict_overflow)) {
	if (A->getOption().matches(options::OPT_fno_strict_overflow))
	CmdArgs.push_back("-fwrapv");
	}

	if (Arg *A = Args.getLastArg(options::OPT_freroll_loops,
	options::OPT_fno_reroll_loops))
	if (A->getOption().matches(options::OPT_freroll_loops))
	CmdArgs.push_back("-freroll-loops");

	Args.AddLastArg(CmdArgs, options::OPT_fwritable_strings);
	Args.AddLastArg(CmdArgs, options::OPT_funroll_loops,
	options::OPT_fno_unroll_loops);

	Args.AddLastArg(CmdArgs, options::OPT_pthread);

	if (Args.hasFlag(options::OPT_mspeculative_load_hardening,
	options::OPT_mno_speculative_load_hardening, false))
	CmdArgs.push_back(Args.MakeArgString("-mspeculative-load-hardening"));

	RenderSSPOptions(TC, Args, CmdArgs, KernelOrKext);
	RenderSCPOptions(TC, Args, CmdArgs);
	RenderTrivialAutoVarInitOptions(D, TC, Args, CmdArgs);

	// Translate -mstackrealign
	if (Args.hasFlag(options::OPT_mstackrealign, options::OPT_mno_stackrealign,
	false))
	CmdArgs.push_back(Args.MakeArgString("-mstackrealign"));

	if (Args.hasArg(options::OPT_mstack_alignment)) {
	StringRef alignment = Args.getLastArgValue(options::OPT_mstack_alignment);
	CmdArgs.push_back(Args.MakeArgString("-mstack-alignment=" + alignment));
	}

	if (Args.hasArg(options::OPT_mstack_probe_size)) {
	StringRef Size = Args.getLastArgValue(options::OPT_mstack_probe_size);

	if (!Size.empty())
	CmdArgs.push_back(Args.MakeArgString("-mstack-probe-size=" + Size));
	else
	CmdArgs.push_back("-mstack-probe-size=0");
	}

	if (!Args.hasFlag(options::OPT_mstack_arg_probe,
	options::OPT_mno_stack_arg_probe, true))
	CmdArgs.push_back(Args.MakeArgString("-mno-stack-arg-probe"));

	if (Arg *A = Args.getLastArg(options::OPT_mrestrict_it,
	options::OPT_mno_restrict_it)) {
	if (A->getOption().matches(options::OPT_mrestrict_it)) {
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back("-arm-restrict-it");
	} else {
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back("-arm-no-restrict-it");
	}
	} else if (Triple.isOSWindows() &&
	(Triple.getArch() == llvm::Triple::arm \|\|
	Triple.getArch() == llvm::Triple::thumb)) {
	// Windows on ARM expects restricted IT blocks
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back("-arm-restrict-it");
	}

	// Forward -cl options to -cc1
	RenderOpenCLOptions(Args, CmdArgs);

	if (IsHIP && Args.hasFlag(options::OPT_fhip_new_launch_api,
	options::OPT_fno_hip_new_launch_api, true))
	CmdArgs.push_back("-fhip-new-launch-api");

	if (Arg *A = Args.getLastArg(options::OPT_fcf_protection_EQ)) {
	CmdArgs.push_back(
	Args.MakeArgString(Twine("-fcf-protection=") + A->getValue()));
	}

	// Forward -f options with positive and negative forms; we translate
	// these by hand.
	if (Arg *A = getLastProfileSampleUseArg(Args)) {
	auto *PGOArg = Args.getLastArg(
	options::OPT_fprofile_generate, options::OPT_fprofile_generate_EQ,
	options::OPT_fcs_profile_generate, options::OPT_fcs_profile_generate_EQ,
	options::OPT_fprofile_use, options::OPT_fprofile_use_EQ);
	if (PGOArg)
	D.Diag(diag::err_drv_argument_not_allowed_with)
	<< "SampleUse with PGO options";

	StringRef fname = A->getValue();
	if (!llvm::sys::fs::exists(fname))
	D.Diag(diag::err_drv_no_such_file) << fname;
	else
	A->render(Args, CmdArgs);
	}
	Args.AddLastArg(CmdArgs, options::OPT_fprofile_remapping_file_EQ);

	RenderBuiltinOptions(TC, RawTriple, Args, CmdArgs);

	if (!Args.hasFlag(options::OPT_fassume_sane_operator_new,
	options::OPT_fno_assume_sane_operator_new))
	CmdArgs.push_back("-fno-assume-sane-operator-new");

	// -fblocks=0 is default.
	if (Args.hasFlag(options::OPT_fblocks, options::OPT_fno_blocks,
	TC.IsBlocksDefault()) \|\|
	(Args.hasArg(options::OPT_fgnu_runtime) &&
	Args.hasArg(options::OPT_fobjc_nonfragile_abi) &&
	!Args.hasArg(options::OPT_fno_blocks))) {
	CmdArgs.push_back("-fblocks");

	if (!Args.hasArg(options::OPT_fgnu_runtime) && !TC.hasBlocksRuntime())
	CmdArgs.push_back("-fblocks-runtime-optional");
	}

	// -fencode-extended-block-signature=1 is default.
	if (TC.IsEncodeExtendedBlockSignatureDefault())
	CmdArgs.push_back("-fencode-extended-block-signature");

	if (Args.hasFlag(options::OPT_fcoroutines_ts, options::OPT_fno_coroutines_ts,
	false) &&
	types::isCXX(InputType)) {
	CmdArgs.push_back("-fcoroutines-ts");
	}

	Args.AddLastArg(CmdArgs, options::OPT_fdouble_square_bracket_attributes,
	options::OPT_fno_double_square_bracket_attributes);

	// -faccess-control is default.
	if (Args.hasFlag(options::OPT_fno_access_control,
	options::OPT_faccess_control, false))
	CmdArgs.push_back("-fno-access-control");

	// -felide-constructors is the default.
	if (Args.hasFlag(options::OPT_fno_elide_constructors,
	options::OPT_felide_constructors, false))
	CmdArgs.push_back("-fno-elide-constructors");

	ToolChain::RTTIMode RTTIMode = TC.getRTTIMode();

	if (KernelOrKext \|\| (types::isCXX(InputType) &&
	(RTTIMode == ToolChain::RM_Disabled)))
	CmdArgs.push_back("-fno-rtti");

	// -fshort-enums=0 is default for all architectures except Hexagon.
	if (Args.hasFlag(options::OPT_fshort_enums, options::OPT_fno_short_enums,
	TC.getArch() == llvm::Triple::hexagon))
	CmdArgs.push_back("-fshort-enums");

	RenderCharacterOptions(Args, AuxTriple ? *AuxTriple : RawTriple, CmdArgs);

	// -fuse-cxa-atexit is default.
	if (!Args.hasFlag(
	options::OPT_fuse_cxa_atexit, options::OPT_fno_use_cxa_atexit,
	!RawTriple.isOSAIX() && !RawTriple.isOSWindows() &&
	TC.getArch() != llvm::Triple::xcore &&
	((RawTriple.getVendor() != llvm::Triple::MipsTechnologies) \|\|
	RawTriple.hasEnvironment())) \|\|
	KernelOrKext)
	CmdArgs.push_back("-fno-use-cxa-atexit");

	if (Args.hasFlag(options::OPT_fregister_global_dtors_with_atexit,
	options::OPT_fno_register_global_dtors_with_atexit,
	RawTriple.isOSDarwin() && !KernelOrKext))
	CmdArgs.push_back("-fregister-global-dtors-with-atexit");

	// -fno-use-line-directives is default.
	if (Args.hasFlag(options::OPT_fuse_line_directives,
	options::OPT_fno_use_line_directives, false))
	CmdArgs.push_back("-fuse-line-directives");

	// -fms-extensions=0 is default.
	if (Args.hasFlag(options::OPT_fms_extensions, options::OPT_fno_ms_extensions,
	IsWindowsMSVC))
	CmdArgs.push_back("-fms-extensions");

	// -fms-compatibility=0 is default.
	bool IsMSVCCompat = Args.hasFlag(
	options::OPT_fms_compatibility, options::OPT_fno_ms_compatibility,
	(IsWindowsMSVC && Args.hasFlag(options::OPT_fms_extensions,
	options::OPT_fno_ms_extensions, true)));
	if (IsMSVCCompat)
	CmdArgs.push_back("-fms-compatibility");

	// Handle -fgnuc-version, if present.
	VersionTuple GNUCVer;
	if (Arg *A = Args.getLastArg(options::OPT_fgnuc_version_EQ)) {
	// Check that the version has 1 to 3 components and the minor and patch
	// versions fit in two decimal digits.
	StringRef Val = A->getValue();
	Val = Val.empty() ? "0" : Val; // Treat "" as 0 or disable.
	bool Invalid = GNUCVer.tryParse(Val);
	unsigned Minor = GNUCVer.getMinor().getValueOr(0);
	unsigned Patch = GNUCVer.getSubminor().getValueOr(0);
	if (Invalid \|\| GNUCVer.getBuild() \|\| Minor >= 100 \|\| Patch >= 100) {
	D.Diag(diag::err_drv_invalid_value)
	<< A->getAsString(Args) << A->getValue();
	}
	} else if (!IsMSVCCompat) {
	// Imitate GCC 4.2.1 by default if -fms-compatibility is not in effect.
	GNUCVer = VersionTuple(4, 2, 1);
	}
	if (!GNUCVer.empty()) {
	CmdArgs.push_back(
	Args.MakeArgString("-fgnuc-version=" + GNUCVer.getAsString()));
	}

	VersionTuple MSVT = TC.computeMSVCVersion(&D, Args);
	if (!MSVT.empty())
	CmdArgs.push_back(
	Args.MakeArgString("-fms-compatibility-version=" + MSVT.getAsString()));

	bool IsMSVC2015Compatible = MSVT.getMajor() >= 19;
	if (ImplyVCPPCXXVer) {
	StringRef LanguageStandard;
	if (const Arg *StdArg = Args.getLastArg(options::OPT__SLASH_std)) {
	Std = StdArg;
	LanguageStandard = llvm::StringSwitch<StringRef>(StdArg->getValue())
	.Case("c++14", "-std=c++14")
	.Case("c++17", "-std=c++17")
	.Case("c++latest", "-std=c++2a")
	.Default("");
	if (LanguageStandard.empty())
	D.Diag(clang::diag::warn_drv_unused_argument)
	<< StdArg->getAsString(Args);
	}

	if (LanguageStandard.empty()) {
	if (IsMSVC2015Compatible)
	LanguageStandard = "-std=c++14";
	else
	LanguageStandard = "-std=c++11";
	}

	CmdArgs.push_back(LanguageStandard.data());
	}

	// -fno-borland-extensions is default.
	if (Args.hasFlag(options::OPT_fborland_extensions,
	options::OPT_fno_borland_extensions, false))
	CmdArgs.push_back("-fborland-extensions");

	// -fno-declspec is default, except for PS4.
	if (Args.hasFlag(options::OPT_fdeclspec, options::OPT_fno_declspec,
	RawTriple.isPS4()))
	CmdArgs.push_back("-fdeclspec");
	else if (Args.hasArg(options::OPT_fno_declspec))
	CmdArgs.push_back("-fno-declspec"); // Explicitly disabling __declspec.

	// -fthreadsafe-statics is default, except for MSVC compatibility versions
	// less than 19.
	if (!Args.hasFlag(options::OPT_fthreadsafe_statics,
	options::OPT_fno_threadsafe_statics,
	!IsWindowsMSVC \|\| IsMSVC2015Compatible))
	CmdArgs.push_back("-fno-threadsafe-statics");

	// -fno-delayed-template-parsing is default, except when targeting MSVC.
	// Many old Windows SDK versions require this to parse.
	// FIXME: MSVC introduced /Zc:twoPhase- to disable this behavior in their
	// compiler. We should be able to disable this by default at some point.
	if (Args.hasFlag(options::OPT_fdelayed_template_parsing,
	options::OPT_fno_delayed_template_parsing, IsWindowsMSVC))
	CmdArgs.push_back("-fdelayed-template-parsing");

	// -fgnu-keywords default varies depending on language; only pass if
	// specified.
	Args.AddLastArg(CmdArgs, options::OPT_fgnu_keywords,
	options::OPT_fno_gnu_keywords);

	if (Args.hasFlag(options::OPT_fgnu89_inline, options::OPT_fno_gnu89_inline,
	false))
	CmdArgs.push_back("-fgnu89-inline");

	if (Args.hasArg(options::OPT_fno_inline))
	CmdArgs.push_back("-fno-inline");

	Args.AddLastArg(CmdArgs, options::OPT_finline_functions,
	options::OPT_finline_hint_functions,
	options::OPT_fno_inline_functions);

	// FIXME: Find a better way to determine whether the language has modules
	// support by default, or just assume that all languages do.
	bool HaveModules =
	Std && (Std->containsValue("c++2a") \|\| Std->containsValue("c++latest"));
	RenderModulesOptions(C, D, Args, Input, Output, CmdArgs, HaveModules);

	if (Args.hasFlag(options::OPT_fpch_validate_input_files_content,
	options::OPT_fno_pch_validate_input_files_content, false))
	CmdArgs.push_back("-fvalidate-ast-input-files-content");
	if (Args.hasFlag(options::OPT_fpch_instantiate_templates,
	options::OPT_fno_pch_instantiate_templates, false))
	CmdArgs.push_back("-fpch-instantiate-templates");
	if (Args.hasFlag(options::OPT_fpch_codegen, options::OPT_fno_pch_codegen,
	false))
	CmdArgs.push_back("-fmodules-codegen");
	if (Args.hasFlag(options::OPT_fpch_debuginfo, options::OPT_fno_pch_debuginfo,
	false))
	CmdArgs.push_back("-fmodules-debuginfo");

	Args.AddLastArg(CmdArgs, options::OPT_fexperimental_new_pass_manager,
	options::OPT_fno_experimental_new_pass_manager);

	ObjCRuntime Runtime = AddObjCRuntimeArgs(Args, Inputs, CmdArgs, rewriteKind);
	RenderObjCOptions(TC, D, RawTriple, Args, Runtime, rewriteKind != RK_None,
	Input, CmdArgs);

	if (Args.hasFlag(options::OPT_fapplication_extension,
	options::OPT_fno_application_extension, false))
	CmdArgs.push_back("-fapplication-extension");

	// Handle GCC-style exception args.
	if (!C.getDriver().IsCLMode())
	addExceptionArgs(Args, InputType, TC, KernelOrKext, Runtime, CmdArgs);

	// Handle exception personalities
	Arg *A = Args.getLastArg(
	options::OPT_fsjlj_exceptions, options::OPT_fseh_exceptions,
	options::OPT_fdwarf_exceptions, options::OPT_fwasm_exceptions);
	if (A) {
	const Option &Opt = A->getOption();
	if (Opt.matches(options::OPT_fsjlj_exceptions))
	CmdArgs.push_back("-fsjlj-exceptions");
	if (Opt.matches(options::OPT_fseh_exceptions))
	CmdArgs.push_back("-fseh-exceptions");
	if (Opt.matches(options::OPT_fdwarf_exceptions))
	CmdArgs.push_back("-fdwarf-exceptions");
	if (Opt.matches(options::OPT_fwasm_exceptions))
	CmdArgs.push_back("-fwasm-exceptions");
	} else {
	switch (TC.GetExceptionModel(Args)) {
	default:
	break;
	case llvm::ExceptionHandling::DwarfCFI:
	CmdArgs.push_back("-fdwarf-exceptions");
	break;
	case llvm::ExceptionHandling::SjLj:
	CmdArgs.push_back("-fsjlj-exceptions");
	break;
	case llvm::ExceptionHandling::WinEH:
	CmdArgs.push_back("-fseh-exceptions");
	break;
	}
	}

	// C++ "sane" operator new.
	if (!Args.hasFlag(options::OPT_fassume_sane_operator_new,
	options::OPT_fno_assume_sane_operator_new))
	CmdArgs.push_back("-fno-assume-sane-operator-new");

	// -frelaxed-template-template-args is off by default, as it is a severe
	// breaking change until a corresponding change to template partial ordering
	// is provided.
	if (Args.hasFlag(options::OPT_frelaxed_template_template_args,
	options::OPT_fno_relaxed_template_template_args, false))
	CmdArgs.push_back("-frelaxed-template-template-args");

	// -fsized-deallocation is off by default, as it is an ABI-breaking change for
	// most platforms.
	if (Args.hasFlag(options::OPT_fsized_deallocation,
	options::OPT_fno_sized_deallocation, false))
	CmdArgs.push_back("-fsized-deallocation");

	// -faligned-allocation is on by default in C++17 onwards and otherwise off
	// by default.
	if (Arg *A = Args.getLastArg(options::OPT_faligned_allocation,
	options::OPT_fno_aligned_allocation,
	options::OPT_faligned_new_EQ)) {
	if (A->getOption().matches(options::OPT_fno_aligned_allocation))
	CmdArgs.push_back("-fno-aligned-allocation");
	else
	CmdArgs.push_back("-faligned-allocation");
	}

	// The default new alignment can be specified using a dedicated option or via
	// a GCC-compatible option that also turns on aligned allocation.
	if (Arg *A = Args.getLastArg(options::OPT_fnew_alignment_EQ,
	options::OPT_faligned_new_EQ))
	CmdArgs.push_back(
	Args.MakeArgString(Twine("-fnew-alignment=") + A->getValue()));

	// -fconstant-cfstrings is default, and may be subject to argument translation
	// on Darwin.
	if (!Args.hasFlag(options::OPT_fconstant_cfstrings,
	options::OPT_fno_constant_cfstrings) \|\|
	!Args.hasFlag(options::OPT_mconstant_cfstrings,
	options::OPT_mno_constant_cfstrings))
	CmdArgs.push_back("-fno-constant-cfstrings");

	// -fno-pascal-strings is default, only pass non-default.
	if (Args.hasFlag(options::OPT_fpascal_strings,
	options::OPT_fno_pascal_strings, false))
	CmdArgs.push_back("-fpascal-strings");

	// Honor -fpack-struct= and -fpack-struct, if given. Note that
	// -fno-pack-struct doesn't apply to -fpack-struct=.
	if (Arg *A = Args.getLastArg(options::OPT_fpack_struct_EQ)) {
	std::string PackStructStr = "-fpack-struct=";
	PackStructStr += A->getValue();
	CmdArgs.push_back(Args.MakeArgString(PackStructStr));
	} else if (Args.hasFlag(options::OPT_fpack_struct,
	options::OPT_fno_pack_struct, false)) {
	CmdArgs.push_back("-fpack-struct=1");
	}

	// Handle -fmax-type-align=N and -fno-type-align
	bool SkipMaxTypeAlign = Args.hasArg(options::OPT_fno_max_type_align);
	if (Arg *A = Args.getLastArg(options::OPT_fmax_type_align_EQ)) {
	if (!SkipMaxTypeAlign) {
	std::string MaxTypeAlignStr = "-fmax-type-align=";
	MaxTypeAlignStr += A->getValue();
	CmdArgs.push_back(Args.MakeArgString(MaxTypeAlignStr));
	}
	} else if (RawTriple.isOSDarwin()) {
	if (!SkipMaxTypeAlign) {
	std::string MaxTypeAlignStr = "-fmax-type-align=16";
	CmdArgs.push_back(Args.MakeArgString(MaxTypeAlignStr));
	}
	}

	if (!Args.hasFlag(options::OPT_Qy, options::OPT_Qn, true))
	CmdArgs.push_back("-Qn");

	// -fno-common is the default, set -fcommon only when that flag is set.
	if (Args.hasFlag(options::OPT_fcommon, options::OPT_fno_common, false))
	CmdArgs.push_back("-fcommon");

	// -fsigned-bitfields is default, and clang doesn't yet support
	// -funsigned-bitfields.
	if (!Args.hasFlag(options::OPT_fsigned_bitfields,
	options::OPT_funsigned_bitfields))
	D.Diag(diag::warn_drv_clang_unsupported)
	<< Args.getLastArg(options::OPT_funsigned_bitfields)->getAsString(Args);

	// -ffor-scope is default, and clang doesn't support -fno-for-scope.
	if (!Args.hasFlag(options::OPT_ffor_scope, options::OPT_fno_for_scope))
	D.Diag(diag::err_drv_clang_unsupported)
	<< Args.getLastArg(options::OPT_fno_for_scope)->getAsString(Args);

	// -finput_charset=UTF-8 is default. Reject others
	if (Arg *inputCharset = Args.getLastArg(options::OPT_finput_charset_EQ)) {
	StringRef value = inputCharset->getValue();
	if (!value.equals_lower("utf-8"))
	D.Diag(diag::err_drv_invalid_value) << inputCharset->getAsString(Args)
	<< value;
	}

	// -fexec_charset=UTF-8 is default. Reject others
	if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
	StringRef value = execCharset->getValue();
	if (!value.equals_lower("utf-8"))
	D.Diag(diag::err_drv_invalid_value) << execCharset->getAsString(Args)
	<< value;
	}

	RenderDiagnosticsOptions(D, Args, CmdArgs);

	// -fno-asm-blocks is default.
	if (Args.hasFlag(options::OPT_fasm_blocks, options::OPT_fno_asm_blocks,
	false))
	CmdArgs.push_back("-fasm-blocks");

	// -fgnu-inline-asm is default.
	if (!Args.hasFlag(options::OPT_fgnu_inline_asm,
	options::OPT_fno_gnu_inline_asm, true))
	CmdArgs.push_back("-fno-gnu-inline-asm");

	// Enable vectorization per default according to the optimization level
	// selected. For optimization levels that want vectorization we use the alias
	// option to simplify the hasFlag logic.
	bool EnableVec = shouldEnableVectorizerAtOLevel(Args, false);
	OptSpecifier VectorizeAliasOption =
	EnableVec ? options::OPT_O_Group : options::OPT_fvectorize;
	if (Args.hasFlag(options::OPT_fvectorize, VectorizeAliasOption,
	options::OPT_fno_vectorize, EnableVec))
	CmdArgs.push_back("-vectorize-loops");

	// -fslp-vectorize is enabled based on the optimization level selected.
	bool EnableSLPVec = shouldEnableVectorizerAtOLevel(Args, true);
	OptSpecifier SLPVectAliasOption =
	EnableSLPVec ? options::OPT_O_Group : options::OPT_fslp_vectorize;
	if (Args.hasFlag(options::OPT_fslp_vectorize, SLPVectAliasOption,
	options::OPT_fno_slp_vectorize, EnableSLPVec))
	CmdArgs.push_back("-vectorize-slp");

	ParseMPreferVectorWidth(D, Args, CmdArgs);

	Args.AddLastArg(CmdArgs, options::OPT_fshow_overloads_EQ);
	Args.AddLastArg(CmdArgs,
	options::OPT_fsanitize_undefined_strip_path_components_EQ);

	// -fdollars-in-identifiers default varies depending on platform and
	// language; only pass if specified.
	if (Arg *A = Args.getLastArg(options::OPT_fdollars_in_identifiers,
	options::OPT_fno_dollars_in_identifiers)) {
	if (A->getOption().matches(options::OPT_fdollars_in_identifiers))
	CmdArgs.push_back("-fdollars-in-identifiers");
	else
	CmdArgs.push_back("-fno-dollars-in-identifiers");
	}

	// -funit-at-a-time is default, and we don't support -fno-unit-at-a-time for
	// practical purposes.
	if (Arg *A = Args.getLastArg(options::OPT_funit_at_a_time,
	options::OPT_fno_unit_at_a_time)) {
	if (A->getOption().matches(options::OPT_fno_unit_at_a_time))
	D.Diag(diag::warn_drv_clang_unsupported) << A->getAsString(Args);
	}

	if (Args.hasFlag(options::OPT_fapple_pragma_pack,
	options::OPT_fno_apple_pragma_pack, false))
	CmdArgs.push_back("-fapple-pragma-pack");

	// Remarks can be enabled with any of the `-f.optimization-record.` flags.
	if (willEmitRemarks(Args) && checkRemarksOptions(D, Args, Triple))
	renderRemarksOptions(Args, CmdArgs, Triple, Input, Output, JA);

	bool RewriteImports = Args.hasFlag(options::OPT_frewrite_imports,
	options::OPT_fno_rewrite_imports, false);
	if (RewriteImports)
	CmdArgs.push_back("-frewrite-imports");

	// Enable rewrite includes if the user's asked for it or if we're generating
	// diagnostics.
	// TODO: Once -module-dependency-dir works with -frewrite-includes it'd be
	// nice to enable this when doing a crashdump for modules as well.
	if (Args.hasFlag(options::OPT_frewrite_includes,
	options::OPT_fno_rewrite_includes, false) \|\|
	(C.isForDiagnostics() && !HaveModules))
	CmdArgs.push_back("-frewrite-includes");

	// Only allow -traditional or -traditional-cpp outside in preprocessing modes.
	if (Arg *A = Args.getLastArg(options::OPT_traditional,
	options::OPT_traditional_cpp)) {
	if (isa<PreprocessJobAction>(JA))
	CmdArgs.push_back("-traditional-cpp");
	else
	D.Diag(diag::err_drv_clang_unsupported) << A->getAsString(Args);
	}

	Args.AddLastArg(CmdArgs, options::OPT_dM);
	Args.AddLastArg(CmdArgs, options::OPT_dD);

	Args.AddLastArg(CmdArgs, options::OPT_fmax_tokens_EQ);

	// Handle serialized diagnostics.
	if (Arg *A = Args.getLastArg(options::OPT__serialize_diags)) {
	CmdArgs.push_back("-serialize-diagnostic-file");
	CmdArgs.push_back(Args.MakeArgString(A->getValue()));
	}

	if (Args.hasArg(options::OPT_fretain_comments_from_system_headers))
	CmdArgs.push_back("-fretain-comments-from-system-headers");

	// Forward -fcomment-block-commands to -cc1.
	Args.AddAllArgs(CmdArgs, options::OPT_fcomment_block_commands);
	// Forward -fparse-all-comments to -cc1.
	Args.AddAllArgs(CmdArgs, options::OPT_fparse_all_comments);

	// Turn -fplugin=name.so into -load name.so
	for (const Arg *A : Args.filtered(options::OPT_fplugin_EQ)) {
	CmdArgs.push_back("-load");
	CmdArgs.push_back(A->getValue());
	A->claim();
	}

	// Forward -fpass-plugin=name.so to -cc1.
	for (const Arg *A : Args.filtered(options::OPT_fpass_plugin_EQ)) {
	CmdArgs.push_back(
	Args.MakeArgString(Twine("-fpass-plugin=") + A->getValue()));
	A->claim();
	}

	// Setup statistics file output.
	SmallString<128> StatsFile = getStatsFileName(Args, Output, Input, D);
	if (!StatsFile.empty())
	CmdArgs.push_back(Args.MakeArgString(Twine("-stats-file=") + StatsFile));

	// Forward -Xclang arguments to -cc1, and -mllvm arguments to the LLVM option
	// parser.
	// -finclude-default-header flag is for preprocessor,
	// do not pass it to other cc1 commands when save-temps is enabled
	if (C.getDriver().isSaveTempsEnabled() &&
	!isa<PreprocessJobAction>(JA)) {
	for (auto Arg : Args.filtered(options::OPT_Xclang)) {
	Arg->claim();
	if (StringRef(Arg->getValue()) != "-finclude-default-header")
	CmdArgs.push_back(Arg->getValue());
	}
	}
	else {
	Args.AddAllArgValues(CmdArgs, options::OPT_Xclang);
	}
	for (const Arg *A : Args.filtered(options::OPT_mllvm)) {
	A->claim();

	// We translate this by hand to the -cc1 argument, since nightly test uses
	// it and developers have been trained to spell it with -mllvm. Both
	// spellings are now deprecated and should be removed.
	if (StringRef(A->getValue(0)) == "-disable-llvm-optzns") {
	CmdArgs.push_back("-disable-llvm-optzns");
	} else {
	A->render(Args, CmdArgs);
	}
	}

	// With -save-temps, we want to save the unoptimized bitcode output from the
	// CompileJobAction, use -disable-llvm-passes to get pristine IR generated
	// by the frontend.
	// When -fembed-bitcode is enabled, optimized bitcode is emitted because it
	// has slightly different breakdown between stages.
	// FIXME: -fembed-bitcode -save-temps will save optimized bitcode instead of
	// pristine IR generated by the frontend. Ideally, a new compile action should
	// be added so both IR can be captured.
	if ((C.getDriver().isSaveTempsEnabled() \|\|
	JA.isHostOffloading(Action::OFK_OpenMP)) &&
	!(C.getDriver().embedBitcodeInObject() && !C.getDriver().isUsingLTO()) &&
	isa<CompileJobAction>(JA))
	CmdArgs.push_back("-disable-llvm-passes");

	Args.AddAllArgs(CmdArgs, options::OPT_undef);

	const char *Exec = D.getClangProgramPath();

	// Optionally embed the -cc1 level arguments into the debug info or a
	// section, for build analysis.
	// Also record command line arguments into the debug info if
	// -grecord-gcc-switches options is set on.
	// By default, -gno-record-gcc-switches is set on and no recording.
	auto GRecordSwitches =
	Args.hasFlag(options::OPT_grecord_command_line,
	options::OPT_gno_record_command_line, false);
	auto FRecordSwitches =
	Args.hasFlag(options::OPT_frecord_command_line,
	options::OPT_fno_record_command_line, false);
	if (FRecordSwitches && !Triple.isOSBinFormatELF())
	D.Diag(diag::err_drv_unsupported_opt_for_target)
	<< Args.getLastArg(options::OPT_frecord_command_line)->getAsString(Args)
	<< TripleStr;
	if (TC.UseDwarfDebugFlags() \|\| GRecordSwitches \|\| FRecordSwitches) {
	ArgStringList OriginalArgs;
	for (const auto &Arg : Args)
	Arg->render(Args, OriginalArgs);

	SmallString<256> Flags;
	EscapeSpacesAndBackslashes(Exec, Flags);
	for (const char *OriginalArg : OriginalArgs) {
	SmallString<128> EscapedArg;
	EscapeSpacesAndBackslashes(OriginalArg, EscapedArg);
	Flags += " ";
	Flags += EscapedArg;
	}
	auto FlagsArgString = Args.MakeArgString(Flags);
	if (TC.UseDwarfDebugFlags() \|\| GRecordSwitches) {
	CmdArgs.push_back("-dwarf-debug-flags");
	CmdArgs.push_back(FlagsArgString);
	}
	if (FRecordSwitches) {
	CmdArgs.push_back("-record-command-line");
	CmdArgs.push_back(FlagsArgString);
	}
	}

	// Host-side cuda compilation receives all device-side outputs in a single
	// fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
	if ((IsCuda \|\| IsHIP) && CudaDeviceInput) {
	CmdArgs.push_back("-fcuda-include-gpubinary");
	CmdArgs.push_back(CudaDeviceInput->getFilename());
	if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
	CmdArgs.push_back("-fgpu-rdc");
	}

	if (IsCuda) {
	if (Args.hasFlag(options::OPT_fcuda_short_ptr,
	options::OPT_fno_cuda_short_ptr, false))
	CmdArgs.push_back("-fcuda-short-ptr");
	}

	if (IsHIP)
	CmdArgs.push_back("-fcuda-allow-variadic-functions");

	// OpenMP offloading device jobs take the argument -fopenmp-host-ir-file-path
	// to specify the result of the compile phase on the host, so the meaningful
	// device declarations can be identified. Also, -fopenmp-is-device is passed
	// along to tell the frontend that it is generating code for a device, so that
	// only the relevant declarations are emitted.
	if (IsOpenMPDevice) {
	CmdArgs.push_back("-fopenmp-is-device");
	if (OpenMPDeviceInput) {
	CmdArgs.push_back("-fopenmp-host-ir-file-path");
	CmdArgs.push_back(Args.MakeArgString(OpenMPDeviceInput->getFilename()));
	}
	}

	// For all the host OpenMP offloading compile jobs we need to pass the targets
	// information using -fopenmp-targets= option.
	if (JA.isHostOffloading(Action::OFK_OpenMP)) {
	SmallString<128> TargetInfo("-fopenmp-targets=");

	Arg *Tgts = Args.getLastArg(options::OPT_fopenmp_targets_EQ);
	assert(Tgts && Tgts->getNumValues() &&
	"OpenMP offloading has to have targets specified.");
	for (unsigned i = 0; i < Tgts->getNumValues(); ++i) {
	if (i)
	TargetInfo += ',';
	// We need to get the string from the triple because it may be not exactly
	// the same as the one we get directly from the arguments.
	llvm::Triple T(Tgts->getValue(i));
	TargetInfo += T.getTriple();
	}
	CmdArgs.push_back(Args.MakeArgString(TargetInfo.str()));
	}

	bool VirtualFunctionElimination =
	Args.hasFlag(options::OPT_fvirtual_function_elimination,
	options::OPT_fno_virtual_function_elimination, false);
	if (VirtualFunctionElimination) {
	// VFE requires full LTO (currently, this might be relaxed to allow ThinLTO
	// in the future).
	if (D.getLTOMode() != LTOK_Full)
	D.Diag(diag::err_drv_argument_only_allowed_with)
	<< "-fvirtual-function-elimination"
	<< "-flto=full";

	CmdArgs.push_back("-fvirtual-function-elimination");
	}

	// VFE requires whole-program-vtables, and enables it by default.
	bool WholeProgramVTables = Args.hasFlag(
	options::OPT_fwhole_program_vtables,
	options::OPT_fno_whole_program_vtables, VirtualFunctionElimination);
	if (VirtualFunctionElimination && !WholeProgramVTables) {
	D.Diag(diag::err_drv_argument_not_allowed_with)
	<< "-fno-whole-program-vtables"
	<< "-fvirtual-function-elimination";
	}

	if (WholeProgramVTables) {
	if (!D.isUsingLTO())
	D.Diag(diag::err_drv_argument_only_allowed_with)
	<< "-fwhole-program-vtables"
	<< "-flto";
	CmdArgs.push_back("-fwhole-program-vtables");
	}

	bool DefaultsSplitLTOUnit =
	(WholeProgramVTables \|\| Sanitize.needsLTO()) &&
	(D.getLTOMode() == LTOK_Full \|\| TC.canSplitThinLTOUnit());
	bool SplitLTOUnit =
	Args.hasFlag(options::OPT_fsplit_lto_unit,
	options::OPT_fno_split_lto_unit, DefaultsSplitLTOUnit);
	if (Sanitize.needsLTO() && !SplitLTOUnit)
	D.Diag(diag::err_drv_argument_not_allowed_with) << "-fno-split-lto-unit"
	<< "-fsanitize=cfi";
	if (SplitLTOUnit)
	CmdArgs.push_back("-fsplit-lto-unit");

	if (Arg *A = Args.getLastArg(options::OPT_fglobal_isel,
	options::OPT_fno_global_isel)) {
	CmdArgs.push_back("-mllvm");
	if (A->getOption().matches(options::OPT_fglobal_isel)) {
	CmdArgs.push_back("-global-isel=1");

	// GISel is on by default on AArch64 -O0, so don't bother adding
	// the fallback remarks for it. Other combinations will add a warning of
	// some kind.
	bool IsArchSupported = Triple.getArch() == llvm::Triple::aarch64;
	bool IsOptLevelSupported = false;

	Arg *A = Args.getLastArg(options::OPT_O_Group);
	if (Triple.getArch() == llvm::Triple::aarch64) {
	if (!A \|\| A->getOption().matches(options::OPT_O0))
	IsOptLevelSupported = true;
	}
	if (!IsArchSupported \|\| !IsOptLevelSupported) {
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back("-global-isel-abort=2");

	if (!IsArchSupported)
	D.Diag(diag::warn_drv_global_isel_incomplete) << Triple.getArchName();
	else
	D.Diag(diag::warn_drv_global_isel_incomplete_opt);
	}
	} else {
	CmdArgs.push_back("-global-isel=0");
	}
	}

	if (Args.hasArg(options::OPT_forder_file_instrumentation)) {
	CmdArgs.push_back("-forder-file-instrumentation");
	// Enable order file instrumentation when ThinLTO is not on. When ThinLTO is
	// on, we need to pass these flags as linker flags and that will be handled
	// outside of the compiler.
	if (!D.isUsingLTO()) {
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back("-enable-order-file-instrumentation");
	}
	}

	if (Arg *A = Args.getLastArg(options::OPT_fforce_enable_int128,
	options::OPT_fno_force_enable_int128)) {
	if (A->getOption().matches(options::OPT_fforce_enable_int128))
	CmdArgs.push_back("-fforce-enable-int128");
	}

	if (Args.hasFlag(options::OPT_fkeep_static_consts,
	options::OPT_fno_keep_static_consts, false))
	CmdArgs.push_back("-fkeep-static-consts");

	if (Args.hasFlag(options::OPT_fcomplete_member_pointers,
	options::OPT_fno_complete_member_pointers, false))
	CmdArgs.push_back("-fcomplete-member-pointers");

	if (!Args.hasFlag(options::OPT_fcxx_static_destructors,
	options::OPT_fno_cxx_static_destructors, true))
	CmdArgs.push_back("-fno-c++-static-destructors");

	if (Arg *A = Args.getLastArg(options::OPT_moutline,
	options::OPT_mno_outline)) {
	if (A->getOption().matches(options::OPT_moutline)) {
	// We only support -moutline in AArch64 and ARM targets right now. If
	// we're not compiling for these, emit a warning and ignore the flag.
	// Otherwise, add the proper mllvm flags.
	if (!(Triple.isARM() \|\| Triple.isThumb() \|\|
	Triple.getArch() == llvm::Triple::aarch64 \|\|
	Triple.getArch() == llvm::Triple::aarch64_32)) {
	D.Diag(diag::warn_drv_moutline_unsupported_opt) << Triple.getArchName();
	} else {
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back("-enable-machine-outliner");
	}
	} else {
	// Disable all outlining behaviour.
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back("-enable-machine-outliner=never");
	}
	}

	if (Args.hasFlag(options::OPT_faddrsig, options::OPT_fno_addrsig,
	(TC.getTriple().isOSBinFormatELF() \|\|
	TC.getTriple().isOSBinFormatCOFF()) &&
	!TC.getTriple().isPS4() &&
	!TC.getTriple().isOSNetBSD() &&
	!Distro(D.getVFS(), TC.getTriple()).IsGentoo() &&
	!TC.getTriple().isAndroid() &&
	TC.useIntegratedAs()))
	CmdArgs.push_back("-faddrsig");

	if (Arg *A = Args.getLastArg(options::OPT_fsymbol_partition_EQ)) {
	std::string Str = A->getAsString(Args);
	if (!TC.getTriple().isOSBinFormatELF())
	D.Diag(diag::err_drv_unsupported_opt_for_target)
	<< Str << TC.getTripleString();
	CmdArgs.push_back(Args.MakeArgString(Str));
	}

	// Add the "-o out -x type src.c" flags last. This is done primarily to make
	// the -cc1 command easier to edit when reproducing compiler crashes.
	if (Output.getType() == types::TY_Dependencies) {
	// Handled with other dependency code.
	} else if (Output.isFilename()) {
	if (Output.getType() == clang::driver::types::TY_IFS_CPP \|\|
	Output.getType() == clang::driver::types::TY_IFS) {
	SmallString<128> OutputFilename(Output.getFilename());
	llvm::sys::path::replace_extension(OutputFilename, "ifs");
	CmdArgs.push_back("-o");
	CmdArgs.push_back(Args.MakeArgString(OutputFilename));
	} else {
	CmdArgs.push_back("-o");
	CmdArgs.push_back(Output.getFilename());
	}
	} else {
	assert(Output.isNothing() && "Invalid output.");
	}

	addDashXForInput(Args, Input, CmdArgs);

	ArrayRef<InputInfo> FrontendInputs = Input;
	if (IsHeaderModulePrecompile)
	FrontendInputs = ModuleHeaderInputs;
	else if (Input.isNothing())
	FrontendInputs = {};

	for (const InputInfo &Input : FrontendInputs) {
	if (Input.isFilename())
	CmdArgs.push_back(Input.getFilename());
	else
	Input.getInputArg().renderAsInput(Args, CmdArgs);
	}

	// Finally add the compile command to the compilation.
	if (Args.hasArg(options::OPT__SLASH_fallback) &&
	Output.getType() == types::TY_Object &&
	(InputType == types::TY_C \|\| InputType == types::TY_CXX)) {
	auto CLCommand =
	getCLFallback()->GetCommand(C, JA, Output, Inputs, Args, LinkingOutput);
	C.addCommand(std::make_unique<FallbackCommand>(
	JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs,
	std::move(CLCommand)));
	} else if (Args.hasArg(options::OPT__SLASH_fallback) &&
	isa<PrecompileJobAction>(JA)) {
	// In /fallback builds, run the main compilation even if the pch generation
	// fails, so that the main compilation's fallback to cl.exe runs.
	C.addCommand(std::make_unique<ForceSuccessCommand>(
	JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs));
	} else if (D.CC1Main && !D.CCGenDiagnostics) {
	// Invoke the CC1 directly in this process
	C.addCommand(std::make_unique<CC1Command>(
	JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs));
	} else {
	C.addCommand(std::make_unique<Command>(
	JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs));
	}

	// Make the compile command echo its inputs for /showFilenames.
	if (Output.getType() == types::TY_Object &&
	Args.hasFlag(options::OPT__SLASH_showFilenames,
	options::OPT__SLASH_showFilenames_, false)) {
	C.getJobs().getJobs().back()->PrintInputFilenames = true;
	}

	if (Arg *A = Args.getLastArg(options::OPT_pg))
	if (FPKeepKind == CodeGenOptions::FramePointerKind::None &&
	!Args.hasArg(options::OPT_mfentry))
	D.Diag(diag::err_drv_argument_not_allowed_with) << "-fomit-frame-pointer"
	<< A->getAsString(Args);

	// Claim some arguments which clang supports automatically.

	// -fpch-preprocess is used with gcc to add a special marker in the output to
	// include the PCH file.
	Args.ClaimAllArgs(options::OPT_fpch_preprocess);

	// Claim some arguments which clang doesn't support, but we don't
	// care to warn the user about.
	Args.ClaimAllArgs(options::OPT_clang_ignored_f_Group);
	Args.ClaimAllArgs(options::OPT_clang_ignored_m_Group);

	// Disable warnings for clang -E -emit-llvm foo.c
	Args.ClaimAllArgs(options::OPT_emit_llvm);
	}

	// NOTE: unlike for other tools, the name "clang" passed to Tool below is not
	// arbitrary. Some operations on a Tool decide whether that tool is Clang by
	// comparing the Tool's Name as a string.
	Clang::Clang(const ToolChain &TC) : Tool("clang", "clang frontend", TC) {}

	// Defined out of line; only member destruction is required.
	Clang::~Clang() = default;

	/// Add options related to the Objective-C runtime/ABI.
	///
	/// An explicit -fobjc-runtime= is rendered to the frontend as written.
	/// Otherwise the runtime is derived from -fnext-runtime / -fgnu-runtime, the
	/// ABI-version options, the rewrite mode and the toolchain defaults, and a
	/// -fobjc-runtime= argument is synthesized when at least one input is
	/// Objective-C.
	///
	/// \returns the runtime that was selected.
	ObjCRuntime Clang::AddObjCRuntimeArgs(const ArgList &args,
	                                      const InputInfoList &inputs,
	                                      ArgStringList &cmdArgs,
	                                      RewriteKind rewriteKind) const {
	  const auto &TC = getToolChain();
	  const auto &D = TC.getDriver();

	  // Find the option that controls the runtime choice, if any.
	  Arg *runtimeArg =
	      args.getLastArg(options::OPT_fnext_runtime, options::OPT_fgnu_runtime,
	                      options::OPT_fobjc_runtime_EQ);

	  // -fobjc-runtime= is simply forwarded to the frontend and supersedes any
	  // option about fragility.
	  if (runtimeArg &&
	      runtimeArg->getOption().matches(options::OPT_fobjc_runtime_EQ)) {
	    ObjCRuntime explicitRuntime;
	    StringRef spelling = runtimeArg->getValue();
	    // A true result from tryParse() is treated as a parse failure.
	    if (explicitRuntime.tryParse(spelling))
	      D.Diag(diag::err_drv_unknown_objc_runtime) << spelling;

	    // GNUstep 2.0 and later is only accepted for ELF and COFF targets.
	    const bool isGNUstep2OrLater =
	        explicitRuntime.getKind() == ObjCRuntime::GNUstep &&
	        explicitRuntime.getVersion() >= VersionTuple(2, 0);
	    if (isGNUstep2OrLater) {
	      const auto &triple = TC.getTriple();
	      if (!(triple.isOSBinFormatELF() || triple.isOSBinFormatCOFF()))
	        D.Diag(diag::err_drv_gnustep_objc_runtime_incompatible_binary)
	            << explicitRuntime.getVersion().getMajor();
	    }

	    runtimeArg->render(args, cmdArgs);
	    return explicitRuntime;
	  }

	  // Without an explicit runtime we need the ABI "version". The numbering is
	  // slightly confusing for historical reasons:
	  //   1 - Traditional "fragile" ABI
	  //   2 - Non-fragile ABI, version 1
	  //   3 - Non-fragile ABI, version 2
	  unsigned objcABIVersion = 1;
	  if (Arg *abiArg = args.getLastArg(options::OPT_fobjc_abi_version_EQ)) {
	    // -fobjc-abi-version= names the version directly.
	    StringRef requested = abiArg->getValue();
	    if (requested == "1" || requested == "2" || requested == "3")
	      objcABIVersion = static_cast<unsigned>(requested.front() - '0');
	    else
	      D.Diag(diag::err_drv_clang_unsupported) << requested;
	  } else {
	    // Otherwise work out whether the non-fragile ABI is in effect.
	    bool nonFragileByDefault = rewriteKind == RK_NonFragile;
	    if (rewriteKind == RK_None)
	      nonFragileByDefault = TC.IsObjCNonFragileABIDefault();

	    if (args.hasFlag(options::OPT_fobjc_nonfragile_abi,
	                     options::OPT_fno_objc_nonfragile_abi,
	                     nonFragileByDefault)) {
	      // Pick the non-fragile ABI version.
	#ifdef DISABLE_DEFAULT_NONFRAGILEABI_TWO
	      unsigned nonFragileABIVersion = 1;
	#else
	      unsigned nonFragileABIVersion = 2;
	#endif

	      if (Arg *versionArg = args.getLastArg(
	              options::OPT_fobjc_nonfragile_abi_version_EQ)) {
	        StringRef requested = versionArg->getValue();
	        if (requested == "1" || requested == "2")
	          nonFragileABIVersion =
	              static_cast<unsigned>(requested.front() - '0');
	        else
	          D.Diag(diag::err_drv_clang_unsupported) << requested;
	      }

	      objcABIVersion = 1 + nonFragileABIVersion;
	    }
	    // In the fragile case objcABIVersion keeps its initial value of 1.
	  }

	  // From here on only the fragile/non-fragile distinction matters.
	  const bool isNonFragile = objcABIVersion != 1;

	  ObjCRuntime runtime;
	  if (!runtimeArg) {
	    // No runtime option: ask the toolchain for its default, except that the
	    // rewriter only really supports the Mac runtime, so assume that.
	    switch (rewriteKind) {
	    case RK_None:
	      runtime = TC.getDefaultObjCRuntime(isNonFragile);
	      break;
	    case RK_Fragile:
	      runtime = ObjCRuntime(ObjCRuntime::FragileMacOSX, VersionTuple());
	      break;
	    case RK_NonFragile:
	      runtime = ObjCRuntime(ObjCRuntime::MacOSX, VersionTuple());
	      break;
	    }
	  } else if (runtimeArg->getOption().matches(options::OPT_fnext_runtime)) {
	    // -fnext-runtime: on Darwin use the toolchain default, elsewhere build
	    // for a generic macosx port.
	    runtime = TC.getTriple().isOSDarwin()
	                  ? TC.getDefaultObjCRuntime(isNonFragile)
	                  : ObjCRuntime(ObjCRuntime::MacOSX, VersionTuple());
	  } else {
	    // -fgnu-runtime: legacy behaviour targets the gnustep runtime in
	    // non-fragile mode and the GCC runtime in fragile mode.
	    assert(runtimeArg->getOption().matches(options::OPT_fgnu_runtime));
	    runtime = isNonFragile
	                  ? ObjCRuntime(ObjCRuntime::GNUstep, VersionTuple(2, 0))
	                  : ObjCRuntime(ObjCRuntime::GCC, VersionTuple());
	  }

	  // Only pass the runtime on when some input is actually Objective-C.
	  bool hasObjCInput = false;
	  for (const InputInfo &input : inputs) {
	    if (types::isObjC(input.getType())) {
	      hasObjCInput = true;
	      break;
	    }
	  }
	  if (hasObjCInput)
	    cmdArgs.push_back(
	        args.MakeArgString("-fobjc-runtime=" + runtime.getAsString()));
	  return runtime;
	}

	/// Check whether the character following position \p I in \p EH is '-'.
	///
	/// When it is, \p I is advanced onto the dash and false is returned (the
	/// modifier at the original position is being switched off). Otherwise \p I
	/// is left alone and true is returned.
	static bool maybeConsumeDash(const std::string &EH, size_t &I) {
	  const size_t Next = I + 1;
	  if (Next < EH.size() && EH[Next] == '-') {
	    I = Next;
	    return false;
	  }
	  return true;
	}

	namespace {
	/// Result of parsing the clang-cl /EH (and /GX) options. Every flag starts
	/// out disabled.
	struct EHFlags {
	  // Set by the 's' modifier.
	  bool Synch{false};
	  // Set by the 'a' modifier.
	  bool Asynch{false};
	  // Set by the 'c' modifier.
	  bool NoUnwindC{false};
	};
	} // end anonymous namespace

	/// /EH controls whether to run destructor cleanups when exceptions are
	/// thrown. There are three modifiers:
	/// - s: Cleanup after "synchronous" exceptions, aka C++ exceptions.
	/// - a: Cleanup after "asynchronous" exceptions, aka structured exceptions.
	///      The 'a' modifier is unimplemented and fundamentally hard in LLVM IR.
	/// - c: Assume that extern "C" functions are implicitly nounwind.
	/// The default is /EHs-c-, meaning cleanups are disabled.
	///
	/// A modifier immediately followed by '-' is switched off. Enabling 's'
	/// clears 'a' and enabling 'a' clears 's'. Any other character is reported
	/// with err_drv_invalid_value and the remainder of that /EH value is not
	/// examined.
	///
	/// \param D    driver used to emit diagnostics.
	/// \param Args driver arguments to inspect.
	/// \returns the accumulated flags over all /EH values, in order.
	static EHFlags parseClangCLEHFlags(const Driver &D, const ArgList &Args) {
	  EHFlags EH;

	  std::vector<std::string> EHArgs =
	      Args.getAllArgValues(options::OPT__SLASH_EH);
	  // The values are only read, so bind by const reference instead of copying
	  // each string.
	  for (const std::string &EHVal : EHArgs) {
	    for (size_t I = 0, E = EHVal.size(); I != E; ++I) {
	      switch (EHVal[I]) {
	      case 'a':
	        EH.Asynch = maybeConsumeDash(EHVal, I);
	        if (EH.Asynch)
	          EH.Synch = false;
	        continue;
	      case 'c':
	        EH.NoUnwindC = maybeConsumeDash(EHVal, I);
	        continue;
	      case 's':
	        EH.Synch = maybeConsumeDash(EHVal, I);
	        if (EH.Synch)
	          EH.Asynch = false;
	        continue;
	      default:
	        break;
	      }
	      // Reached only for an unrecognized modifier: diagnose and stop
	      // scanning this value.
	      D.Diag(clang::diag::err_drv_invalid_value) << "/EH" << EHVal;
	      break;
	    }
	  }
	  // The /GX, /GX- flags are only processed if there are not /EH flags.
	  // The default is that /GX is not specified.
	  if (EHArgs.empty() &&
	      Args.hasFlag(options::OPT__SLASH_GX, options::OPT__SLASH_GX_,
	                   /*Default=*/false)) {
	    EH.Synch = true;
	    EH.NoUnwindC = true;
	  }

	  return EH;
	}

	/// Translate clang-cl style driver options into frontend arguments.
	///
	/// Appends to \p CmdArgs, in order: the CRT selection defines and dependent
	/// libraries, include-listing flags, RTTI data and stack protector settings,
	/// exception flags, preprocessing flags for /EP, the volatile mode,
	/// dllexport-inlines, member-pointer representation, the default calling
	/// convention, the vtordisp mode, the diagnostics format and control-flow
	/// guard flags.
	///
	/// \param Args          driver arguments to inspect.
	/// \param InputType     type of the input; gates the C++-only flags.
	/// \param CmdArgs       argument list that receives the translated flags.
	/// \param DebugInfoKind [out] written only when one of the CodeView-enabling
	///                      debug options is present; otherwise left untouched.
	/// \param EmitCodeView  [out] always written (dereferenced without a null
	///                      check): true when one of those options is present.
	void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType,
	ArgStringList &CmdArgs,
	codegenoptions::DebugInfoKind *DebugInfoKind,
	bool *EmitCodeView) const {
	// Runtime library selection starts at /MT and is refined below.
	unsigned RTOptionID = options::OPT__SLASH_MT;
	bool isNVPTX = getToolChain().getTriple().isNVPTX();

	if (Args.hasArg(options::OPT__SLASH_LDd))
	// The /LDd option implies /MTd. The dependent lib part can be overridden,
	// but defining _DEBUG is sticky.
	RTOptionID = options::OPT__SLASH_MTd;

	// The last option of the /M group, if any, overrides the choice made above.
	if (Arg *A = Args.getLastArg(options::OPT__SLASH_M_Group))
	RTOptionID = A->getOption().getID();

	// Emit the defines for the selected runtime and remember which CRT library
	// to request. The non-debug variants still define _DEBUG when /LDd was given.
	StringRef FlagForCRT;
	switch (RTOptionID) {
	case options::OPT__SLASH_MD:
	if (Args.hasArg(options::OPT__SLASH_LDd))
	CmdArgs.push_back("-D_DEBUG");
	CmdArgs.push_back("-D_MT");
	CmdArgs.push_back("-D_DLL");
	FlagForCRT = "--dependent-lib=msvcrt";
	break;
	case options::OPT__SLASH_MDd:
	CmdArgs.push_back("-D_DEBUG");
	CmdArgs.push_back("-D_MT");
	CmdArgs.push_back("-D_DLL");
	FlagForCRT = "--dependent-lib=msvcrtd";
	break;
	case options::OPT__SLASH_MT:
	if (Args.hasArg(options::OPT__SLASH_LDd))
	CmdArgs.push_back("-D_DEBUG");
	CmdArgs.push_back("-D_MT");
	CmdArgs.push_back("-flto-visibility-public-std");
	FlagForCRT = "--dependent-lib=libcmt";
	break;
	case options::OPT__SLASH_MTd:
	CmdArgs.push_back("-D_DEBUG");
	CmdArgs.push_back("-D_MT");
	CmdArgs.push_back("-flto-visibility-public-std");
	FlagForCRT = "--dependent-lib=libcmtd";
	break;
	default:
	llvm_unreachable("Unexpected option ID.");
	}

	// /Zl suppresses the dependent-lib requests and defines _VC_NODEFAULTLIB
	// instead.
	if (Args.hasArg(options::OPT__SLASH_Zl)) {
	CmdArgs.push_back("-D_VC_NODEFAULTLIB");
	} else {
	// FlagForCRT was assigned from a string literal in every switch case above.
	CmdArgs.push_back(FlagForCRT.data());

	// This provides POSIX compatibility (maps 'open' to '_open'), which most
	// users want. The /Za flag to cl.exe turns this off, but it's not
	// implemented in clang.
	CmdArgs.push_back("--dependent-lib=oldnames");
	}

	// Either show-includes spelling enables --show-includes; only the plain
	// form additionally adds -sys-header-deps.
	if (Arg *ShowIncludes =
	Args.getLastArg(options::OPT__SLASH_showIncludes,
	options::OPT__SLASH_showIncludes_user)) {
	CmdArgs.push_back("--show-includes");
	if (ShowIncludes->getOption().matches(options::OPT__SLASH_showIncludes))
	CmdArgs.push_back("-sys-header-deps");
	}

	// This controls whether or not we emit RTTI data for polymorphic types.
	if (Args.hasFlag(options::OPT__SLASH_GR_, options::OPT__SLASH_GR,
	/Default=/false))
	CmdArgs.push_back("-fno-rtti-data");

	// This controls whether or not we emit stack-protector instrumentation.
	// In MSVC, Buffer Security Check (/GS) is on by default.
	if (!isNVPTX && Args.hasFlag(options::OPT__SLASH_GS, options::OPT__SLASH_GS_,
	/Default=/true)) {
	CmdArgs.push_back("-stack-protector");
	CmdArgs.push_back(Args.MakeArgString(Twine(LangOptions::SSPStrong)));
	}

	// Emit CodeView if -Z7, -Zd, or -gline-tables-only are present.
	if (Arg *DebugInfoArg =
	Args.getLastArg(options::OPT__SLASH_Z7, options::OPT__SLASH_Zd,
	options::OPT_gline_tables_only)) {
	*EmitCodeView = true;
	if (DebugInfoArg->getOption().matches(options::OPT__SLASH_Z7))
	*DebugInfoKind = codegenoptions::DebugInfoConstructor;
	else
	*DebugInfoKind = codegenoptions::DebugLineTablesOnly;
	} else {
	*EmitCodeView = false;
	}

	// Exception handling: either cleanup mode enables exceptions (never for
	// NVPTX); -fexternc-nounwind is added for C++ inputs when both the
	// synchronous mode and the nounwind-C assumption are on.
	const Driver &D = getToolChain().getDriver();
	EHFlags EH = parseClangCLEHFlags(D, Args);
	if (!isNVPTX && (EH.Synch \|\| EH.Asynch)) {
	if (types::isCXX(InputType))
	CmdArgs.push_back("-fcxx-exceptions");
	CmdArgs.push_back("-fexceptions");
	}
	if (types::isCXX(InputType) && EH.Synch && EH.NoUnwindC)
	CmdArgs.push_back("-fexternc-nounwind");

	// /EP should expand to -E -P.
	if (Args.hasArg(options::OPT__SLASH_EP)) {
	CmdArgs.push_back("-E");
	CmdArgs.push_back("-P");
	}

	// The volatile mode defaults to "ms" on x86 targets and "iso" elsewhere; an
	// explicit option from the volatile group overrides the default.
	unsigned VolatileOptionID;
	if (getToolChain().getTriple().isX86())
	VolatileOptionID = options::OPT__SLASH_volatile_ms;
	else
	VolatileOptionID = options::OPT__SLASH_volatile_iso;

	if (Arg *A = Args.getLastArg(options::OPT__SLASH_volatile_Group))
	VolatileOptionID = A->getOption().getID();

	if (VolatileOptionID == options::OPT__SLASH_volatile_ms)
	CmdArgs.push_back("-fms-volatile");

	// Disabling dllexport of inlines is diagnosed as an error when combined
	// with /fallback; otherwise it is forwarded.
	if (Args.hasFlag(options::OPT__SLASH_Zc_dllexportInlines_,
	options::OPT__SLASH_Zc_dllexportInlines,
	false)) {
	if (Args.hasArg(options::OPT__SLASH_fallback)) {
	D.Diag(clang::diag::err_drv_dllexport_inlines_and_fallback);
	} else {
	CmdArgs.push_back("-fno-dllexport-inlines");
	}
	}

	// /vmg and /vmb are mutually exclusive.
	Arg *MostGeneralArg = Args.getLastArg(options::OPT__SLASH_vmg);
	Arg *BestCaseArg = Args.getLastArg(options::OPT__SLASH_vmb);
	if (MostGeneralArg && BestCaseArg)
	D.Diag(clang::diag::err_drv_argument_not_allowed_with)
	<< MostGeneralArg->getAsString(Args) << BestCaseArg->getAsString(Args);

	// With /vmg, select the member-pointer representation from /vms, /vmm and
	// /vmv; "virtual" is used when neither /vms nor /vmm is given.
	if (MostGeneralArg) {
	Arg *SingleArg = Args.getLastArg(options::OPT__SLASH_vms);
	Arg *MultipleArg = Args.getLastArg(options::OPT__SLASH_vmm);
	Arg *VirtualArg = Args.getLastArg(options::OPT__SLASH_vmv);

	// Pick two of the three options so that any pair given together yields two
	// distinct non-null pointers, which is then diagnosed.
	Arg *FirstConflict = SingleArg ? SingleArg : MultipleArg;
	Arg *SecondConflict = VirtualArg ? VirtualArg : MultipleArg;
	if (FirstConflict && SecondConflict && FirstConflict != SecondConflict)
	D.Diag(clang::diag::err_drv_argument_not_allowed_with)
	<< FirstConflict->getAsString(Args)
	<< SecondConflict->getAsString(Args);

	if (SingleArg)
	CmdArgs.push_back("-fms-memptr-rep=single");
	else if (MultipleArg)
	CmdArgs.push_back("-fms-memptr-rep=multiple");
	else
	CmdArgs.push_back("-fms-memptr-rep=virtual");
	}

	// Parse the default calling convention options.
	if (Arg *CCArg =
	Args.getLastArg(options::OPT__SLASH_Gd, options::OPT__SLASH_Gr,
	options::OPT__SLASH_Gz, options::OPT__SLASH_Gv,
	options::OPT__SLASH_Gregcall)) {
	unsigned DCCOptId = CCArg->getOption().getID();
	const char *DCCFlag = nullptr;
	// Each non-cdecl case below replaces this with its own architecture check.
	bool ArchSupported = !isNVPTX;
	llvm::Triple::ArchType Arch = getToolChain().getArch();
	switch (DCCOptId) {
	case options::OPT__SLASH_Gd:
	DCCFlag = "-fdefault-calling-conv=cdecl";
	break;
	case options::OPT__SLASH_Gr:
	ArchSupported = Arch == llvm::Triple::x86;
	DCCFlag = "-fdefault-calling-conv=fastcall";
	break;
	case options::OPT__SLASH_Gz:
	ArchSupported = Arch == llvm::Triple::x86;
	DCCFlag = "-fdefault-calling-conv=stdcall";
	break;
	case options::OPT__SLASH_Gv:
	ArchSupported = Arch == llvm::Triple::x86 \|\| Arch == llvm::Triple::x86_64;
	DCCFlag = "-fdefault-calling-conv=vectorcall";
	break;
	case options::OPT__SLASH_Gregcall:
	ArchSupported = Arch == llvm::Triple::x86 \|\| Arch == llvm::Triple::x86_64;
	DCCFlag = "-fdefault-calling-conv=regcall";
	break;
	}

	// MSVC doesn't warn if /Gr or /Gz is used on x64, so we don't either.
	if (ArchSupported && DCCFlag)
	CmdArgs.push_back(DCCFlag);
	}

	Args.AddLastArg(CmdArgs, options::OPT_vtordisp_mode_EQ);

	// Unless the user chose a diagnostics format, use an MSVC-style one (the
	// fallback variant when /fallback is in effect).
	if (!Args.hasArg(options::OPT_fdiagnostics_format_EQ)) {
	CmdArgs.push_back("-fdiagnostics-format");
	if (Args.hasArg(options::OPT__SLASH_fallback))
	CmdArgs.push_back("msvc-fallback");
	else
	CmdArgs.push_back("msvc");
	}

	if (Arg *A = Args.getLastArg(options::OPT__SLASH_guard)) {
	StringRef GuardArgs = A->getValue();
	// The only valid options are "cf", "cf,nochecks", and "cf-".
	// The comparison is case-insensitive.
	if (GuardArgs.equals_lower("cf")) {
	// Emit CFG instrumentation and the table of address-taken functions.
	CmdArgs.push_back("-cfguard");
	} else if (GuardArgs.equals_lower("cf,nochecks")) {
	// Emit only the table of address-taken functions.
	CmdArgs.push_back("-cfguard-no-checks");
	} else if (GuardArgs.equals_lower("cf-")) {
	// Do nothing, but we might want to emit a security warning in future.
	} else {
	D.Diag(diag::err_drv_invalid_value) << A->getSpelling() << GuardArgs;
	}
	}
	}

	/// Return the visualstudio::Compiler tool used for /fallback, creating it
	/// on first use.
	///
	/// The instance is cached in CLFallback, so later calls return the same
	/// object. The returned pointer is non-owning.
	visualstudio::Compiler *Clang::getCLFallback() const {
	  if (!CLFallback)
	    // std::make_unique instead of a raw `new` handed to reset().
	    CLFallback = std::make_unique<visualstudio::Compiler>(getToolChain());
	  return CLFallback.get();
	}


	const char *Clang::getBaseInputName(const ArgList &Args,
	const InputInfo &Input) {
	return Args.MakeArgString(llvm::sys::path::filename(Input.getBaseInput()));
	}

	/// Return the base name of the first input with everything from its last
	/// '.' onwards removed. A name without a '.' is returned unchanged.
	const char *Clang::getBaseInputStem(const ArgList &Args,
	                                    const InputInfoList &Inputs) {
	  const char *Name = getBaseInputName(Args, Inputs[0]);

	  const char *LastDot = strrchr(Name, '.');
	  if (!LastDot)
	    return Name;

	  return Args.MakeArgString(std::string(Name, LastDot));
	}

	const char *Clang::getDependencyFileName(const ArgList &Args,
	const InputInfoList &Inputs) {
	// FIXME: Think about this more.

	if (Arg *OutputOpt = Args.getLastArg(options::OPT_o)) {
	SmallString<128> OutputFilename(OutputOpt->getValue());
	llvm::sys::path::replace_extension(OutputFilename, llvm::Twine('d'));
	return Args.MakeArgString(OutputFilename);
	}

	return Args.MakeArgString(Twine(getBaseInputStem(Args, Inputs)) + ".d");
	}

	// Begin ClangAs

	void ClangAs::AddMIPSTargetArgs(const ArgList &Args,
	ArgStringList &CmdArgs) const {
	StringRef CPUName;
	StringRef ABIName;
	const llvm::Triple &Triple = getToolChain().getTriple();
	mips::getMipsCPUAndABI(Args, Triple, CPUName, ABIName);

	CmdArgs.push_back("-target-abi");
	CmdArgs.push_back(ABIName.data());
	}

	void ClangAs::AddX86TargetArgs(const ArgList &Args,
	ArgStringList &CmdArgs) const {
	addX86AlignBranchArgs(getToolChain().getDriver(), Args, CmdArgs,
	/IsLTO=/false);

	if (Arg *A = Args.getLastArg(options::OPT_masm_EQ)) {
	StringRef Value = A->getValue();
	if (Value == "intel" \|\| Value == "att") {
	CmdArgs.push_back("-mllvm");
	CmdArgs.push_back(Args.MakeArgString("-x86-asm-syntax=" + Value));
	} else {
	getToolChain().getDriver().Diag(diag::err_drv_unsupported_option_argument)
	<< A->getOption().getName() << Value;
	}
	}
	}

	void ClangAs::AddRISCVTargetArgs(const ArgList &Args,
	ArgStringList &CmdArgs) const {
	const llvm::Triple &Triple = getToolChain().getTriple();
	StringRef ABIName = riscv::getRISCVABI(Args, Triple);

	CmdArgs.push_back("-target-abi");
	CmdArgs.push_back(ABIName.data());
	}

	/// Builds the integrated-assembler (-cc1as) invocation for a single input and
	/// adds it to the compilation \p C.
	///
	/// The arguments are emitted in this order: mode and triple, output file
	/// type, main file name, target CPU and features, include paths, debug-info
	/// options, relocation model, optional -dwarf-debug-flags, target-specific
	/// flags, assembler-related driver options, -mllvm options, the output file,
	/// optional split-DWARF output, and finally the input file.
	void ClangAs::ConstructJob(Compilation &C, const JobAction &JA,
	                           const InputInfo &Output, const InputInfoList &Inputs,
	                           const ArgList &Args,
	                           const char *LinkingOutput) const {
	  ArgStringList CmdArgs;

	  assert(Inputs.size() == 1 && "Unexpected number of inputs.");
	  const InputInfo &Input = Inputs[0];

	  const llvm::Triple &Triple = getToolChain().getEffectiveTriple();
	  const std::string &TripleStr = Triple.getTriple();
	  const auto &D = getToolChain().getDriver();

	  // Don't warn about "clang -w -c foo.s"
	  Args.ClaimAllArgs(options::OPT_w);
	  // and "clang -emit-llvm -c foo.s"
	  Args.ClaimAllArgs(options::OPT_emit_llvm);

	  claimNoWarnArgs(Args);

	  // Invoke ourselves in -cc1as mode.
	  //
	  // FIXME: Implement custom jobs for internal actions.
	  CmdArgs.push_back("-cc1as");

	  // Add the "effective" target triple.
	  CmdArgs.push_back("-triple");
	  CmdArgs.push_back(Args.MakeArgString(TripleStr));

	  // Set the output mode, we currently only expect to be used as a real
	  // assembler.
	  CmdArgs.push_back("-filetype");
	  CmdArgs.push_back("obj");

	  // Set the main file name, so that debug info works even with
	  // -save-temps or preprocessed assembly.
	  CmdArgs.push_back("-main-file-name");
	  CmdArgs.push_back(Clang::getBaseInputName(Args, Input));

	  // Add the target cpu
	  std::string CPU = getCPUName(Args, Triple, /*FromAs*/ true);
	  if (!CPU.empty()) {
	    CmdArgs.push_back("-target-cpu");
	    CmdArgs.push_back(Args.MakeArgString(CPU));
	  }

	  // Add the target features
	  getTargetFeatures(D, Triple, Args, CmdArgs, true);

	  // Ignore explicit -force_cpusubtype_ALL option.
	  (void)Args.hasArg(options::OPT_force__cpusubtype__ALL);

	  // Pass along any -I options so we get proper .include search paths.
	  Args.AddAllArgs(CmdArgs, options::OPT_I_Group);

	  // Determine the original source input by walking the first input of each
	  // action until an input action is reached.
	  const Action *SourceAction = &JA;
	  while (SourceAction->getKind() != Action::InputClass) {
	    assert(!SourceAction->getInputs().empty() && "unexpected root action!");
	    SourceAction = SourceAction->getInputs()[0];
	  }

	  // Forward -g and handle debug info related flags, assuming we are dealing
	  // with an actual assembly file.
	  bool WantDebug = false;
	  unsigned DwarfVersion = 0;
	  Args.ClaimAllArgs(options::OPT_g_Group);
	  if (Arg *A = Args.getLastArg(options::OPT_g_Group)) {
	    WantDebug = !A->getOption().matches(options::OPT_g0) &&
	                !A->getOption().matches(options::OPT_ggdb0);
	    if (WantDebug)
	      DwarfVersion = DwarfVersionNum(A->getSpelling());
	  }

	  // Fall back first to the parsed default version, then to the toolchain's
	  // default.
	  unsigned DefaultDwarfVersion = ParseDebugDefaultVersion(getToolChain(), Args);
	  if (DwarfVersion == 0)
	    DwarfVersion = DefaultDwarfVersion;

	  if (DwarfVersion == 0)
	    DwarfVersion = getToolChain().GetDefaultDwarfVersion();

	  codegenoptions::DebugInfoKind DebugInfoKind = codegenoptions::NoDebugInfo;

	  if (SourceAction->getType() == types::TY_Asm ||
	      SourceAction->getType() == types::TY_PP_Asm) {
	    // You might think that it would be ok to set DebugInfoKind outside of
	    // the guard for source type, however there is a test which asserts
	    // that *some* assembler invocation receives no -debug-info-kind,
	    // and it's not clear whether that test is just overly restrictive.
	    DebugInfoKind = (WantDebug ? codegenoptions::DebugInfoConstructor
	                               : codegenoptions::NoDebugInfo);
	    // Add the -fdebug-compilation-dir flag if needed.
	    addDebugCompDirArg(Args, CmdArgs, C.getDriver().getVFS());

	    addDebugPrefixMapArg(getToolChain().getDriver(), Args, CmdArgs);

	    // Set the AT_producer to the clang version when using the integrated
	    // assembler on assembly source files.
	    CmdArgs.push_back("-dwarf-debug-producer");
	    CmdArgs.push_back(Args.MakeArgString(getClangFullVersion()));

	    // And pass along -I options
	    Args.AddAllArgs(CmdArgs, options::OPT_I);
	  }
	  RenderDebugEnablingArgs(Args, CmdArgs, DebugInfoKind, DwarfVersion,
	                          llvm::DebuggerKind::Default);
	  RenderDebugInfoCompressionArgs(Args, CmdArgs, D, getToolChain());


	  // Handle -fPIC et al -- the relocation-model affects the assembler
	  // for some targets.
	  llvm::Reloc::Model RelocationModel;
	  unsigned PICLevel;
	  bool IsPIE;
	  std::tie(RelocationModel, PICLevel, IsPIE) =
	      ParsePICArgs(getToolChain(), Args);

	  const char *RMName = RelocationModelName(RelocationModel);
	  if (RMName) {
	    CmdArgs.push_back("-mrelocation-model");
	    CmdArgs.push_back(RMName);
	  }

	  // Optionally embed the -cc1as level arguments into the debug info, for build
	  // analysis.
	  if (getToolChain().UseDwarfDebugFlags()) {
	    ArgStringList OriginalArgs;
	    for (const auto &Arg : Args)
	      Arg->render(Args, OriginalArgs);

	    SmallString<256> Flags;
	    const char *Exec = getToolChain().getDriver().getClangProgramPath();
	    EscapeSpacesAndBackslashes(Exec, Flags);
	    for (const char *OriginalArg : OriginalArgs) {
	      SmallString<128> EscapedArg;
	      EscapeSpacesAndBackslashes(OriginalArg, EscapedArg);
	      Flags += " ";
	      Flags += EscapedArg;
	    }
	    CmdArgs.push_back("-dwarf-debug-flags");
	    CmdArgs.push_back(Args.MakeArgString(Flags));
	  }

	  // FIXME: Add -static support, once we have it.

	  // Add target specific flags.
	  switch (getToolChain().getArch()) {
	  default:
	    break;

	  case llvm::Triple::mips:
	  case llvm::Triple::mipsel:
	  case llvm::Triple::mips64:
	  case llvm::Triple::mips64el:
	    AddMIPSTargetArgs(Args, CmdArgs);
	    break;

	  case llvm::Triple::x86:
	  case llvm::Triple::x86_64:
	    AddX86TargetArgs(Args, CmdArgs);
	    break;

	  case llvm::Triple::arm:
	  case llvm::Triple::armeb:
	  case llvm::Triple::thumb:
	  case llvm::Triple::thumbeb:
	    // This isn't in AddARMTargetArgs because we want to do this for assembly
	    // only, not C/C++.
	    if (Args.hasFlag(options::OPT_mdefault_build_attributes,
	                     options::OPT_mno_default_build_attributes, true)) {
	      CmdArgs.push_back("-mllvm");
	      CmdArgs.push_back("-arm-add-build-attributes");
	    }
	    break;

	  case llvm::Triple::riscv32:
	  case llvm::Triple::riscv64:
	    AddRISCVTargetArgs(Args, CmdArgs);
	    break;
	  }

	  // Consume all the warning flags. Usually this would be handled more
	  // gracefully by -cc1 (warning about unknown warning flags, etc) but -cc1as
	  // doesn't handle that so rather than warning about unused flags that are
	  // actually used, we'll lie by omission instead.
	  // FIXME: Stop lying and consume only the appropriate driver flags
	  Args.ClaimAllArgs(options::OPT_W_Group);

	  CollectArgsForIntegratedAssembler(C, Args, CmdArgs,
	                                    getToolChain().getDriver());

	  Args.AddAllArgs(CmdArgs, options::OPT_mllvm);

	  assert(Output.isFilename() && "Unexpected lipo output.");
	  CmdArgs.push_back("-o");
	  CmdArgs.push_back(Output.getFilename());

	  // Split DWARF output is only requested for ELF targets.
	  const llvm::Triple &T = getToolChain().getTriple();
	  Arg *A;
	  if (getDebugFissionKind(D, Args, A) == DwarfFissionKind::Split &&
	      T.isOSBinFormatELF()) {
	    CmdArgs.push_back("-split-dwarf-output");
	    CmdArgs.push_back(SplitDebugName(Args, Input, Output));
	  }

	  assert(Input.isFilename() && "Invalid input.");
	  CmdArgs.push_back(Input.getFilename());

	  const char *Exec = getToolChain().getDriver().getClangProgramPath();
	  C.addCommand(std::make_unique<Command>(
	      JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs));
	}

	// Begin OffloadBundler

	/// Builds the clang-offload-bundler command that bundles \p Inputs into the
	/// single file \p Output and adds it to the compilation \p C.
	///
	/// Emits -type=, -targets= (one "<offload kind>-<triple>[-<arch>]" entry per
	/// input), -outputs= and -inputs=, in that order.
	void OffloadBundler::ConstructJob(Compilation &C, const JobAction &JA,
	                                  const InputInfo &Output,
	                                  const InputInfoList &Inputs,
	                                  const llvm::opt::ArgList &TCArgs,
	                                  const char *LinkingOutput) const {
	  // The version with only one output is expected to refer to a bundling job.
	  assert(isa<OffloadBundlingJobAction>(JA) && "Expecting bundling job!");

	  // The bundling command looks like this:
	  // clang-offload-bundler -type=bc
	  //   -targets=host-triple,openmp-triple1,openmp-triple2
	  //   -outputs=input_file
	  //   -inputs=unbundle_file_host,unbundle_file_tgt1,unbundle_file_tgt2"

	  ArgStringList CmdArgs;

	  // Get the type.
	  CmdArgs.push_back(TCArgs.MakeArgString(
	      Twine("-type=") + types::getTypeTempSuffix(Output.getType())));

	  assert(JA.getInputs().size() == Inputs.size() &&
	         "Not have inputs for all dependence actions??");

	  // Get the targets.
	  SmallString<128> Triples;
	  Triples += "-targets=";
	  for (unsigned I = 0; I < Inputs.size(); ++I) {
	    if (I)
	      Triples += ',';

	    // Find ToolChain for this input. Inputs that are not offload actions use
	    // this tool's own toolchain and the host offload kind.
	    Action::OffloadKind CurKind = Action::OFK_Host;
	    const ToolChain *CurTC = &getToolChain();
	    const Action *CurDep = JA.getInputs()[I];

	    if (const auto *OA = dyn_cast<OffloadAction>(CurDep)) {
	      CurTC = nullptr;
	      OA->doOnEachDependence([&](Action *A, const ToolChain *TC, const char *) {
	        assert(CurTC == nullptr && "Expected one dependence!");
	        CurKind = A->getOffloadingDeviceKind();
	        CurTC = TC;
	      });
	    }
	    Triples += Action::GetOffloadKindName(CurKind);
	    Triples += '-';
	    Triples += CurTC->getTriple().normalize();
	    // HIP entries additionally carry the offloading architecture, if any.
	    if (CurKind == Action::OFK_HIP && CurDep->getOffloadingArch()) {
	      Triples += '-';
	      Triples += CurDep->getOffloadingArch();
	    }
	  }
	  CmdArgs.push_back(TCArgs.MakeArgString(Triples));

	  // Get bundled file command.
	  CmdArgs.push_back(
	      TCArgs.MakeArgString(Twine("-outputs=") + Output.getFilename()));

	  // Get unbundled files command.
	  SmallString<128> UB;
	  UB += "-inputs=";
	  for (unsigned I = 0; I < Inputs.size(); ++I) {
	    if (I)
	      UB += ',';

	    // Find ToolChain for this input.
	    const ToolChain *CurTC = &getToolChain();
	    if (const auto *OA = dyn_cast<OffloadAction>(JA.getInputs()[I])) {
	      CurTC = nullptr;
	      OA->doOnEachDependence([&](Action *, const ToolChain *TC, const char *) {
	        assert(CurTC == nullptr && "Expected one dependence!");
	        CurTC = TC;
	      });
	    }
	    UB += CurTC->getInputFilename(Inputs[I]);
	  }
	  CmdArgs.push_back(TCArgs.MakeArgString(UB));

	  // All the inputs are encoded as commands.
	  C.addCommand(std::make_unique<Command>(
	      JA, *this, ResponseFileSupport::None(),
	      TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())),
	      CmdArgs, None));
	}

	/// Builds the clang-offload-bundler command that unbundles the single input
	/// file into \p Outputs and adds it to the compilation \p C.
	///
	/// Emits -type=, -targets= (one entry per dependent action), -inputs=,
	/// -outputs= and -unbundle, in that order.
	void OffloadBundler::ConstructJobMultipleOutputs(
	    Compilation &C, const JobAction &JA, const InputInfoList &Outputs,
	    const InputInfoList &Inputs, const llvm::opt::ArgList &TCArgs,
	    const char *LinkingOutput) const {
	  // The version with multiple outputs is expected to refer to a unbundling job.
	  auto &UA = cast<OffloadUnbundlingJobAction>(JA);

	  // The unbundling command looks like this:
	  // clang-offload-bundler -type=bc
	  //   -targets=host-triple,openmp-triple1,openmp-triple2
	  //   -inputs=input_file
	  //   -outputs=unbundle_file_host,unbundle_file_tgt1,unbundle_file_tgt2"
	  //   -unbundle

	  ArgStringList CmdArgs;

	  assert(Inputs.size() == 1 && "Expecting to unbundle a single file!");
	  InputInfo BundledInput = Inputs.front();

	  // -type=<temp suffix of the bundled input's type>
	  CmdArgs.push_back(TCArgs.MakeArgString(
	      Twine("-type=") + types::getTypeTempSuffix(BundledInput.getType())));

	  // -targets=<kind>-<triple>[-<bound arch>],...
	  SmallString<128> TargetsArg;
	  TargetsArg += "-targets=";
	  auto DepInfo = UA.getDependentActionsInfo();
	  for (unsigned Idx = 0; Idx < DepInfo.size(); ++Idx) {
	    if (Idx != 0)
	      TargetsArg += ',';

	    auto &Dep = DepInfo[Idx];
	    TargetsArg += Action::GetOffloadKindName(Dep.DependentOffloadKind);
	    TargetsArg += '-';
	    TargetsArg += Dep.DependentToolChain->getTriple().normalize();

	    // HIP entries additionally carry the bound architecture when present.
	    const bool IsHIPWithArch = Dep.DependentOffloadKind == Action::OFK_HIP &&
	                               !Dep.DependentBoundArch.empty();
	    if (IsHIPWithArch) {
	      TargetsArg += '-';
	      TargetsArg += Dep.DependentBoundArch;
	    }
	  }

	  CmdArgs.push_back(TCArgs.MakeArgString(TargetsArg));

	  // -inputs=<bundled file>
	  CmdArgs.push_back(
	      TCArgs.MakeArgString(Twine("-inputs=") + BundledInput.getFilename()));

	  // -outputs=<one file per output, named by the dependent toolchain>
	  SmallString<128> OutputsArg;
	  OutputsArg += "-outputs=";
	  for (unsigned Idx = 0; Idx < Outputs.size(); ++Idx) {
	    if (Idx != 0)
	      OutputsArg += ',';
	    OutputsArg += DepInfo[Idx].DependentToolChain->getInputFilename(Outputs[Idx]);
	  }
	  CmdArgs.push_back(TCArgs.MakeArgString(OutputsArg));
	  CmdArgs.push_back("-unbundle");

	  // All the inputs are encoded as commands.
	  C.addCommand(std::make_unique<Command>(
	      JA, *this, ResponseFileSupport::None(),
	      TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())),
	      CmdArgs, None));
	}

	/// Builds the offload-wrapper command and adds it to the compilation \p C.
	/// The arguments are "-target <effective triple>", "-o <output>" and then
	/// every input file name.
	void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA,
	                                  const InputInfo &Output,
	                                  const InputInfoList &Inputs,
	                                  const ArgList &Args,
	                                  const char *LinkingOutput) const {
	  ArgStringList WrapperArgs;

	  // Add the "effective" target triple.
	  const llvm::Triple &EffectiveTriple = getToolChain().getEffectiveTriple();
	  WrapperArgs.push_back("-target");
	  WrapperArgs.push_back(Args.MakeArgString(EffectiveTriple.getTriple()));

	  // Add the output file name.
	  assert(Output.isFilename() && "Invalid output.");
	  WrapperArgs.push_back("-o");
	  WrapperArgs.push_back(Output.getFilename());

	  // Add inputs.
	  for (const InputInfo &In : Inputs) {
	    assert(In.isFilename() && "Invalid input.");
	    WrapperArgs.push_back(In.getFilename());
	  }

	  const char *WrapperPath =
	      Args.MakeArgString(getToolChain().GetProgramPath(getShortName()));
	  C.addCommand(std::make_unique<Command>(
	      JA, *this, ResponseFileSupport::None(), WrapperPath, WrapperArgs, Inputs));
	}
	diff --git a/contrib/llvm-project/libunwind/src/AddressSpace.hpp b/contrib/llvm-project/libunwind/src/AddressSpace.hpp
	index d9d17ddb7bf4..e40c23291f84 100644
	--- a/contrib/llvm-project/libunwind/src/AddressSpace.hpp
	+++ b/contrib/llvm-project/libunwind/src/AddressSpace.hpp
	@@ -1,693 +1,693 @@
	//===------------------------- AddressSpace.hpp ---------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//
	// Abstracts accessing local vs remote address spaces.
	//
	//===----------------------------------------------------------------------===//

	#ifndef __ADDRESSSPACE_HPP__
	#define __ADDRESSSPACE_HPP__

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#ifndef _LIBUNWIND_USE_DLADDR
	#if !defined(_LIBUNWIND_IS_BAREMETAL) && !defined(_WIN32)
	#define _LIBUNWIND_USE_DLADDR 1
	#else
	#define _LIBUNWIND_USE_DLADDR 0
	#endif
	#endif

	#if _LIBUNWIND_USE_DLADDR
	#include <dlfcn.h>
	#if defined(__ELF__) && defined(_LIBUNWIND_LINK_DL_LIB)
	#pragma comment(lib, "dl")
	#endif
	#endif

	#if defined(_LIBUNWIND_ARM_EHABI)
	// Layout of one ARM EHABI index entry (two 32-bit words). The Bionic path in
	// findUnwindSections() uses sizeof(EHABIIndexEntry) to turn the entry count
	// reported by dl_unwind_find_exidx() into a section length.
	struct EHABIIndexEntry {
	// NOTE(review): presumably the offset locating the function this entry
	// describes; the encoding is not visible in this file.
	uint32_t functionOffset;
	// NOTE(review): presumably the unwind data (or a reference to it) for that
	// function; confirm against the ARM EHABI specification.
	uint32_t data;
	};
	#endif

	#ifdef __APPLE__
	#include <mach-o/getsect.h>
	namespace libunwind {
	bool checkKeyMgrRegisteredFDEs(uintptr_t targetAddr, void *&fde);
	}
	#endif

	#include "libunwind.h"
	#include "config.h"
	#include "dwarf2.h"
	#include "EHHeaderParser.hpp"
	#include "Registers.hpp"

	#ifdef __APPLE__

	// Result record filled in by _dyld_find_unwind_sections() for the Mach-O
	// image that contains a given address.
	struct dyld_unwind_sections
	{
	// Header of the image; findUnwindSections() reports it as dso_base.
	const struct mach_header* mh;
	// Location and length of the image's DWARF unwind data (the local fallback
	// implementation reads __TEXT,__eh_frame).
	const void* dwarf_section;
	uintptr_t dwarf_section_length;
	// Location and length of the image's compact unwind data (the local fallback
	// implementation reads __TEXT,__unwind_info).
	const void* compact_unwind_section;
	uintptr_t compact_unwind_section_length;
	};
	#if (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) \
	&& (__MAC_OS_X_VERSION_MIN_REQUIRED >= 1070)) \
	\|\| defined(__IPHONE_OS_VERSION_MIN_REQUIRED)
	// In 10.7.0 or later, libSystem.dylib implements this function.
	extern "C" bool _dyld_find_unwind_sections(void *, dyld_unwind_sections *);
	#else
	// In 10.6.x and earlier, we need to implement this functionality. Note
	// that this requires a newer version of libmacho (from cctools) than is
	// present in libSystem on 10.6.x (for getsectiondata).
	// Fallback implementation: locates the Mach-O image containing `addr` with
	// dladdr() and fills `info` with that image's header, __TEXT,__eh_frame and
	// __TEXT,__unwind_info sections. Returns false when no image contains `addr`.
	// A section that is absent is reported with a null pointer and length 0.
	static inline bool _dyld_find_unwind_sections(void* addr,
	                                              dyld_unwind_sections* info) {
	  // Find mach-o image containing address.
	  Dl_info dlinfo;
	  if (!dladdr(addr, &dlinfo))
	    return false;
	#if __LP64__
	  const struct mach_header_64 *mh = (const struct mach_header_64 *)dlinfo.dli_fbase;
	#else
	  const struct mach_header *mh = (const struct mach_header *)dlinfo.dli_fbase;
	#endif

	  // Initialize the return struct
	  info->mh = (const struct mach_header *)mh;
	  info->dwarf_section = getsectiondata(mh, "__TEXT", "__eh_frame", &info->dwarf_section_length);
	  info->compact_unwind_section = getsectiondata(mh, "__TEXT", "__unwind_info", &info->compact_unwind_section_length);

	  if (!info->dwarf_section) {
	    info->dwarf_section_length = 0;
	  }

	  if (!info->compact_unwind_section) {
	    info->compact_unwind_section_length = 0;
	  }

	  return true;
	}
	#endif

	#elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_LIBUNWIND_IS_BAREMETAL)

	// When statically linked on bare-metal, the symbols for the EH table are looked
	// up without going through the dynamic loader.

	// The following linker script may be used to produce the necessary sections and symbols.
	// Unless the --eh-frame-hdr linker option is provided, the section is not generated
	// and does not take space in the output file.
	//
	// .eh_frame :
	// {
	// __eh_frame_start = .;
	// KEEP(*(.eh_frame))
	// __eh_frame_end = .;
	// }
	//
	// .eh_frame_hdr :
	// {
	// KEEP(*(.eh_frame_hdr))
	// }
	//
	// __eh_frame_hdr_start = SIZEOF(.eh_frame_hdr) > 0 ? ADDR(.eh_frame_hdr) : 0;
	// __eh_frame_hdr_end = SIZEOF(.eh_frame_hdr) > 0 ? . : 0;

	extern char __eh_frame_start;
	extern char __eh_frame_end;

	#if defined(_LIBUNWIND_SUPPORT_DWARF_INDEX)
	extern char __eh_frame_hdr_start;
	extern char __eh_frame_hdr_end;
	#endif

	#elif defined(_LIBUNWIND_ARM_EHABI) && defined(_LIBUNWIND_IS_BAREMETAL)

	// When statically linked on bare-metal, the symbols for the EH table are looked
	// up without going through the dynamic loader.
	extern char __exidx_start;
	extern char __exidx_end;

	#elif defined(_LIBUNWIND_ARM_EHABI) \|\| defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND)

	// ELF-based systems may use dl_iterate_phdr() to access sections
	// containing unwinding information. The ElfW() macro for pointer-size
	// independent ELF header traversal is not provided by <link.h> on some
	// systems (e.g., FreeBSD). On these systems the data structures are
	// just called Elf_XXX. Define ElfW() locally.
	#ifndef _WIN32
	#include <link.h>
	#else
	#include <windows.h>
	#include <psapi.h>
	#endif
	#if !defined(ElfW)
	#define ElfW(type) Elf_##type
	#endif

	#endif

	namespace libunwind {

	/// Used by findUnwindSections() to return info about needed sections.
	///
	/// Which members exist depends on the unwind formats compiled in. Each
	/// `*_section` / `*_section_length` pair holds the address of a section and
	/// its length.
	struct UnwindInfoSections {
	#if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) || defined(_LIBUNWIND_SUPPORT_DWARF_INDEX) || \
	    defined(_LIBUNWIND_SUPPORT_COMPACT_UNWIND)
	  // No dso_base for SEH or ARM EHABI.
	  uintptr_t dso_base;
	#endif
	#if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND)
	  uintptr_t dwarf_section;
	  uintptr_t dwarf_section_length;
	#endif
	#if defined(_LIBUNWIND_SUPPORT_DWARF_INDEX)
	  uintptr_t dwarf_index_section;
	  uintptr_t dwarf_index_section_length;
	#endif
	#if defined(_LIBUNWIND_SUPPORT_COMPACT_UNWIND)
	  uintptr_t compact_unwind_section;
	  uintptr_t compact_unwind_section_length;
	#endif
	#if defined(_LIBUNWIND_ARM_EHABI)
	  uintptr_t arm_section;
	  uintptr_t arm_section_length;
	#endif
	};


	/// Address-space policy passed to UnwindCursor when the thread being unwound
	/// belongs to the current process. The accessors are thin inline wrappers
	/// that copy the value straight out of memory, so they compile away and keep
	/// local unwinds fast.
	class _LIBUNWIND_HIDDEN LocalAddressSpace {
	public:
	  typedef uintptr_t pint_t;
	  typedef intptr_t sint_t;

	  // Fixed-width reads. memcpy is used so that `addr` need not be aligned for
	  // the type being read.
	  uint8_t get8(pint_t addr) {
	    uint8_t value;
	    memcpy(&value, reinterpret_cast<const void *>(addr), sizeof(value));
	    return value;
	  }
	  uint16_t get16(pint_t addr) {
	    uint16_t value;
	    memcpy(&value, reinterpret_cast<const void *>(addr), sizeof(value));
	    return value;
	  }
	  uint32_t get32(pint_t addr) {
	    uint32_t value;
	    memcpy(&value, reinterpret_cast<const void *>(addr), sizeof(value));
	    return value;
	  }
	  uint64_t get64(pint_t addr) {
	    uint64_t value;
	    memcpy(&value, reinterpret_cast<const void *>(addr), sizeof(value));
	    return value;
	  }
	  double getDouble(pint_t addr) {
	    double value;
	    memcpy(&value, reinterpret_cast<const void *>(addr), sizeof(value));
	    return value;
	  }
	  v128 getVector(pint_t addr) {
	    v128 value;
	    memcpy(&value, reinterpret_cast<const void *>(addr), sizeof(value));
	    return value;
	  }

	  // Pointer-sized and register-sized reads, defined out of line.
	  uintptr_t getP(pint_t addr);
	  uint64_t getRegister(pint_t addr);

	  // LEB128 decoders; both advance `addr` past the bytes they consume.
	  static uint64_t getULEB128(pint_t &addr, pint_t end);
	  static int64_t getSLEB128(pint_t &addr, pint_t end);

	  pint_t getEncodedP(pint_t &addr, pint_t end, uint8_t encoding,
	                     pint_t datarelBase = 0);
	  bool findFunctionName(pint_t addr, char *buf, size_t bufLen,
	                        unw_word_t *offset);
	  bool findUnwindSections(pint_t targetAddr, UnwindInfoSections &info);
	  bool findOtherFDE(pint_t targetAddr, pint_t &fde);

	  static LocalAddressSpace sThisAddressSpace;
	};

	/// Reads a pointer-sized value at \p addr: 64 bits when pointers are 8 bytes
	/// wide, 32 bits otherwise.
	inline uintptr_t LocalAddressSpace::getP(pint_t addr) {
	#if __SIZEOF_POINTER__ != 8
	  const uintptr_t pointerValue = get32(addr);
	#else
	  const uintptr_t pointerValue = get64(addr);
	#endif
	  return pointerValue;
	}

	/// Reads a register-sized value at \p addr: 64 bits when pointers are 8
	/// bytes wide or when targeting __mips64, 32 bits otherwise.
	inline uint64_t LocalAddressSpace::getRegister(pint_t addr) {
	#if __SIZEOF_POINTER__ == 8 || defined(__mips64)
	  return get64(addr);
	#else
	  return get32(addr);
	#endif
	}

	/// Read a ULEB128 into a 64-bit word.
	///
	/// Decoding starts at \p addr and must not run past \p end. On return
	/// \p addr points just past the last byte consumed. Aborts if the input is
	/// truncated or if the value does not fit in 64 bits.
	inline uint64_t LocalAddressSpace::getULEB128(pint_t &addr, pint_t end) {
	  const uint8_t *p = (uint8_t *)addr;
	  const uint8_t *pend = (uint8_t *)end;
	  uint64_t result = 0;
	  int bit = 0;
	  do {
	    uint64_t b;

	    if (p == pend)
	      _LIBUNWIND_ABORT("truncated uleb128 expression");

	    b = *p & 0x7f;

	    // Reject payload bits that would be shifted out of the 64-bit result.
	    if (bit >= 64 || b << bit >> bit != b) {
	      _LIBUNWIND_ABORT("malformed uleb128 expression");
	    } else {
	      result |= b << bit;
	      bit += 7;
	    }
	  } while (*p++ >= 0x80);
	  addr = (pint_t) p;
	  return result;
	}

	/// Read a SLEB128 into a 64-bit word.
	///
	/// Decoding starts at \p addr and must not run past \p end. On return
	/// \p addr points just past the last byte consumed. Aborts if the input is
	/// truncated.
	inline int64_t LocalAddressSpace::getSLEB128(pint_t &addr, pint_t end) {
	  const uint8_t *p = (uint8_t *)addr;
	  const uint8_t *pend = (uint8_t *)end;
	  // Accumulate in an unsigned 64-bit word: shifting the int-typed
	  // (byte & 0x7f) would overflow once `bit` reaches 32.
	  uint64_t result = 0;
	  int bit = 0;
	  uint8_t byte;
	  do {
	    if (p == pend)
	      _LIBUNWIND_ABORT("truncated sleb128 expression");
	    byte = *p++;
	    result |= (uint64_t)(byte & 0x7f) << bit;
	    bit += 7;
	  } while (byte & 0x80);
	  // sign extend negative numbers; shifting by 64 or more is undefined, and at
	  // that point every bit of the result has already been supplied.
	  if ((byte & 0x40) != 0 && bit < 64)
	    result |= (-1ULL) << bit;
	  addr = (pint_t) p;
	  return (int64_t)result;
	}

	/// Decodes one pointer stored with a DW_EH_PE_* \p encoding.
	///
	/// The low nibble of \p encoding selects the value format and the 0x70 bits
	/// select what the value is relative to; DW_EH_PE_indirect adds a final
	/// pointer load. \p addr is advanced past the bytes consumed and \p end
	/// bounds the LEB128 formats. \p datarelBase is only used for
	/// DW_EH_PE_datarel. Unknown or unsupported encodings abort.
	inline LocalAddressSpace::pint_t
	LocalAddressSpace::getEncodedP(pint_t &addr, pint_t end, uint8_t encoding,
	                               pint_t datarelBase) {
	  pint_t startAddr = addr;
	  const uint8_t *p = (uint8_t *)addr;
	  pint_t result;

	  // first get value
	  switch (encoding & 0x0F) {
	  case DW_EH_PE_ptr:
	    result = getP(addr);
	    p += sizeof(pint_t);
	    addr = (pint_t) p;
	    break;
	  case DW_EH_PE_uleb128:
	    result = (pint_t)getULEB128(addr, end);
	    break;
	  case DW_EH_PE_udata2:
	    result = get16(addr);
	    p += 2;
	    addr = (pint_t) p;
	    break;
	  case DW_EH_PE_udata4:
	    result = get32(addr);
	    p += 4;
	    addr = (pint_t) p;
	    break;
	  case DW_EH_PE_udata8:
	    result = (pint_t)get64(addr);
	    p += 8;
	    addr = (pint_t) p;
	    break;
	  case DW_EH_PE_sleb128:
	    result = (pint_t)getSLEB128(addr, end);
	    break;
	  case DW_EH_PE_sdata2:
	    // Sign extend from signed 16-bit value.
	    result = (pint_t)(int16_t)get16(addr);
	    p += 2;
	    addr = (pint_t) p;
	    break;
	  case DW_EH_PE_sdata4:
	    // Sign extend from signed 32-bit value.
	    result = (pint_t)(int32_t)get32(addr);
	    p += 4;
	    addr = (pint_t) p;
	    break;
	  case DW_EH_PE_sdata8:
	    result = (pint_t)get64(addr);
	    p += 8;
	    addr = (pint_t) p;
	    break;
	  default:
	    _LIBUNWIND_ABORT("unknown pointer encoding");
	  }

	  // then add relative offset
	  switch (encoding & 0x70) {
	  case DW_EH_PE_absptr:
	    // do nothing
	    break;
	  case DW_EH_PE_pcrel:
	    // Relative to the address the encoded value was read from.
	    result += startAddr;
	    break;
	  case DW_EH_PE_textrel:
	    _LIBUNWIND_ABORT("DW_EH_PE_textrel pointer encoding not supported");
	    break;
	  case DW_EH_PE_datarel:
	    // DW_EH_PE_datarel is only valid in a few places, so the parameter has a
	    // default value of 0, and we abort in the event that someone calls this
	    // function with a datarelBase of 0 and DW_EH_PE_datarel encoding.
	    if (datarelBase == 0)
	      _LIBUNWIND_ABORT("DW_EH_PE_datarel is invalid with a datarelBase of 0");
	    result += datarelBase;
	    break;
	  case DW_EH_PE_funcrel:
	    _LIBUNWIND_ABORT("DW_EH_PE_funcrel pointer encoding not supported");
	    break;
	  case DW_EH_PE_aligned:
	    _LIBUNWIND_ABORT("DW_EH_PE_aligned pointer encoding not supported");
	    break;
	  default:
	    _LIBUNWIND_ABORT("unknown pointer encoding");
	    break;
	  }

	  if (encoding & DW_EH_PE_indirect)
	    result = getP(result);

	  return result;
	}

	#ifdef __APPLE__
	#elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_LIBUNWIND_IS_BAREMETAL)
	#elif defined(_LIBUNWIND_ARM_EHABI) && defined(_LIBUNWIND_IS_BAREMETAL)
	#elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_WIN32)
	#elif defined(_LIBUNWIND_SUPPORT_SEH_UNWIND) && defined(_WIN32)
	#elif defined(_LIBUNWIND_ARM_EHABI) && defined(__BIONIC__)
	// Code inside findUnwindSections handles all these cases.
	//
	// Although the above ifdef chain is ugly, there doesn't seem to be a cleaner
	// way to handle it. The generalized boolean expression is:
	//
	// A OR (B AND C) OR (D AND C) OR (B AND E) OR (F AND E) OR (D AND G)
	//
	// Running it through various boolean expression simplifiers gives expressions
	// that don't help at all.
	#elif defined(_LIBUNWIND_ARM_EHABI) \|\| defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND)

	#if !defined(Elf_Half)
	typedef ElfW(Half) Elf_Half;
	#endif
	#if !defined(Elf_Phdr)
	typedef ElfW(Phdr) Elf_Phdr;
	#endif
	#if !defined(Elf_Addr)
	typedef ElfW(Addr) Elf_Addr;
	#endif

	/// Returns the load base of the image described by \p pinfo. This is
	/// dlpi_addr, except on Android before API 18, where a reported base of 0 is
	/// recomputed from the PT_PHDR segment when one exists.
	static Elf_Addr calculateImageBase(struct dl_phdr_info *pinfo) {
	  Elf_Addr base = pinfo->dlpi_addr;
	#if defined(__ANDROID__) && __ANDROID_API__ < 18
	  if (base != 0)
	    return base;

	  // Normally, an image base of 0 indicates a non-PIE executable. On
	  // versions of Android prior to API 18, the dynamic linker reported a
	  // dlpi_addr of 0 for PIE executables. Compute the true image base
	  // using the PT_PHDR segment.
	  // See https://github.com/android/ndk/issues/505.
	  for (Elf_Half idx = 0; idx < pinfo->dlpi_phnum; ++idx) {
	    const Elf_Phdr &segment = pinfo->dlpi_phdr[idx];
	    if (segment.p_type != PT_PHDR)
	      continue;
	    return reinterpret_cast<Elf_Addr>(pinfo->dlpi_phdr) - segment.p_vaddr;
	  }
	#endif
	  return base;
	}

	// Context handed to findUnwindSectionsByPhdr() through the `data` argument
	// of dl_iterate_phdr(); built in LocalAddressSpace::findUnwindSections().
	struct _LIBUNWIND_HIDDEN dl_iterate_cb_data {
	// Address space making the lookup; used to decode the .eh_frame_hdr.
	LocalAddressSpace *addressSpace;
	// Output record that the callback fills in.
	UnwindInfoSections *sects;
	// Address whose containing image is being searched for.
	uintptr_t targetAddr;
	};

	#if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND)
	#if !defined(_LIBUNWIND_SUPPORT_DWARF_INDEX)
	#error "_LIBUNWIND_SUPPORT_DWARF_UNWIND requires _LIBUNWIND_SUPPORT_DWARF_INDEX on this platform."
	#endif

	#if defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE)
	#include "FrameHeaderCache.hpp"

	// There should be just one of these per process.
	static FrameHeaderCache ProcessFrameHeaderCache;
	-#endif // _LIBUNWIND_USE_FRAME_HEADER_CACHE
	+#endif

	/// Returns true when \p phdr is a PT_LOAD segment whose address range
	/// [begin, end) contains cbdata->targetAddr. On a match, records the segment
	/// start as dso_base and the segment's memory size as dwarf_section_length.
	static bool checkAddrInSegment(const Elf_Phdr *phdr, size_t image_base,
	                               dl_iterate_cb_data *cbdata) {
	  if (phdr->p_type != PT_LOAD)
	    return false;

	  const uintptr_t segBegin = image_base + phdr->p_vaddr;
	  const uintptr_t segEnd = segBegin + phdr->p_memsz;
	  const uintptr_t target = cbdata->targetAddr;
	  if (target < segBegin || target >= segEnd)
	    return false;

	  cbdata->sects->dso_base = segBegin;
	  cbdata->sects->dwarf_section_length = phdr->p_memsz;
	  return true;
	}

	// dl_iterate_phdr() callback for DWARF unwinding. `data` is a
	// dl_iterate_cb_data. Returns 1 once both the PT_LOAD segment containing the
	// target address and a decodable PT_GNU_EH_FRAME header have been found in
	// this image, and 0 otherwise.
	int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo, size_t pinfo_size,
	                             void *data) {
	  auto cbdata = static_cast<dl_iterate_cb_data *>(data);
	  if (pinfo->dlpi_phnum == 0 || cbdata->targetAddr < pinfo->dlpi_addr)
	    return 0;
	#if defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE)
	  if (ProcessFrameHeaderCache.find(pinfo, pinfo_size, data))
	    return 1;
	-#endif // _LIBUNWIND_USE_FRAME_HEADER_CACHE
	+#endif

	  Elf_Addr image_base = calculateImageBase(pinfo);
	  bool found_obj = false;
	  bool found_hdr = false;

	  // Third phdr is usually the executable phdr.
	  if (pinfo->dlpi_phnum > 2)
	    found_obj = checkAddrInSegment(&pinfo->dlpi_phdr[2], image_base, cbdata);

	  // PT_GNU_EH_FRAME is usually near the end. Iterate backward. We already know
	  // that there is one or more phdrs.
	  for (Elf_Half i = pinfo->dlpi_phnum; i > 0; i--) {
	    const Elf_Phdr *phdr = &pinfo->dlpi_phdr[i - 1];
	    if (!found_hdr && phdr->p_type == PT_GNU_EH_FRAME) {
	      EHHeaderParser<LocalAddressSpace>::EHHeaderInfo hdrInfo;
	      uintptr_t eh_frame_hdr_start = image_base + phdr->p_vaddr;
	      cbdata->sects->dwarf_index_section = eh_frame_hdr_start;
	      cbdata->sects->dwarf_index_section_length = phdr->p_memsz;
	      found_hdr = EHHeaderParser<LocalAddressSpace>::decodeEHHdr(
	          *cbdata->addressSpace, eh_frame_hdr_start, phdr->p_memsz,
	          hdrInfo);
	      if (found_hdr)
	        cbdata->sects->dwarf_section = hdrInfo.eh_frame_ptr;
	    } else if (!found_obj) {
	      found_obj = checkAddrInSegment(phdr, image_base, cbdata);
	    }
	    if (found_obj && found_hdr) {
	#if defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE)
	      ProcessFrameHeaderCache.add(cbdata->sects);
	-#endif // _LIBUNWIND_USE_FRAME_HEADER_CACHE
	+#endif
	      return 1;
	    }
	  }
	  // No match in this image: clear the length that checkAddrInSegment() may
	  // have set.
	  cbdata->sects->dwarf_section_length = 0;
	  return 0;
	}

	#else // defined(LIBUNWIND_SUPPORT_DWARF_UNWIND)
	// Given all the #ifdef's above, the code here is for
	// defined(LIBUNWIND_ARM_EHABI)

	int findUnwindSectionsByPhdr(struct dl_phdr_info pinfo, size_t, void data) {
	auto cbdata = static_cast<dl_iterate_cb_data >(data);
	bool found_obj = false;
	bool found_hdr = false;

	assert(cbdata);
	assert(cbdata->sects);

	if (cbdata->targetAddr < pinfo->dlpi_addr)
	return 0;

	Elf_Addr image_base = calculateImageBase(pinfo);

	for (Elf_Half i = 0; i < pinfo->dlpi_phnum; i++) {
	const Elf_Phdr *phdr = &pinfo->dlpi_phdr[i];
	if (phdr->p_type == PT_LOAD) {
	uintptr_t begin = image_base + phdr->p_vaddr;
	uintptr_t end = begin + phdr->p_memsz;
	if (cbdata->targetAddr >= begin && cbdata->targetAddr < end)
	found_obj = true;
	} else if (phdr->p_type == PT_ARM_EXIDX) {
	uintptr_t exidx_start = image_base + phdr->p_vaddr;
	cbdata->sects->arm_section = exidx_start;
	cbdata->sects->arm_section_length = phdr->p_memsz;
	found_hdr = true;
	}
	}
	return found_obj && found_hdr;
	}
	#endif // defined(LIBUNWIND_SUPPORT_DWARF_UNWIND)
	#endif // defined(_LIBUNWIND_ARM_EHABI) \|\| defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND)


	/// Locates the unwind sections of the image containing \p targetAddr and
	/// stores them in \p info. Returns true when usable sections were found.
	///
	/// The lookup is platform specific: dyld on Apple, linker-provided symbols
	/// on bare metal, a scan of the loaded modules' PE sections for DWARF on
	/// Windows, nothing at all for SEH on Windows, dl_unwind_find_exidx() for
	/// ARM EHABI on Bionic, and dl_iterate_phdr() elsewhere.
	inline bool LocalAddressSpace::findUnwindSections(pint_t targetAddr,
	                                                  UnwindInfoSections &info) {
	#ifdef __APPLE__
	  dyld_unwind_sections dyldInfo;
	  if (_dyld_find_unwind_sections((void *)targetAddr, &dyldInfo)) {
	    info.dso_base = (uintptr_t)dyldInfo.mh;
	#if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND)
	    info.dwarf_section = (uintptr_t)dyldInfo.dwarf_section;
	    info.dwarf_section_length = dyldInfo.dwarf_section_length;
	#endif
	    info.compact_unwind_section = (uintptr_t)dyldInfo.compact_unwind_section;
	    info.compact_unwind_section_length = dyldInfo.compact_unwind_section_length;
	    return true;
	  }
	#elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_LIBUNWIND_IS_BAREMETAL)
	  // Bare metal is statically linked, so no need to ask the dynamic loader
	  info.dwarf_section_length = (uintptr_t)(&__eh_frame_end - &__eh_frame_start);
	  info.dwarf_section = (uintptr_t)(&__eh_frame_start);
	  _LIBUNWIND_TRACE_UNWINDING("findUnwindSections: section %p length %p",
	                             (void *)info.dwarf_section, (void *)info.dwarf_section_length);
	#if defined(_LIBUNWIND_SUPPORT_DWARF_INDEX)
	  info.dwarf_index_section = (uintptr_t)(&__eh_frame_hdr_start);
	  info.dwarf_index_section_length = (uintptr_t)(&__eh_frame_hdr_end - &__eh_frame_hdr_start);
	  _LIBUNWIND_TRACE_UNWINDING("findUnwindSections: index section %p length %p",
	                             (void *)info.dwarf_index_section, (void *)info.dwarf_index_section_length);
	#endif
	  if (info.dwarf_section_length)
	    return true;
	#elif defined(_LIBUNWIND_ARM_EHABI) && defined(_LIBUNWIND_IS_BAREMETAL)
	  // Bare metal is statically linked, so no need to ask the dynamic loader
	  info.arm_section = (uintptr_t)(&__exidx_start);
	  info.arm_section_length = (uintptr_t)(&__exidx_end - &__exidx_start);
	  _LIBUNWIND_TRACE_UNWINDING("findUnwindSections: section %p length %p",
	                             (void *)info.arm_section, (void *)info.arm_section_length);
	  if (info.arm_section && info.arm_section_length)
	    return true;
	#elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_WIN32)
	  HMODULE mods[1024];
	  HANDLE process = GetCurrentProcess();
	  DWORD needed;

	  if (!EnumProcessModules(process, mods, sizeof(mods), &needed)) {
	    DWORD err = GetLastError();
	    _LIBUNWIND_TRACE_UNWINDING("findUnwindSections: EnumProcessModules failed, "
	                               "returned error %d", (int)err);
	    return false;
	  }

	  // `needed` is the size required to hold every module handle. When the
	  // process has more modules than fit in `mods`, only sizeof(mods) bytes were
	  // written, so clamp to avoid reading past the end of the array.
	  if (needed > sizeof(mods))
	    needed = sizeof(mods);

	  for (unsigned i = 0; i < (needed / sizeof(HMODULE)); i++) {
	    PIMAGE_DOS_HEADER pidh = (PIMAGE_DOS_HEADER)mods[i];
	    PIMAGE_NT_HEADERS pinh = (PIMAGE_NT_HEADERS)((BYTE *)pidh + pidh->e_lfanew);
	    PIMAGE_FILE_HEADER pifh = (PIMAGE_FILE_HEADER)&pinh->FileHeader;
	    PIMAGE_SECTION_HEADER pish = IMAGE_FIRST_SECTION(pinh);
	    bool found_obj = false;
	    bool found_hdr = false;

	    info.dso_base = (uintptr_t)mods[i];
	    for (unsigned j = 0; j < pifh->NumberOfSections; j++, pish++) {
	      uintptr_t begin = pish->VirtualAddress + (uintptr_t)mods[i];
	      uintptr_t end = begin + pish->Misc.VirtualSize;
	      if (!strncmp((const char *)pish->Name, ".text",
	                   IMAGE_SIZEOF_SHORT_NAME)) {
	        if (targetAddr >= begin && targetAddr < end)
	          found_obj = true;
	      } else if (!strncmp((const char *)pish->Name, ".eh_frame",
	                          IMAGE_SIZEOF_SHORT_NAME)) {
	        info.dwarf_section = begin;
	        info.dwarf_section_length = pish->Misc.VirtualSize;
	        found_hdr = true;
	      }
	      if (found_obj && found_hdr)
	        return true;
	    }
	  }
	  return false;
	#elif defined(_LIBUNWIND_SUPPORT_SEH_UNWIND) && defined(_WIN32)
	  // Don't even bother, since Windows has functions that do all this stuff
	  // for us.
	  (void)targetAddr;
	  (void)info;
	  return true;
	#elif defined(_LIBUNWIND_ARM_EHABI) && defined(__BIONIC__)
	  // For ARM EHABI, Bionic didn't implement dl_iterate_phdr until API 21. After
	  // API 21, dl_iterate_phdr exists, but dl_unwind_find_exidx is much faster.
	  int length = 0;
	  info.arm_section =
	      (uintptr_t)dl_unwind_find_exidx((_Unwind_Ptr)targetAddr, &length);
	  info.arm_section_length = (uintptr_t)length * sizeof(EHABIIndexEntry);
	  if (info.arm_section && info.arm_section_length)
	    return true;
	#elif defined(_LIBUNWIND_ARM_EHABI) || defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND)
	  dl_iterate_cb_data cb_data = {this, &info, targetAddr};
	  int found = dl_iterate_phdr(findUnwindSectionsByPhdr, &cb_data);
	  return static_cast<bool>(found);
	#endif

	  return false;
	}


	// Platform hook for locating an FDE for targetAddr through a mechanism other
	// than the regular unwind sections.
	//
	// On Apple platforms the result of checkKeyMgrRegisteredFDEs() is returned
	// (fde is handed to it, presumably so it can be filled in -- confirm against
	// that helper). On every other platform both arguments are ignored and the
	// function always returns false.
	inline bool LocalAddressSpace::findOtherFDE(pint_t targetAddr, pint_t &fde) {
	#ifdef __APPLE__
	return checkKeyMgrRegisteredFDEs(targetAddr, ((void*)&fde));
	#else
	// TO DO: if OS has way to dynamically register FDEs, check that.
	// The void casts below only mark the parameters as intentionally unused.
	(void)targetAddr;
	(void)fde;
	return false;
	#endif
	}

	// Looks up the symbol that dladdr() associates with addr.
	//
	// When a symbol name is available it is written to buf (formatted with
	// snprintf, so at most bufLen bytes including the terminator), *offset is set
	// to addr minus the symbol's start address, and true is returned. Returns
	// false when dladdr() support is compiled out, the lookup fails, or the
	// lookup yields no symbol name; buf and *offset are left untouched then.
	inline bool LocalAddressSpace::findFunctionName(pint_t addr, char *buf,
	                                                size_t bufLen,
	                                                unw_word_t *offset) {
	#if _LIBUNWIND_USE_DLADDR
	  Dl_info symInfo;
	  // dladdr() reports failure with a zero return value.
	  if (!dladdr((void *)addr, &symInfo))
	    return false;
	  // The address may resolve to an image without resolving to a named symbol.
	  if (symInfo.dli_sname == NULL)
	    return false;
	  snprintf(buf, bufLen, "%s", symInfo.dli_sname);
	  *offset = (addr - (pint_t) symInfo.dli_saddr);
	  return true;
	#else
	  // No symbol lookup facility: mark every parameter as intentionally unused.
	  (void)addr;
	  (void)buf;
	  (void)bufLen;
	  (void)offset;
	  return false;
	#endif
	}

	} // namespace libunwind

	#endif // __ADDRESSSPACE_HPP__
	diff --git a/contrib/llvm-project/lld/docs/ReleaseNotes.rst b/contrib/llvm-project/lld/docs/ReleaseNotes.rst
	index fe3de8306cd8..513ad37e278e 100644
	--- a/contrib/llvm-project/lld/docs/ReleaseNotes.rst
	+++ b/contrib/llvm-project/lld/docs/ReleaseNotes.rst
	@@ -1,57 +1,66 @@
	========================
	lld 11.0.0 Release Notes
	========================

	.. contents::
	:local:

	.. warning::
	These are in-progress notes for the upcoming LLVM 11.0.0 release.
	Release notes for previous releases can be found on
	`the Download Page <https://releases.llvm.org/download.html>`_.

	Introduction
	============

	This document contains the release notes for the lld linker, release 11.0.0.
	Here we describe the status of lld, including major improvements
	from the previous release. All lld releases may be downloaded
	from the `LLVM releases web site <https://llvm.org/releases/>`_.

	Non-comprehensive list of changes in this release
	=================================================

	ELF Improvements
	----------------

	* New ``--time-trace`` option records a time trace file that can be viewed in
	chrome://tracing. The file can be specified with ``--time-trace-file``.
	Trace granularity can be specified with ``--time-trace-granularity``.
	(`D71060 <https://reviews.llvm.org/D71060>`_)
	+* For ARM architectures the default max page size was increased to 64k.
	+ This increases compatibility with systems where a non-standard page
	+ size was configured. This is also in line with GNU ld defaults.
	+ (`D77330 <https://reviews.llvm.org/D77330>`_)
	* ...

	Breaking changes
	----------------

	* One-dash forms of some long options (``--thinlto-``, ``--lto-``, ``--shuffle-sections=``)
	are no longer supported.
	* ``--export-dynamic-symbol`` no longer implies ``-u``.

	COFF Improvements
	-----------------

	-* ...
	+* Fixed exporting symbols whose names contain a period (``.``), which was
	+ a regression in lld 7.

	MinGW Improvements
	------------------

	-* ...
	+* Implemented new options for disabling auto import and runtime pseudo
	+ relocations (``--disable-auto-import`` and
	+ ``--disable-runtime-pseudo-reloc``), the ``--no-seh`` flag and options
	+ for selecting file and section alignment (``--file-alignment`` and
	+ ``--section-alignment``).

	MachO Improvements
	------------------

	* Item 1.

	WebAssembly Improvements
	------------------------

	diff --git a/contrib/llvm-project/lld/docs/conf.py b/contrib/llvm-project/lld/docs/conf.py
	index 7d4fc0c5ad75..ee93c01f7f32 100644
	--- a/contrib/llvm-project/lld/docs/conf.py
	+++ b/contrib/llvm-project/lld/docs/conf.py
	@@ -1,255 +1,255 @@
	# -*- coding: utf-8 -*-
	#
	# lld documentation build configuration file.
	#
	# This file is execfile()d with the current directory set to its containing dir.
	#
	# Note that not all possible configuration values are present in this
	# autogenerated file.
	#
	# All configuration values have a default; values that are commented out
	# serve to show the default.

	import sys, os
	from datetime import date

	# If extensions (or modules to document with autodoc) are in another directory,
	# add these directories to sys.path here. If the directory is relative to the
	# documentation root, use os.path.abspath to make it absolute, like shown here.
	#sys.path.insert(0, os.path.abspath('.'))

	# -- General configuration -----------------------------------------------------

	# If your documentation needs a minimal Sphinx version, state it here.
	#needs_sphinx = '1.0'

	# Add any Sphinx extension module names here, as strings. They can be extensions
	# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
	extensions = ['sphinx.ext.intersphinx', 'sphinx.ext.todo']

	# Add any paths that contain templates here, relative to this directory.
	templates_path = ['_templates']

	# The suffix of source filenames.
	source_suffix = '.rst'

	# The encoding of source files.
	#source_encoding = 'utf-8-sig'

	# The master toctree document.
	master_doc = 'index'

	# General information about the project.
	project = u'lld'
	copyright = u'2011-%d, LLVM Project' % date.today().year

	# The version info for the project you're documenting, acts as replacement for
	# \|version\| and \|release\|, also used in various other places throughout the
	# built documents.
	#
	# The short version.
	version = '11'
	# The full version, including alpha/beta/rc tags.
	release = '11'

	# The language for content autogenerated by Sphinx. Refer to documentation
	# for a list of supported languages.
	#language = None

	# There are two options for replacing \|today\|: either, you set today to some
	# non-false value, then it is used:
	#today = ''
	# Else, today_fmt is used as the format for a strftime call.
	today_fmt = '%Y-%m-%d'

	# List of patterns, relative to source directory, that match files and
	# directories to ignore when looking for source files.
	exclude_patterns = ['_build']

	# The reST default role (used for this markup: `text`) to use for all documents.
	#default_role = None

	# If true, '()' will be appended to :func: etc. cross-reference text.
	#add_function_parentheses = True

	# If true, the current module name will be prepended to all description
	# unit titles (such as .. function::).
	#add_module_names = True

	# If true, sectionauthor and moduleauthor directives will be shown in the
	# output. They are ignored by default.
	show_authors = True

	# The name of the Pygments (syntax highlighting) style to use.
	pygments_style = 'friendly'

	# A list of ignored prefixes for module index sorting.
	#modindex_common_prefix = []


	# -- Options for HTML output ---------------------------------------------------

	# The theme to use for HTML and HTML Help pages. See the documentation for
	# a list of builtin themes.
	html_theme = 'llvm-theme'

	# Theme options are theme-specific and customize the look and feel of a theme
	# further. For a list of options available for each theme, see the
	# documentation.
	#html_theme_options = {}

	# Add any paths that contain custom themes here, relative to this directory.
	html_theme_path = ["."]

	# The name for this set of Sphinx documents. If None, it defaults to
	# "<project> v<release> documentation".
	#html_title = None

	# A shorter title for the navigation bar. Default is the same as html_title.
	#html_short_title = None

	# The name of an image file (relative to this directory) to place at the top
	# of the sidebar.
	#html_logo = None

	# If given, this must be the name of an image file (path relative to the
	# configuration directory) that is the favicon of the docs. Modern browsers use
	# this as icon for tabs, windows and bookmarks. It should be a Windows-style
	# icon file (.ico), which is 16x16 or 32x32 pixels large. Default: None. The
	# image file will be copied to the _static directory of the output HTML, but
	# only if the file does not already exist there.
	html_favicon = '_static/favicon.ico'

	# Add any paths that contain custom static files (such as style sheets) here,
	# relative to this directory. They are copied after the builtin static files,
	# so a file named "default.css" will overwrite the builtin "default.css".
	html_static_path = ['_static']

	# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
	# using the given strftime format.
	html_last_updated_fmt = '%Y-%m-%d'

	# If true, SmartyPants will be used to convert quotes and dashes to
	# typographically correct entities.
	#html_use_smartypants = True

	# Custom sidebar templates, maps document names to template names.
	-html_sidebars = {'index': 'indexsidebar.html'}
	+html_sidebars = {'index': ['indexsidebar.html']}

	# Additional templates that should be rendered to pages, maps page names to
	# template names.
	# html_additional_pages = {'index': 'index.html'}

	# If false, no module index is generated.
	#html_domain_indices = True

	# If false, no index is generated.
	#html_use_index = True

	# If true, the index is split into individual pages for each letter.
	#html_split_index = False

	# If true, links to the reST sources are added to the pages.
	html_show_sourcelink = True

	# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
	#html_show_sphinx = True

	# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
	#html_show_copyright = True

	# If true, an OpenSearch description file will be output, and all pages will
	# contain a <link> tag referring to it. The value of this option must be the
	# base URL from which the finished HTML is served.
	#html_use_opensearch = ''

	# This is the file name suffix for HTML files (e.g. ".xhtml").
	#html_file_suffix = None

	# Output file base name for HTML help builder.
	htmlhelp_basename = 'llddoc'


	# -- Options for LaTeX output --------------------------------------------------

	latex_elements = {
	# The paper size ('letterpaper' or 'a4paper').
	#'papersize': 'letterpaper',

	# The font size ('10pt', '11pt' or '12pt').
	#'pointsize': '10pt',

	# Additional stuff for the LaTeX preamble.
	#'preamble': '',
	}

	# Grouping the document tree into LaTeX files. List of tuples
	# (source start file, target name, title, author, documentclass [howto/manual]).
	latex_documents = [
	('contents', 'lld.tex', u'lld Documentation',
	u'LLVM project', 'manual'),
	]

	# The name of an image file (relative to this directory) to place at the top of
	# the title page.
	#latex_logo = None

	# For "manual" documents, if this is true, then toplevel headings are parts,
	# not chapters.
	#latex_use_parts = False

	# If true, show page references after internal links.
	#latex_show_pagerefs = False

	# If true, show URL addresses after external links.
	#latex_show_urls = False

	# Documents to append as an appendix to all manuals.
	#latex_appendices = []

	# If false, no module index is generated.
	#latex_domain_indices = True


	# -- Options for manual page output --------------------------------------------

	# One entry per manual page. List of tuples
	# (source start file, name, description, authors, manual section).
	man_pages = [
	('contents', 'lld', u'lld Documentation',
	[u'LLVM project'], 1)
	]

	# If true, show URL addresses after external links.
	#man_show_urls = False


	# -- Options for Texinfo output ------------------------------------------------

	# Grouping the document tree into Texinfo files. List of tuples
	# (source start file, target name, title, author,
	# dir menu entry, description, category)
	texinfo_documents = [
	('contents', 'lld', u'lld Documentation',
	u'LLVM project', 'lld', 'One line description of project.',
	'Miscellaneous'),
	]

	# Documents to append as an appendix to all manuals.
	#texinfo_appendices = []

	# If false, no module index is generated.
	#texinfo_domain_indices = True

	# How to display URL addresses: 'footnote', 'no', or 'inline'.
	#texinfo_show_urls = 'footnote'


	# FIXME: Define intersphinx configuration.
	intersphinx_mapping = {}


	# -- Options for extensions ----------------------------------------------------

	# Enable this if you want TODOs to show up in the generated documentation.
	todo_include_todos = True
	diff --git a/contrib/llvm-project/llvm/include/llvm/MC/MCDwarf.h b/contrib/llvm-project/llvm/include/llvm/MC/MCDwarf.h
	index e3cea0ae64cf..70da5f76e766 100644
	--- a/contrib/llvm-project/llvm/include/llvm/MC/MCDwarf.h
	+++ b/contrib/llvm-project/llvm/include/llvm/MC/MCDwarf.h
	@@ -1,643 +1,650 @@
	//===- MCDwarf.h - Machine Code Dwarf support -------------------- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the declaration of the MCDwarfFile to support the dwarf
	// .file directive and the .loc directive.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_MC_MCDWARF_H
	#define LLVM_MC_MCDWARF_H

	#include "llvm/ADT/MapVector.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringMap.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/MC/MCSection.h"
	#include "llvm/Support/Error.h"
	#include "llvm/Support/MD5.h"
	#include <cassert>
	#include <cstdint>
	#include <string>
	#include <utility>
	#include <vector>

	namespace llvm {

	template <typename T> class ArrayRef;
	class MCAsmBackend;
	class MCContext;
	class MCDwarfLineStr;
	class MCObjectStreamer;
	class MCStreamer;
	class MCSymbol;
	class raw_ostream;
	class SMLoc;
	class SourceMgr;

	namespace mcdwarf {
	// Emit the common part of the DWARF 5 range/locations list tables header.
	MCSymbol *emitListsTableHeaderStart(MCStreamer &S);
	} // namespace mcdwarf

	/// Instances of this class represent the name of the dwarf .file directive and
	/// its associated dwarf file number in the MC file. MCDwarfFile's are created
	/// and uniqued by the MCContext class. In Dwarf 4 file numbers start from 1;
	/// i.e. the entry with file number 1 is the first element in the vector of
	/// DwarfFiles and there is no MCDwarfFile with file number 0. In Dwarf 5 file
	/// numbers start from 0, with the MCDwarfFile with file number 0 being the
	/// primary source file, and file numbers correspond to their index in the
	/// vector.
	struct MCDwarfFile {
	// The base name of the file without its directory path.
	std::string Name;

	// The index into the list of directory names for this file name.
	// Defaults to 0.
	unsigned DirIndex = 0;

	/// The MD5 checksum, if there is one. The MD5Result is held by value
	/// inside the Optional (it is not a pointer).
	Optional<MD5::MD5Result> Checksum;

	/// The source code of the file, if there is any. StringRef is a
	/// non-owning reference; the data is allocated in MCContext.
	Optional<StringRef> Source;
	};

	/// Instances of this class represent the information from a
	/// dwarf .loc directive.
	class MCDwarfLoc {
	// Column, Flags and Isa are stored in fields narrower than the `unsigned`
	// values accepted by the constructor and the setters.
	uint32_t FileNum;
	uint32_t Line;
	uint16_t Column;
	// Flags (see #define's below)
	uint8_t Flags;
	uint8_t Isa;
	uint32_t Discriminator;

	// Flag that indicates the initial value of the is_stmt_start flag.
	#define DWARF2_LINE_DEFAULT_IS_STMT 1

	// Single-bit masks; each occupies a distinct bit so they can be combined.
	#define DWARF2_FLAG_IS_STMT (1 << 0)
	#define DWARF2_FLAG_BASIC_BLOCK (1 << 1)
	#define DWARF2_FLAG_PROLOGUE_END (1 << 2)
	#define DWARF2_FLAG_EPILOGUE_BEGIN (1 << 3)

	private: // MCContext manages these
	friend class MCContext;
	friend class MCDwarfLineEntry;

	// Private constructor: only the friend classes above can create an
	// MCDwarfLoc from raw values.
	// NOTE(review): unlike setColumn/setFlags/setIsa, no range assertion is made
	// here, so column, flags and isa values that do not fit their narrower
	// fields are truncated by the member initializers.
	MCDwarfLoc(unsigned fileNum, unsigned line, unsigned column, unsigned flags,
	unsigned isa, unsigned discriminator)
	: FileNum(fileNum), Line(line), Column(column), Flags(flags), Isa(isa),
	Discriminator(discriminator) {}

	// Allow the default copy constructor and assignment operator to be used
	// for an MCDwarfLoc object.

	public:
	/// Get the FileNum of this MCDwarfLoc.
	unsigned getFileNum() const { return FileNum; }

	/// Get the Line of this MCDwarfLoc.
	unsigned getLine() const { return Line; }

	/// Get the Column of this MCDwarfLoc.
	unsigned getColumn() const { return Column; }

	/// Get the Flags of this MCDwarfLoc.
	unsigned getFlags() const { return Flags; }

	/// Get the Isa of this MCDwarfLoc.
	unsigned getIsa() const { return Isa; }

	/// Get the Discriminator of this MCDwarfLoc.
	unsigned getDiscriminator() const { return Discriminator; }

	/// Set the FileNum of this MCDwarfLoc.
	void setFileNum(unsigned fileNum) { FileNum = fileNum; }

	/// Set the Line of this MCDwarfLoc.
	void setLine(unsigned line) { Line = line; }

	/// Set the Column of this MCDwarfLoc.
	/// Asserts that \p column fits in the 16-bit Column field.
	void setColumn(unsigned column) {
	assert(column <= UINT16_MAX);
	Column = column;
	}

	/// Set the Flags of this MCDwarfLoc.
	/// Asserts that \p flags fits in the 8-bit Flags field.
	void setFlags(unsigned flags) {
	assert(flags <= UINT8_MAX);
	Flags = flags;
	}

	/// Set the Isa of this MCDwarfLoc.
	/// Asserts that \p isa fits in the 8-bit Isa field.
	void setIsa(unsigned isa) {
	assert(isa <= UINT8_MAX);
	Isa = isa;
	}

	/// Set the Discriminator of this MCDwarfLoc.
	void setDiscriminator(unsigned discriminator) {
	Discriminator = discriminator;
	}
	};

	/// Instances of this class represent the line information for
	/// the dwarf line table entries. An entry is created after a machine
	/// instruction is assembled; it uses an address from a temporary label
	/// created at the current address in the current section together with the
	/// info from the last .loc directive seen, as stored in the context.
	class MCDwarfLineEntry : public MCDwarfLoc {
	// The symbol associated with this line entry; stored as passed to the
	// constructor and returned unchanged by getLabel().
	MCSymbol *Label;

	private:
	// Allow the default copy constructor and assignment operator to be used
	// for an MCDwarfLineEntry object.

	public:
	// Constructor to create an MCDwarfLineEntry given a symbol and the dwarf loc.
	// The loc is copied into the MCDwarfLoc base sub-object.
	MCDwarfLineEntry(MCSymbol *label, const MCDwarfLoc loc)
	: MCDwarfLoc(loc), Label(label) {}

	// Returns the symbol supplied at construction.
	MCSymbol *getLabel() const { return Label; }

	// Called when an instruction is assembled into the specified section. If
	// information from the last .loc directive has not yet had a line entry
	// made for it, one is made.
	static void Make(MCObjectStreamer MCOS, MCSection Section);
	};

	/// Instances of this class represent the line information for a compile
	/// unit where machine instructions have been assembled after seeing .loc
	/// directives. This is the information used to build the dwarf line
	/// table for a section.
	class MCLineSection {
	public:
	// Add an entry to this MCLineSection's line entries.
	void addLineEntry(const MCDwarfLineEntry &LineEntry, MCSection *Sec) {
	MCLineDivisions[Sec].push_back(LineEntry);
	}

	using MCDwarfLineEntryCollection = std::vector<MCDwarfLineEntry>;
	using iterator = MCDwarfLineEntryCollection::iterator;
	using const_iterator = MCDwarfLineEntryCollection::const_iterator;
	using MCLineDivisionMap = MapVector<MCSection *, MCDwarfLineEntryCollection>;

	private:
	// A collection of MCDwarfLineEntry for each section.
	MCLineDivisionMap MCLineDivisions;

	public:
	// Returns the collection of MCDwarfLineEntry for a given Compile Unit ID.
	const MCLineDivisionMap &getMCLineEntries() const {
	return MCLineDivisions;
	}
	};

	struct MCDwarfLineTableParams {
	  /// Opcode number of the first special line opcode; everything below it is
	  /// left for the standard opcodes.
	  /// Note: changing this also requires updating the "StandardOpcodeLengths"
	  /// table that is emitted in \c Emit().
	  uint8_t DWARF2LineOpcodeBase{13};
	  /// Smallest line offset a special line info. opcode can express. -5 was
	  /// chosen to give a reasonable range of values.
	  int8_t DWARF2LineBase{-5};
	  /// Number of distinct line offsets a special line info. opcode can express.
	  uint8_t DWARF2LineRange{14};
	};

	// Header data for one DWARF line table: the directory and file lists, the
	// compilation directory and root file, plus flags that record whether
	// files carry embedded source and whether MD5 checksums were supplied
	// consistently.
	struct MCDwarfLineTableHeader {
	MCSymbol *Label = nullptr;
	SmallVector<std::string, 3> MCDwarfDirs;
	SmallVector<MCDwarfFile, 3> MCDwarfFiles;
	StringMap<unsigned> SourceIdMap;
	std::string CompilationDir;
	MCDwarfFile RootFile;
	// Set by setRootFile() to whether a source was supplied; cleared by
	// resetFileTable().
	bool HasSource = false;
	private:
	// HasAllMD5 stays true only while every tracked use had a checksum;
	// HasAnyMD5 becomes true once any tracked use had one.
	bool HasAllMD5 = true;
	bool HasAnyMD5 = false;

	public:
	MCDwarfLineTableHeader() = default;

	Expected<unsigned> tryGetFile(StringRef &Directory, StringRef &FileName,
	Optional<MD5::MD5Result> Checksum,
	Optional<StringRef> Source,
	uint16_t DwarfVersion,
	unsigned FileNumber = 0);
	std::pair<MCSymbol , MCSymbol >
	Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
	Optional<MCDwarfLineStr> &LineStr) const;
	std::pair<MCSymbol , MCSymbol >
	Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
	ArrayRef<char> SpecialOpcodeLengths,
	Optional<MCDwarfLineStr> &LineStr) const;
	// Restores the MD5 tracking flags to their initial values.
	void resetMD5Usage() {
	HasAllMD5 = true;
	HasAnyMD5 = false;
	}
	// Folds one observation into the tracking flags: "all" is AND-ed and
	// "any" is OR-ed with MD5Used.
	void trackMD5Usage(bool MD5Used) {
	HasAllMD5 &= MD5Used;
	HasAnyMD5 \|= MD5Used;
	}
	// Usage is consistent when there are no files, or when "all" equals "any",
	// i.e. every tracked use had a checksum or none did.
	bool isMD5UsageConsistent() const {
	return MCDwarfFiles.empty() \|\| (HasAllMD5 == HasAnyMD5);
	}

	// Records the compilation directory and the root file (name, checksum,
	// source; directory index 0), tracks the checksum's presence and sets
	// HasSource from the presence of Source.
	void setRootFile(StringRef Directory, StringRef FileName,
	Optional<MD5::MD5Result> Checksum,
	Optional<StringRef> Source) {
	CompilationDir = std::string(Directory);
	RootFile.Name = std::string(FileName);
	RootFile.DirIndex = 0;
	RootFile.Checksum = Checksum;
	RootFile.Source = Source;
	trackMD5Usage(Checksum.hasValue());
	HasSource = Source.hasValue();
	}

	// Clears the directory and file lists, the root file name, the MD5 flags
	// and HasSource.
	// NOTE(review): Label, SourceIdMap and CompilationDir are not touched
	// here -- confirm that is intended.
	void resetFileTable() {
	MCDwarfDirs.clear();
	MCDwarfFiles.clear();
	RootFile.Name.clear();
	resetMD5Usage();
	HasSource = false;
	}

	private:
	void emitV2FileDirTables(MCStreamer *MCOS) const;
	void emitV5FileDirTables(MCStreamer *MCOS, Optional<MCDwarfLineStr> &LineStr) const;
	};

	class MCDwarfDwoLineTable {
	MCDwarfLineTableHeader Header;
	bool HasSplitLineTable = false;

	public:
	void maybeSetRootFile(StringRef Directory, StringRef FileName,
	Optional<MD5::MD5Result> Checksum,
	Optional<StringRef> Source) {
	if (!Header.RootFile.Name.empty())
	return;
	Header.setRootFile(Directory, FileName, Checksum, Source);
	}

	unsigned getFile(StringRef Directory, StringRef FileName,
	Optional<MD5::MD5Result> Checksum, uint16_t DwarfVersion,
	Optional<StringRef> Source) {
	HasSplitLineTable = true;
	return cantFail(Header.tryGetFile(Directory, FileName, Checksum, Source,
	DwarfVersion));
	}

	void Emit(MCStreamer &MCOS, MCDwarfLineTableParams Params,
	MCSection *Section) const;
	};

	class MCDwarfLineTable {
	MCDwarfLineTableHeader Header;
	MCLineSection MCLineSections;

	public:
	// This emits the Dwarf file and the line tables for all Compile Units.
	static void Emit(MCObjectStreamer *MCOS, MCDwarfLineTableParams Params);

	// This emits the Dwarf file and the line tables for a given Compile Unit.
	void EmitCU(MCObjectStreamer *MCOS, MCDwarfLineTableParams Params,
	Optional<MCDwarfLineStr> &LineStr) const;

	Expected<unsigned> tryGetFile(StringRef &Directory, StringRef &FileName,
	Optional<MD5::MD5Result> Checksum,
	Optional<StringRef> Source,
	uint16_t DwarfVersion,
	unsigned FileNumber = 0);
	unsigned getFile(StringRef &Directory, StringRef &FileName,
	Optional<MD5::MD5Result> Checksum, Optional<StringRef> Source,
	uint16_t DwarfVersion, unsigned FileNumber = 0) {
	return cantFail(tryGetFile(Directory, FileName, Checksum, Source,
	DwarfVersion, FileNumber));
	}

	// Records the compilation directory and the root source file.
	//
	// Forwards to MCDwarfLineTableHeader::setRootFile, which assigns
	// CompilationDir and the RootFile fields (with directory index 0), tracks
	// whether a checksum was supplied and sets HasSource from the presence of
	// Source. Delegating keeps a single copy of that bookkeeping.
	void setRootFile(StringRef Directory, StringRef FileName,
	                 Optional<MD5::MD5Result> Checksum, Optional<StringRef> Source) {
	  Header.setRootFile(Directory, FileName, Checksum, Source);
	}

	void resetFileTable() { Header.resetFileTable(); }

	bool hasRootFile() const { return !Header.RootFile.Name.empty(); }

	const MCDwarfFile &getRootFile() const { return Header.RootFile; }

	// Report whether MD5 usage has been consistent (all-or-none).
	bool isMD5UsageConsistent() const { return Header.isMD5UsageConsistent(); }

	MCSymbol *getLabel() const {
	return Header.Label;
	}

	void setLabel(MCSymbol *Label) {
	Header.Label = Label;
	}

	const SmallVectorImpl<std::string> &getMCDwarfDirs() const {
	return Header.MCDwarfDirs;
	}

	SmallVectorImpl<std::string> &getMCDwarfDirs() {
	return Header.MCDwarfDirs;
	}

	const SmallVectorImpl<MCDwarfFile> &getMCDwarfFiles() const {
	return Header.MCDwarfFiles;
	}

	SmallVectorImpl<MCDwarfFile> &getMCDwarfFiles() {
	return Header.MCDwarfFiles;
	}

	const MCLineSection &getMCLineSections() const {
	return MCLineSections;
	}
	MCLineSection &getMCLineSections() {
	return MCLineSections;
	}
	};

	class MCDwarfLineAddr {
	public:
	/// Utility function to encode a Dwarf pair of LineDelta and AddrDeltas.
	static void Encode(MCContext &Context, MCDwarfLineTableParams Params,
	int64_t LineDelta, uint64_t AddrDelta, raw_ostream &OS);

	/// Utility function to encode a Dwarf pair of LineDelta and AddrDeltas using
	/// fixed length operands.
	static bool FixedEncode(MCContext &Context,
	MCDwarfLineTableParams Params,
	int64_t LineDelta, uint64_t AddrDelta,
	raw_ostream &OS, uint32_t Offset, uint32_t Size);

	/// Utility function to emit the encoding to a streamer.
	static void Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
	int64_t LineDelta, uint64_t AddrDelta);
	};

	class MCGenDwarfInfo {
	public:
	//
	// When generating dwarf for assembly source files this emits the Dwarf
	// sections.
	//
	static void Emit(MCStreamer *MCOS);
	};

	// When generating dwarf for assembly source files this is the info that is
	// needed to be gathered for each symbol that will have a dwarf label.
	class MCGenDwarfLabelEntry {
	private:
	// Name of the symbol without a leading underbar, if any.
	StringRef Name;
	// The dwarf file number this symbol is in.
	unsigned FileNumber;
	// The line number this symbol is at.
	unsigned LineNumber;
	// The low_pc for the dwarf label is taken from this symbol.
	MCSymbol *Label;

	public:
	MCGenDwarfLabelEntry(StringRef name, unsigned fileNumber, unsigned lineNumber,
	MCSymbol *label)
	: Name(name), FileNumber(fileNumber), LineNumber(lineNumber),
	Label(label) {}

	StringRef getName() const { return Name; }
	unsigned getFileNumber() const { return FileNumber; }
	unsigned getLineNumber() const { return LineNumber; }
	MCSymbol *getLabel() const { return Label; }

	// This is called when label is created when we are generating dwarf for
	// assembly source files.
	static void Make(MCSymbol Symbol, MCStreamer MCOS, SourceMgr &SrcMgr,
	SMLoc &Loc);
	};

	class MCCFIInstruction {
	public:
	enum OpType {
	OpSameValue,
	OpRememberState,
	OpRestoreState,
	OpOffset,
	OpDefCfaRegister,
	OpDefCfaOffset,
	OpDefCfa,
	OpRelOffset,
	OpAdjustCfaOffset,
	OpEscape,
	OpRestore,
	OpUndefined,
	OpRegister,
	OpWindowSave,
	OpNegateRAState,
	OpGnuArgsSize
	};

	private:
	OpType Operation;
	MCSymbol *Label;
	unsigned Register;
	union {
	int Offset;
	unsigned Register2;
	};
	std::vector<char> Values;
	+ std::string Comment;

	- MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, StringRef V)
	+ MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, StringRef V,
	+ StringRef Comment = "")
	: Operation(Op), Label(L), Register(R), Offset(O),
	- Values(V.begin(), V.end()) {
	+ Values(V.begin(), V.end()), Comment(Comment) {
	assert(Op != OpRegister);
	}

	MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R1, unsigned R2)
	: Operation(Op), Label(L), Register(R1), Register2(R2) {
	assert(Op == OpRegister);
	}

	public:
	/// .cfi_def_cfa defines a rule for computing CFA as: take address from
	/// Register and add Offset to it.
	static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register,
	int Offset) {
	return MCCFIInstruction(OpDefCfa, L, Register, Offset, "");
	}

	/// .cfi_def_cfa_register modifies a rule for computing CFA. From now
	/// on Register will be used instead of the old one. Offset remains the same.
	static MCCFIInstruction createDefCfaRegister(MCSymbol *L, unsigned Register) {
	return MCCFIInstruction(OpDefCfaRegister, L, Register, 0, "");
	}

	/// .cfi_def_cfa_offset modifies a rule for computing CFA. Register
	/// remains the same, but offset is new. Note that it is the absolute offset
	/// that will be added to a defined register to the compute CFA address.
	static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int Offset) {
	return MCCFIInstruction(OpDefCfaOffset, L, 0, Offset, "");
	}

	/// .cfi_adjust_cfa_offset Same as .cfi_def_cfa_offset, but
	/// Offset is a relative value that is added/subtracted from the previous
	/// offset.
	static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int Adjustment) {
	return MCCFIInstruction(OpAdjustCfaOffset, L, 0, Adjustment, "");
	}

	/// .cfi_offset Previous value of Register is saved at offset Offset
	/// from CFA.
	static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register,
	int Offset) {
	return MCCFIInstruction(OpOffset, L, Register, Offset, "");
	}

	/// .cfi_rel_offset Previous value of Register is saved at offset
	/// Offset from the current CFA register. This is transformed to .cfi_offset
	/// using the known displacement of the CFA register from the CFA.
	static MCCFIInstruction createRelOffset(MCSymbol *L, unsigned Register,
	int Offset) {
	return MCCFIInstruction(OpRelOffset, L, Register, Offset, "");
	}

	/// .cfi_register Previous value of Register1 is saved in
	/// register Register2.
	static MCCFIInstruction createRegister(MCSymbol *L, unsigned Register1,
	unsigned Register2) {
	return MCCFIInstruction(OpRegister, L, Register1, Register2);
	}

	/// .cfi_window_save SPARC register window is saved.
	static MCCFIInstruction createWindowSave(MCSymbol *L) {
	return MCCFIInstruction(OpWindowSave, L, 0, 0, "");
	}

	/// .cfi_negate_ra_state AArch64 negate RA state.
	static MCCFIInstruction createNegateRAState(MCSymbol *L) {
	return MCCFIInstruction(OpNegateRAState, L, 0, 0, "");
	}

	/// .cfi_restore says that the rule for Register is now the same as it
	/// was at the beginning of the function, after all initial instructions added
	/// by .cfi_startproc were executed.
	static MCCFIInstruction createRestore(MCSymbol *L, unsigned Register) {
	return MCCFIInstruction(OpRestore, L, Register, 0, "");
	}

	/// .cfi_undefined From now on the previous value of Register can't be
	/// restored anymore.
	static MCCFIInstruction createUndefined(MCSymbol *L, unsigned Register) {
	return MCCFIInstruction(OpUndefined, L, Register, 0, "");
	}

	/// .cfi_same_value Current value of Register is the same as in the
	/// previous frame. I.e., no restoration is needed.
	static MCCFIInstruction createSameValue(MCSymbol *L, unsigned Register) {
	return MCCFIInstruction(OpSameValue, L, Register, 0, "");
	}

	/// .cfi_remember_state Save all current rules for all registers.
	static MCCFIInstruction createRememberState(MCSymbol *L) {
	return MCCFIInstruction(OpRememberState, L, 0, 0, "");
	}

	/// .cfi_restore_state Restore the previously saved state.
	static MCCFIInstruction createRestoreState(MCSymbol *L) {
	return MCCFIInstruction(OpRestoreState, L, 0, 0, "");
	}

	/// .cfi_escape Allows the user to add arbitrary bytes to the unwind
	/// info.
	- static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals) {
	- return MCCFIInstruction(OpEscape, L, 0, 0, Vals);
	+ static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals,
	+ StringRef Comment = "") {
	+ return MCCFIInstruction(OpEscape, L, 0, 0, Vals, Comment);
	}

	/// Special wrapper around .cfi_escape that denotes GNU_ARGS_SIZE.
	/// \p Size is stored in the offset slot and is read back via getOffset().
	static MCCFIInstruction createGnuArgsSize(MCSymbol *L, int Size) {
	  MCCFIInstruction Inst(OpGnuArgsSize, L, 0, Size, "");
	  return Inst;
	}

	/// Returns the kind of CFI operation stored in this instruction.
	OpType getOperation() const {
	  return Operation;
	}
	/// Returns the label symbol stored in this instruction.
	MCSymbol *getLabel() const {
	  return Label;
	}

	/// Returns the (first) register operand.
	/// Only valid for operations that carry a register; any other operation
	/// trips the assertion.
	unsigned getRegister() const {
	  assert(Operation == OpDefCfa || Operation == OpOffset ||
	         Operation == OpRestore || Operation == OpUndefined ||
	         Operation == OpSameValue || Operation == OpDefCfaRegister ||
	         Operation == OpRelOffset || Operation == OpRegister);
	  return Register;
	}

	/// Returns the second register operand.
	/// Only OpRegister instructions have one; asserts otherwise.
	unsigned getRegister2() const {
	  assert(Operation == OpRegister);
	  return Register2;
	}

	/// Returns the offset operand.
	/// Only valid for operations that carry an offset (including
	/// OpGnuArgsSize, which stores its size there); asserts otherwise.
	int getOffset() const {
	  assert(Operation == OpDefCfa || Operation == OpOffset ||
	         Operation == OpRelOffset || Operation == OpDefCfaOffset ||
	         Operation == OpAdjustCfaOffset || Operation == OpGnuArgsSize);
	  return Offset;
	}

	/// Returns the raw escape bytes of an OpEscape instruction as a
	/// non-owning view. Asserts if the operation is not OpEscape.
	StringRef getValues() const {
	  assert(Operation == OpEscape);
	  // data() is well-defined for an empty container, whereas indexing
	  // element 0 (&Values[0]) is undefined behaviour when there are no bytes.
	  return StringRef(Values.data(), Values.size());
	}
	+
	/// Returns the comment stored in this instruction. Unlike the other
	/// accessors it does not assert on the operation kind.
	+ StringRef getComment() const {
	+ return Comment;
	+ }
	};

	/// Plain record describing one DWARF frame. Every member has an in-class
	/// default, so a default-constructed object is fully initialised.
	/// NOTE(review): the per-field meanings are inferred from the names only;
	/// confirm against the frame emitter before relying on them.
	struct MCDwarfFrameInfo {
	  MCDwarfFrameInfo() = default;

	  // Symbols attached to the frame.
	  MCSymbol *Begin{nullptr};
	  MCSymbol *End{nullptr};
	  const MCSymbol *Personality{nullptr};
	  const MCSymbol *Lsda{nullptr};

	  // CFI instructions recorded for the frame.
	  std::vector<MCCFIInstruction> Instructions;

	  unsigned CurrentCfaRegister{0};
	  unsigned PersonalityEncoding{0};
	  unsigned LsdaEncoding{0};
	  uint32_t CompactUnwindEncoding{0};
	  bool IsSignalFrame{false};
	  bool IsSimple{false};
	  // Defaults to INT_MAX converted to unsigned.
	  unsigned RAReg{static_cast<unsigned>(INT_MAX)};
	  bool IsBKeyFrame{false};
	};

	// Collection of static entry points for emitting DWARF frame information.
	// Only declarations are visible here; the definitions live elsewhere.
	class MCDwarfFrameEmitter {
	public:
	//
	// This emits the frame info section.
	//
	static void Emit(MCObjectStreamer &streamer, MCAsmBackend *MAB, bool isEH);
	// NOTE(review): presumably emits an advance-location for AddrDelta through
	// Streamer -- confirm against the definition.
	static void EmitAdvanceLoc(MCObjectStreamer &Streamer, uint64_t AddrDelta);
	// Takes AddrDelta and an output stream OS. Offset and Size are optional
	// pointer parameters that default to nullptr.
	// NOTE(review): what is written through them is not visible here.
	static void EncodeAdvanceLoc(MCContext &Context, uint64_t AddrDelta,
	raw_ostream &OS, uint32_t *Offset = nullptr,
	uint32_t *Size = nullptr);
	};

	} // end namespace llvm

	#endif // LLVM_MC_MCDWARF_H
	diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
	index d81a9be26d39..b6a9a9568360 100644
	--- a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
	+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
	@@ -1,296 +1,297 @@
	//===-- AsmPrinterDwarf.cpp - AsmPrinter Dwarf Support --------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the Dwarf emissions parts of AsmPrinter.
	//
	//===----------------------------------------------------------------------===//

	#include "ByteStreamer.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/BinaryFormat/Dwarf.h"
	#include "llvm/CodeGen/AsmPrinter.h"
	#include "llvm/CodeGen/DIE.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCDwarf.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/MC/MCSection.h"
	#include "llvm/MC/MCStreamer.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/MC/MachineLocation.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Target/TargetLoweringObjectFile.h"
	#include "llvm/Target/TargetMachine.h"
	using namespace llvm;

	#define DEBUG_TYPE "asm-printer"

	//===----------------------------------------------------------------------===//
	// Dwarf Emission Helper Routines
	//===----------------------------------------------------------------------===//

	/// Emit \p Value as a signed LEB128 quantity. In verbose mode a non-null
	/// \p Desc is attached to the output as a comment first.
	void AsmPrinter::emitSLEB128(int64_t Value, const char *Desc) const {
	  const bool ShouldAnnotate = isVerbose() && Desc != nullptr;
	  if (ShouldAnnotate)
	    OutStreamer->AddComment(Desc);

	  OutStreamer->emitSLEB128IntValue(Value);
	}

	/// Emit \p Value as an unsigned LEB128 quantity, forwarding \p PadTo to the
	/// streamer. In verbose mode a non-null \p Desc is attached as a comment
	/// first.
	void AsmPrinter::emitULEB128(uint64_t Value, const char *Desc,
	                             unsigned PadTo) const {
	  const bool ShouldAnnotate = isVerbose() && Desc != nullptr;
	  if (ShouldAnnotate)
	    OutStreamer->AddComment(Desc);

	  OutStreamer->emitULEB128IntValue(Value, PadTo);
	}

	/// Emit the difference of two labels as a ULEB128, i.e. something like
	/// ".uleb128 Hi-Lo". The work is delegated to the output streamer.
	void AsmPrinter::emitLabelDifferenceAsULEB128(const MCSymbol *Hi,
	                                              const MCSymbol *Lo) const {
	  OutStreamer->emitAbsoluteSymbolDiffAsULEB128(Hi, Lo);
	}

	/// Map a DW_EH_PE_* pointer-encoding value to a short human-readable name
	/// for use in assembly comments. Combinations that are not listed yield
	/// "<unknown encoding>".
	static const char *DecodeDWARFEncoding(unsigned Encoding) {
	  switch (Encoding) {
	  // Plain value formats.
	  case dwarf::DW_EH_PE_absptr:
	    return "absptr";
	  case dwarf::DW_EH_PE_omit:
	    return "omit";
	  case dwarf::DW_EH_PE_pcrel:
	    return "pcrel";
	  case dwarf::DW_EH_PE_uleb128:
	    return "uleb128";
	  case dwarf::DW_EH_PE_sleb128:
	    return "sleb128";
	  case dwarf::DW_EH_PE_udata4:
	    return "udata4";
	  case dwarf::DW_EH_PE_udata8:
	    return "udata8";
	  case dwarf::DW_EH_PE_sdata4:
	    return "sdata4";
	  case dwarf::DW_EH_PE_sdata8:
	    return "sdata8";
	  // PC-relative formats.
	  case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata4:
	    return "pcrel udata4";
	  case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4:
	    return "pcrel sdata4";
	  case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8:
	    return "pcrel udata8";
	  case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8:
	    return "pcrel sdata8";
	  // Indirect PC-relative formats.
	  case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
	      dwarf::DW_EH_PE_udata4:
	    return "indirect pcrel udata4";
	  case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
	      dwarf::DW_EH_PE_sdata4:
	    return "indirect pcrel sdata4";
	  case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
	      dwarf::DW_EH_PE_udata8:
	    return "indirect pcrel udata8";
	  case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
	      dwarf::DW_EH_PE_sdata8:
	    return "indirect pcrel sdata8";
	  }

	  return "<unknown encoding>";
	}

	/// Emit a one-byte value \p Val that denotes a pointer encoding (a
	/// ".byte 42"-style directive). With verbose assembly enabled, a comment
	/// naming the encoding is attached; \p Desc optionally says what the
	/// encoding is for (e.g. "LSDA") and is used as a prefix of that comment.
	void AsmPrinter::emitEncodingByte(unsigned Val, const char *Desc) const {
	  if (isVerbose()) {
	    const char *EncodingName = DecodeDWARFEncoding(Val);
	    if (!Desc)
	      OutStreamer->AddComment(Twine("Encoding = ") + EncodingName);
	    else
	      OutStreamer->AddComment(Twine(Desc) + " Encoding = " +
	                              Twine(EncodingName));
	  }

	  OutStreamer->emitIntValue(Val, 1);
	}

	/// Return the size in bytes of a value written with \p Encoding.
	/// DW_EH_PE_omit occupies no bytes. Otherwise only the low three bits are
	/// inspected, and any format other than absptr/udata2/udata4/udata8 is
	/// treated as unreachable.
	unsigned AsmPrinter::GetSizeOfEncodedValue(unsigned Encoding) const {
	  if (Encoding == dwarf::DW_EH_PE_omit)
	    return 0;

	  const unsigned Format = Encoding & 0x07;
	  switch (Format) {
	  case dwarf::DW_EH_PE_absptr:
	    return MF->getDataLayout().getPointerSize();
	  case dwarf::DW_EH_PE_udata2:
	    return 2;
	  case dwarf::DW_EH_PE_udata4:
	    return 4;
	  case dwarf::DW_EH_PE_udata8:
	    return 8;
	  default:
	    llvm_unreachable("Invalid encoded value.");
	  }
	}

	void AsmPrinter::emitTTypeReference(const GlobalValue *GV,
	unsigned Encoding) const {
	if (GV) {
	const TargetLoweringObjectFile &TLOF = getObjFileLowering();

	const MCExpr *Exp =
	TLOF.getTTypeGlobalReference(GV, Encoding, TM, MMI, *OutStreamer);
	OutStreamer->emitValue(Exp, GetSizeOfEncodedValue(Encoding));
	} else
	OutStreamer->emitIntValue(0, GetSizeOfEncodedValue(Encoding));
	}

	/// Emit a 4-byte reference to \p Label.
	/// Unless \p ForceOffset is set, targets that need the section-offset
	/// directive get a .secrel32, and targets that use relocations across
	/// sections get the symbol value itself. In every other case the reference
	/// is emitted as the difference between the label and the begin symbol of
	/// its section.
	void AsmPrinter::emitDwarfSymbolReference(const MCSymbol *Label,
	                                          bool ForceOffset) const {
	  if (!ForceOffset) {
	    // On COFF targets, we have to emit the special .secrel32 directive.
	    if (MAI->needsDwarfSectionOffsetDirective()) {
	      OutStreamer->EmitCOFFSecRel32(Label, /*Offset=*/0);
	      return;
	    }

	    // If the format uses relocations with dwarf, refer to the symbol directly.
	    if (MAI->doesDwarfUseRelocationsAcrossSections()) {
	      OutStreamer->emitSymbolValue(Label, 4);
	      return;
	    }
	  }

	  // Otherwise, emit it as a label difference from the start of the section.
	  emitLabelDifference(Label, Label->getSection().getBeginSymbol(), 4);
	}

	/// Emit a reference to string pool entry \p S: a symbol reference when
	/// DWARF uses relocations across sections, the plain 32-bit offset
	/// otherwise.
	void AsmPrinter::emitDwarfStringOffset(DwarfStringPoolEntry S) const {
	  if (!MAI->doesDwarfUseRelocationsAcrossSections()) {
	    // Just emit the offset directly; no need for symbol math.
	    emitInt32(S.Offset);
	    return;
	  }

	  assert(S.Symbol && "No symbol available");
	  emitDwarfSymbolReference(S.Symbol);
	}

	/// Emit \p Label plus \p Offset as a 4-byte value.
	void AsmPrinter::emitDwarfOffset(const MCSymbol *Label,
	                                 uint64_t Offset) const {
	  // TODO: Support DWARF64
	  emitLabelPlusOffset(Label, Offset, 4);
	}

	void AsmPrinter::emitCallSiteOffset(const MCSymbol Hi, const MCSymbol Lo,
	unsigned Encoding) const {
	// The least significant 3 bits specify the width of the encoding
	if ((Encoding & 0x7) == dwarf::DW_EH_PE_uleb128)
	emitLabelDifferenceAsULEB128(Hi, Lo);
	else
	emitLabelDifference(Hi, Lo, GetSizeOfEncodedValue(Encoding));
	}

	void AsmPrinter::emitCallSiteValue(uint64_t Value, unsigned Encoding) const {
	// The least significant 3 bits specify the width of the encoding
	if ((Encoding & 0x7) == dwarf::DW_EH_PE_uleb128)
	emitULEB128(Value);
	else
	OutStreamer->emitIntValue(Value, GetSizeOfEncodedValue(Encoding));
	}

	//===----------------------------------------------------------------------===//
	// Dwarf Lowering Routines
	//===----------------------------------------------------------------------===//

	// Forward a single CFI instruction to the matching emitCFI* call on the
	// output streamer. Operations without a case below (for example
	// OpRememberState, OpRestoreState and OpRelOffset) hit llvm_unreachable.
	void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const {
	switch (Inst.getOperation()) {
	default:
	llvm_unreachable("Unexpected instruction");
	case MCCFIInstruction::OpDefCfaOffset:
	OutStreamer->emitCFIDefCfaOffset(Inst.getOffset());
	break;
	case MCCFIInstruction::OpAdjustCfaOffset:
	OutStreamer->emitCFIAdjustCfaOffset(Inst.getOffset());
	break;
	case MCCFIInstruction::OpDefCfa:
	OutStreamer->emitCFIDefCfa(Inst.getRegister(), Inst.getOffset());
	break;
	case MCCFIInstruction::OpDefCfaRegister:
	OutStreamer->emitCFIDefCfaRegister(Inst.getRegister());
	break;
	case MCCFIInstruction::OpOffset:
	OutStreamer->emitCFIOffset(Inst.getRegister(), Inst.getOffset());
	break;
	case MCCFIInstruction::OpRegister:
	OutStreamer->emitCFIRegister(Inst.getRegister(), Inst.getRegister2());
	break;
	case MCCFIInstruction::OpWindowSave:
	OutStreamer->emitCFIWindowSave();
	break;
	case MCCFIInstruction::OpNegateRAState:
	OutStreamer->emitCFINegateRAState();
	break;
	case MCCFIInstruction::OpSameValue:
	OutStreamer->emitCFISameValue(Inst.getRegister());
	break;
	case MCCFIInstruction::OpGnuArgsSize:
	OutStreamer->emitCFIGnuArgsSize(Inst.getOffset());
	break;
	case MCCFIInstruction::OpEscape:
	// The instruction's comment is handed to the streamer just before the
	// escape bytes. This call is not guarded by isVerbose().
	+ OutStreamer->AddComment(Inst.getComment());
	OutStreamer->emitCFIEscape(Inst.getValues());
	break;
	case MCCFIInstruction::OpRestore:
	OutStreamer->emitCFIRestore(Inst.getRegister());
	break;
	case MCCFIInstruction::OpUndefined:
	OutStreamer->emitCFIUndefined(Inst.getRegister());
	break;
	}
	}

	void AsmPrinter::emitDwarfDIE(const DIE &Die) const {
	// Emit the code (index) for the abbreviation.
	if (isVerbose())
	OutStreamer->AddComment("Abbrev [" + Twine(Die.getAbbrevNumber()) + "] 0x" +
	Twine::utohexstr(Die.getOffset()) + ":0x" +
	Twine::utohexstr(Die.getSize()) + " " +
	dwarf::TagString(Die.getTag()));
	emitULEB128(Die.getAbbrevNumber());

	// Emit the DIE attribute values.
	for (const auto &V : Die.values()) {
	dwarf::Attribute Attr = V.getAttribute();
	assert(V.getForm() && "Too many attributes for DIE (check abbreviation)");

	if (isVerbose()) {
	OutStreamer->AddComment(dwarf::AttributeString(Attr));
	if (Attr == dwarf::DW_AT_accessibility)
	OutStreamer->AddComment(
	dwarf::AccessibilityString(V.getDIEInteger().getValue()));
	}

	// Emit an attribute using the defined form.
	V.emitValue(this);
	}

	// Emit the DIE children if any.
	if (Die.hasChildren()) {
	for (auto &Child : Die.children())
	emitDwarfDIE(Child);

	OutStreamer->AddComment("End Of Children Mark");
	emitInt8(0);
	}
	}

	/// Emit one abbreviation: its code (a base-1 index) as a ULEB128 followed
	/// by the abbreviation data, which \p Abbrev emits itself.
	void AsmPrinter::emitDwarfAbbrev(const DIEAbbrev &Abbrev) const {
	  const auto Code = Abbrev.getNumber();
	  emitULEB128(Code, "Abbreviation Code");

	  Abbrev.Emit(this);
	}
	diff --git a/contrib/llvm-project/llvm/lib/Support/X86TargetParser.cpp b/contrib/llvm-project/llvm/lib/Support/X86TargetParser.cpp
	index 572d1203aaf2..c629f872df12 100644
	--- a/contrib/llvm-project/llvm/lib/Support/X86TargetParser.cpp
	+++ b/contrib/llvm-project/llvm/lib/Support/X86TargetParser.cpp
	@@ -1,595 +1,614 @@
	//===-- X86TargetParser - Parser for X86 features ---------------- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements a target parser to recognise X86 hardware features.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Support/X86TargetParser.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/ADT/Triple.h"

	using namespace llvm;
	using namespace llvm::X86;

	namespace {

	/// Container class for CPU features.
	/// This is a constexpr reimplementation of a subset of std::bitset. It would be
	/// nice to use std::bitset directly, but it doesn't support constant
	/// initialization.
	class FeatureBitset {
	// Number of 32-bit words needed for one bit per feature, rounded up.
	static constexpr unsigned NUM_FEATURE_WORDS =
	(X86::CPU_FEATURE_MAX + 31) / 32;

	// This cannot be a std::array, operator[] is not constexpr until C++17.
	uint32_t Bits[NUM_FEATURE_WORDS] = {};

	public:
	// The default-constructed set has every bit cleared.
	constexpr FeatureBitset() = default;
	// Builds a set in which exactly the listed feature indices are turned on.
	constexpr FeatureBitset(std::initializer_list<unsigned> Init) {
	for (auto I : Init)
	set(I);
	}

	// Returns true if at least one bit is set. Each 32-bit word is widened to
	// uint64_t by the lambda parameter, which does not change the result.
	+ bool any() const {
	+ return llvm::any_of(Bits, [](uint64_t V) { return V != 0; });
	+ }
	+
	// Turns on bit I (word I / 32, bit I % 32) and returns *this for chaining.
	constexpr FeatureBitset &set(unsigned I) {
	// GCC <6.2 crashes if this is written in a single statement.
	uint32_t NewBits = Bits[I / 32] \| (uint32_t(1) << (I % 32));
	Bits[I / 32] = NewBits;
	return *this;
	}

	// Tests whether bit I is set.
	constexpr bool operator[](unsigned I) const {
	uint32_t Mask = uint32_t(1) << (I % 32);
	return (Bits[I / 32] & Mask) != 0;
	}

	// In-place intersection with RHS.
	constexpr FeatureBitset &operator&=(const FeatureBitset &RHS) {
	for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I) {
	// GCC <6.2 crashes if this is written in a single statement.
	uint32_t NewBits = Bits[I] & RHS.Bits[I];
	Bits[I] = NewBits;
	}
	return *this;
	}

	// In-place union with RHS.
	constexpr FeatureBitset &operator\|=(const FeatureBitset &RHS) {
	for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I) {
	// GCC <6.2 crashes if this is written in a single statement.
	uint32_t NewBits = Bits[I] \| RHS.Bits[I];
	Bits[I] = NewBits;
	}
	return *this;
	}

	// Intersection, returned as a new set.
	// gcc 5.3 miscompiles this if we try to write this using operator&=.
	constexpr FeatureBitset operator&(const FeatureBitset &RHS) const {
	FeatureBitset Result;
	for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I)
	Result.Bits[I] = Bits[I] & RHS.Bits[I];
	return Result;
	}

	// Union, returned as a new set.
	// gcc 5.3 miscompiles this if we try to write this using operator|=.
	constexpr FeatureBitset operator\|(const FeatureBitset &RHS) const {
	FeatureBitset Result;
	for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I)
	Result.Bits[I] = Bits[I] \| RHS.Bits[I];
	return Result;
	}

	// Complement of every word. Bits at positions >= CPU_FEATURE_MAX in the
	// last word are flipped as well.
	constexpr FeatureBitset operator~() const {
	FeatureBitset Result;
	for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I)
	Result.Bits[I] = ~Bits[I];
	return Result;
	}
	+
	// Returns true as soon as one word differs between the two sets.
	+ constexpr bool operator!=(const FeatureBitset &RHS) const {
	+ for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I)
	+ if (Bits[I] != RHS.Bits[I])
	+ return true;
	+ return false;
	+ }
	};

	// One row of the Processors table. The member order must match the brace
	// initializers used in that table.
	struct ProcInfo {
	// CPU name that parseArchX86 compares against.
	StringLiteral Name;
	// CPU kind reported for this name.
	X86::CPUKind Kind;
	// Feature returned by getKeyFeature; ~0U marks "no key feature".
	unsigned KeyFeature;
	// Feature set of the CPU; its FEATURE_64BIT bit drives the Only64Bit filters.
	FeatureBitset Features;
	};

	// Pairs a feature name with a feature set.
	// NOTE(review): presumably one entry per X86 feature, with ImpliedFeatures
	// holding the ImpliedFeatures<NAME> constant -- the table that uses this
	// struct is not visible here.
	struct FeatureInfo {
	StringLiteral Name;
	FeatureBitset ImpliedFeatures;
	};

	} // end anonymous namespace

	// For every X86_FEATURE entry of the .def file, define a constant
	// Feature<ENUM> whose only set bit is X86::FEATURE_<ENUM>. The STRING
	// argument is not used by this expansion.
	#define X86_FEATURE(ENUM, STRING) \
	static constexpr FeatureBitset Feature##ENUM = {X86::FEATURE_##ENUM};
	#include "llvm/Support/X86TargetParser.def"

	// Pentium with MMX.
	static constexpr FeatureBitset FeaturesPentiumMMX =
	FeatureX87 \| FeatureCMPXCHG8B \| FeatureMMX;

	// Pentium 2 and 3.
	static constexpr FeatureBitset FeaturesPentium2 =
	FeatureX87 \| FeatureCMPXCHG8B \| FeatureMMX \| FeatureFXSR;
	static constexpr FeatureBitset FeaturesPentium3 = FeaturesPentium2 \| FeatureSSE;

	// Pentium 4 CPUs
	static constexpr FeatureBitset FeaturesPentium4 =
	FeaturesPentium3 \| FeatureSSE2;
	static constexpr FeatureBitset FeaturesPrescott =
	FeaturesPentium4 \| FeatureSSE3;
	static constexpr FeatureBitset FeaturesNocona =
	FeaturesPrescott \| Feature64BIT \| FeatureCMPXCHG16B;

	// Basic 64-bit capable CPU.
	static constexpr FeatureBitset FeaturesX86_64 = FeaturesPentium4 \| Feature64BIT;

	// Intel Core CPUs
	static constexpr FeatureBitset FeaturesCore2 =
	FeaturesNocona \| FeatureSAHF \| FeatureSSSE3;
	static constexpr FeatureBitset FeaturesPenryn = FeaturesCore2 \| FeatureSSE4_1;
	static constexpr FeatureBitset FeaturesNehalem =
	FeaturesPenryn \| FeaturePOPCNT \| FeatureSSE4_2;
	static constexpr FeatureBitset FeaturesWestmere =
	FeaturesNehalem \| FeaturePCLMUL;
	static constexpr FeatureBitset FeaturesSandyBridge =
	FeaturesWestmere \| FeatureAVX \| FeatureXSAVE \| FeatureXSAVEOPT;
	static constexpr FeatureBitset FeaturesIvyBridge =
	FeaturesSandyBridge \| FeatureF16C \| FeatureFSGSBASE \| FeatureRDRND;
	static constexpr FeatureBitset FeaturesHaswell =
	FeaturesIvyBridge \| FeatureAVX2 \| FeatureBMI \| FeatureBMI2 \| FeatureFMA \|
	FeatureINVPCID \| FeatureLZCNT \| FeatureMOVBE;
	static constexpr FeatureBitset FeaturesBroadwell =
	FeaturesHaswell \| FeatureADX \| FeaturePRFCHW \| FeatureRDSEED;

	// Intel Knights Landing and Knights Mill
	// Knights Landing has feature parity with Broadwell.
	static constexpr FeatureBitset FeaturesKNL =
	FeaturesBroadwell \| FeatureAES \| FeatureAVX512F \| FeatureAVX512CD \|
	FeatureAVX512ER \| FeatureAVX512PF \| FeaturePREFETCHWT1;
	static constexpr FeatureBitset FeaturesKNM =
	FeaturesKNL \| FeatureAVX512VPOPCNTDQ;

	// Intel Skylake processors.
	static constexpr FeatureBitset FeaturesSkylakeClient =
	FeaturesBroadwell \| FeatureAES \| FeatureCLFLUSHOPT \| FeatureXSAVEC \|
	FeatureXSAVES \| FeatureSGX;
	// SkylakeServer inherits all SkylakeClient features except SGX.
	// FIXME: That doesn't match gcc.
	static constexpr FeatureBitset FeaturesSkylakeServer =
	(FeaturesSkylakeClient & ~FeatureSGX) \| FeatureAVX512F \| FeatureAVX512CD \|
	FeatureAVX512DQ \| FeatureAVX512BW \| FeatureAVX512VL \| FeatureCLWB \|
	FeaturePKU;
	static constexpr FeatureBitset FeaturesCascadeLake =
	FeaturesSkylakeServer \| FeatureAVX512VNNI;
	static constexpr FeatureBitset FeaturesCooperLake =
	FeaturesCascadeLake \| FeatureAVX512BF16;

	// Intel 10nm processors.
	static constexpr FeatureBitset FeaturesCannonlake =
	FeaturesSkylakeClient \| FeatureAVX512F \| FeatureAVX512CD \| FeatureAVX512DQ \|
	FeatureAVX512BW \| FeatureAVX512VL \| FeatureAVX512IFMA \| FeatureAVX512VBMI \|
	FeaturePKU \| FeatureSHA;
	static constexpr FeatureBitset FeaturesICLClient =
	FeaturesCannonlake \| FeatureAVX512BITALG \| FeatureAVX512VBMI2 \|
	FeatureAVX512VNNI \| FeatureAVX512VPOPCNTDQ \| FeatureCLWB \| FeatureGFNI \|
	FeatureRDPID \| FeatureVAES \| FeatureVPCLMULQDQ;
	static constexpr FeatureBitset FeaturesICLServer =
	FeaturesICLClient \| FeaturePCONFIG \| FeatureWBNOINVD;
	static constexpr FeatureBitset FeaturesTigerlake =
	FeaturesICLClient \| FeatureAVX512VP2INTERSECT \| FeatureMOVDIR64B \|
	FeatureMOVDIRI \| FeatureSHSTK;

	// Intel Atom processors.
	// Bonnell has feature parity with Core2 and adds MOVBE.
	static constexpr FeatureBitset FeaturesBonnell = FeaturesCore2 \| FeatureMOVBE;
	// Silvermont has parity with Westmere and Bonnell plus PRFCHW and RDRND.
	static constexpr FeatureBitset FeaturesSilvermont =
	FeaturesBonnell \| FeaturesWestmere \| FeaturePRFCHW \| FeatureRDRND;
	static constexpr FeatureBitset FeaturesGoldmont =
	FeaturesSilvermont \| FeatureAES \| FeatureCLFLUSHOPT \| FeatureFSGSBASE \|
	FeatureRDSEED \| FeatureSHA \| FeatureXSAVE \| FeatureXSAVEC \|
	FeatureXSAVEOPT \| FeatureXSAVES;
	static constexpr FeatureBitset FeaturesGoldmontPlus =
	FeaturesGoldmont \| FeaturePTWRITE \| FeatureRDPID \| FeatureSGX;
	static constexpr FeatureBitset FeaturesTremont =
	FeaturesGoldmontPlus \| FeatureCLWB \| FeatureGFNI;

	// Geode Processor.
	static constexpr FeatureBitset FeaturesGeode =
	FeatureX87 \| FeatureCMPXCHG8B \| FeatureMMX \| Feature3DNOW \| Feature3DNOWA;

	// K6 processor.
	static constexpr FeatureBitset FeaturesK6 =
	FeatureX87 \| FeatureCMPXCHG8B \| FeatureMMX;

	// K7 and K8 architecture processors.
	static constexpr FeatureBitset FeaturesAthlon =
	FeatureX87 \| FeatureCMPXCHG8B \| FeatureMMX \| Feature3DNOW \| Feature3DNOWA;
	static constexpr FeatureBitset FeaturesAthlonXP =
	FeaturesAthlon \| FeatureFXSR \| FeatureSSE;
	static constexpr FeatureBitset FeaturesK8 =
	FeaturesAthlonXP \| FeatureSSE2 \| Feature64BIT;
	static constexpr FeatureBitset FeaturesK8SSE3 = FeaturesK8 \| FeatureSSE3;
	static constexpr FeatureBitset FeaturesAMDFAM10 =
	FeaturesK8SSE3 \| FeatureCMPXCHG16B \| FeatureLZCNT \| FeaturePOPCNT \|
	FeaturePRFCHW \| FeatureSAHF \| FeatureSSE4_A;

	// Bobcat architecture processors.
	static constexpr FeatureBitset FeaturesBTVER1 =
	FeatureX87 \| FeatureCMPXCHG8B \| FeatureCMPXCHG16B \| Feature64BIT \|
	FeatureFXSR \| FeatureLZCNT \| FeatureMMX \| FeaturePOPCNT \| FeaturePRFCHW \|
	FeatureSSE \| FeatureSSE2 \| FeatureSSE3 \| FeatureSSSE3 \| FeatureSSE4_A \|
	FeatureSAHF;
	static constexpr FeatureBitset FeaturesBTVER2 =
	FeaturesBTVER1 \| FeatureAES \| FeatureAVX \| FeatureBMI \| FeatureF16C \|
	FeatureMOVBE \| FeaturePCLMUL \| FeatureXSAVE \| FeatureXSAVEOPT;

	// AMD Bulldozer architecture processors.
	static constexpr FeatureBitset FeaturesBDVER1 =
	FeatureX87 \| FeatureAES \| FeatureAVX \| FeatureCMPXCHG8B \|
	FeatureCMPXCHG16B \| Feature64BIT \| FeatureFMA4 \| FeatureFXSR \| FeatureLWP \|
	FeatureLZCNT \| FeatureMMX \| FeaturePCLMUL \| FeaturePOPCNT \| FeaturePRFCHW \|
	FeatureSAHF \| FeatureSSE \| FeatureSSE2 \| FeatureSSE3 \| FeatureSSSE3 \|
	FeatureSSE4_1 \| FeatureSSE4_2 \| FeatureSSE4_A \| FeatureXOP \| FeatureXSAVE;
	static constexpr FeatureBitset FeaturesBDVER2 =
	FeaturesBDVER1 \| FeatureBMI \| FeatureFMA \| FeatureF16C \| FeatureTBM;
	static constexpr FeatureBitset FeaturesBDVER3 =
	FeaturesBDVER2 \| FeatureFSGSBASE \| FeatureXSAVEOPT;
	static constexpr FeatureBitset FeaturesBDVER4 =
	FeaturesBDVER3 \| FeatureAVX2 \| FeatureBMI2 \| FeatureMOVBE \| FeatureMWAITX \|
	FeatureRDRND;

	// AMD Zen architecture processors.
	static constexpr FeatureBitset FeaturesZNVER1 =
	FeatureX87 \| FeatureADX \| FeatureAES \| FeatureAVX \| FeatureAVX2 \|
	FeatureBMI \| FeatureBMI2 \| FeatureCLFLUSHOPT \| FeatureCLZERO \|
	FeatureCMPXCHG8B \| FeatureCMPXCHG16B \| Feature64BIT \| FeatureF16C \|
	FeatureFMA \| FeatureFSGSBASE \| FeatureFXSR \| FeatureLZCNT \| FeatureMMX \|
	FeatureMOVBE \| FeatureMWAITX \| FeaturePCLMUL \| FeaturePOPCNT \|
	FeaturePRFCHW \| FeatureRDRND \| FeatureRDSEED \| FeatureSAHF \| FeatureSHA \|
	FeatureSSE \| FeatureSSE2 \| FeatureSSE3 \| FeatureSSSE3 \| FeatureSSE4_1 \|
	FeatureSSE4_2 \| FeatureSSE4_A \| FeatureXSAVE \| FeatureXSAVEC \|
	FeatureXSAVEOPT \| FeatureXSAVES;
	static constexpr FeatureBitset FeaturesZNVER2 =
	FeaturesZNVER1 \| FeatureCLWB \| FeatureRDPID \| FeatureWBNOINVD;

	// Table of known CPUs. Each row is {name, kind, key feature, feature set};
	// a key feature of ~0U means the CPU has none (getKeyFeature asserts on
	// it). Several names may map to the same CPUKind.
	static constexpr ProcInfo Processors[] = {
	  // Empty processor. Include X87 and CMPXCHG8 for backwards compatibility.
	  { {""}, CK_None, ~0U, FeatureX87 | FeatureCMPXCHG8B },
	  // i386-generation processors.
	  { {"i386"}, CK_i386, ~0U, FeatureX87 },
	  // i486-generation processors.
	  { {"i486"}, CK_i486, ~0U, FeatureX87 },
	  { {"winchip-c6"}, CK_WinChipC6, ~0U, FeaturesPentiumMMX },
	  { {"winchip2"}, CK_WinChip2, ~0U, FeaturesPentiumMMX | Feature3DNOW },
	  { {"c3"}, CK_C3, ~0U, FeaturesPentiumMMX | Feature3DNOW },
	  // i586-generation processors, P5 microarchitecture based.
	  { {"i586"}, CK_i586, ~0U, FeatureX87 | FeatureCMPXCHG8B },
	  { {"pentium"}, CK_Pentium, ~0U, FeatureX87 | FeatureCMPXCHG8B },
	  { {"pentium-mmx"}, CK_PentiumMMX, ~0U, FeaturesPentiumMMX },
	  // i686-generation processors, P6 / Pentium M microarchitecture based.
	  { {"pentiumpro"}, CK_PentiumPro, ~0U, FeatureX87 | FeatureCMPXCHG8B },
	  { {"i686"}, CK_i686, ~0U, FeatureX87 | FeatureCMPXCHG8B },
	  { {"pentium2"}, CK_Pentium2, ~0U, FeaturesPentium2 },
	  { {"pentium3"}, CK_Pentium3, ~0U, FeaturesPentium3 },
	  { {"pentium3m"}, CK_Pentium3, ~0U, FeaturesPentium3 },
	  { {"pentium-m"}, CK_PentiumM, ~0U, FeaturesPentium4 },
	  { {"c3-2"}, CK_C3_2, ~0U, FeaturesPentium3 },
	  { {"yonah"}, CK_Yonah, ~0U, FeaturesPrescott },
	  // Netburst microarchitecture based processors.
	  { {"pentium4"}, CK_Pentium4, ~0U, FeaturesPentium4 },
	  { {"pentium4m"}, CK_Pentium4, ~0U, FeaturesPentium4 },
	  { {"prescott"}, CK_Prescott, ~0U, FeaturesPrescott },
	  { {"nocona"}, CK_Nocona, ~0U, FeaturesNocona },
	  // Core microarchitecture based processors.
	  { {"core2"}, CK_Core2, ~0U, FeaturesCore2 },
	  { {"penryn"}, CK_Penryn, ~0U, FeaturesPenryn },
	  // Atom processors
	  { {"bonnell"}, CK_Bonnell, FEATURE_SSSE3, FeaturesBonnell },
	  { {"atom"}, CK_Bonnell, FEATURE_SSSE3, FeaturesBonnell },
	  { {"silvermont"}, CK_Silvermont, FEATURE_SSE4_2, FeaturesSilvermont },
	  { {"slm"}, CK_Silvermont, FEATURE_SSE4_2, FeaturesSilvermont },
	  { {"goldmont"}, CK_Goldmont, FEATURE_SSE4_2, FeaturesGoldmont },
	  { {"goldmont-plus"}, CK_GoldmontPlus, FEATURE_SSE4_2, FeaturesGoldmontPlus },
	  { {"tremont"}, CK_Tremont, FEATURE_SSE4_2, FeaturesTremont },
	  // Nehalem microarchitecture based processors.
	  { {"nehalem"}, CK_Nehalem, FEATURE_SSE4_2, FeaturesNehalem },
	  { {"corei7"}, CK_Nehalem, FEATURE_SSE4_2, FeaturesNehalem },
	  // Westmere microarchitecture based processors.
	  { {"westmere"}, CK_Westmere, FEATURE_PCLMUL, FeaturesWestmere },
	  // Sandy Bridge microarchitecture based processors.
	  { {"sandybridge"}, CK_SandyBridge, FEATURE_AVX, FeaturesSandyBridge },
	  { {"corei7-avx"}, CK_SandyBridge, FEATURE_AVX, FeaturesSandyBridge },
	  // Ivy Bridge microarchitecture based processors.
	  { {"ivybridge"}, CK_IvyBridge, FEATURE_AVX, FeaturesIvyBridge },
	  { {"core-avx-i"}, CK_IvyBridge, FEATURE_AVX, FeaturesIvyBridge },
	  // Haswell microarchitecture based processors.
	  { {"haswell"}, CK_Haswell, FEATURE_AVX2, FeaturesHaswell },
	  { {"core-avx2"}, CK_Haswell, FEATURE_AVX2, FeaturesHaswell },
	  // Broadwell microarchitecture based processors.
	  { {"broadwell"}, CK_Broadwell, FEATURE_AVX2, FeaturesBroadwell },
	  // Skylake client microarchitecture based processors.
	  { {"skylake"}, CK_SkylakeClient, FEATURE_AVX2, FeaturesSkylakeClient },
	  // Skylake server microarchitecture based processors.
	  { {"skylake-avx512"}, CK_SkylakeServer, FEATURE_AVX512F, FeaturesSkylakeServer },
	  { {"skx"}, CK_SkylakeServer, FEATURE_AVX512F, FeaturesSkylakeServer },
	  // Cascadelake Server microarchitecture based processors.
	  { {"cascadelake"}, CK_Cascadelake, FEATURE_AVX512VNNI, FeaturesCascadeLake },
	  // Cooperlake Server microarchitecture based processors.
	  { {"cooperlake"}, CK_Cooperlake, FEATURE_AVX512BF16, FeaturesCooperLake },
	  // Cannonlake client microarchitecture based processors.
	  { {"cannonlake"}, CK_Cannonlake, FEATURE_AVX512VBMI, FeaturesCannonlake },
	  // Icelake client microarchitecture based processors.
	  { {"icelake-client"}, CK_IcelakeClient, FEATURE_AVX512VBMI2, FeaturesICLClient },
	  // Icelake server microarchitecture based processors.
	  { {"icelake-server"}, CK_IcelakeServer, FEATURE_AVX512VBMI2, FeaturesICLServer },
	  // Tigerlake microarchitecture based processors.
	  { {"tigerlake"}, CK_Tigerlake, FEATURE_AVX512VP2INTERSECT, FeaturesTigerlake },
	  // Knights Landing processor.
	  { {"knl"}, CK_KNL, FEATURE_AVX512F, FeaturesKNL },
	  // Knights Mill processor.
	  { {"knm"}, CK_KNM, FEATURE_AVX5124FMAPS, FeaturesKNM },
	  // Lakemont microarchitecture based processors.
	  { {"lakemont"}, CK_Lakemont, ~0U, FeatureCMPXCHG8B },
	  // K6 architecture processors.
	  { {"k6"}, CK_K6, ~0U, FeaturesK6 },
	  { {"k6-2"}, CK_K6_2, ~0U, FeaturesK6 | Feature3DNOW },
	  { {"k6-3"}, CK_K6_3, ~0U, FeaturesK6 | Feature3DNOW },
	  // K7 architecture processors.
	  { {"athlon"}, CK_Athlon, ~0U, FeaturesAthlon },
	  { {"athlon-tbird"}, CK_Athlon, ~0U, FeaturesAthlon },
	  { {"athlon-xp"}, CK_AthlonXP, ~0U, FeaturesAthlonXP },
	  { {"athlon-mp"}, CK_AthlonXP, ~0U, FeaturesAthlonXP },
	  { {"athlon-4"}, CK_AthlonXP, ~0U, FeaturesAthlonXP },
	  // K8 architecture processors.
	  { {"k8"}, CK_K8, ~0U, FeaturesK8 },
	  { {"athlon64"}, CK_K8, ~0U, FeaturesK8 },
	  { {"athlon-fx"}, CK_K8, ~0U, FeaturesK8 },
	  { {"opteron"}, CK_K8, ~0U, FeaturesK8 },
	  { {"k8-sse3"}, CK_K8SSE3, ~0U, FeaturesK8SSE3 },
	  { {"athlon64-sse3"}, CK_K8SSE3, ~0U, FeaturesK8SSE3 },
	  { {"opteron-sse3"}, CK_K8SSE3, ~0U, FeaturesK8SSE3 },
	  { {"amdfam10"}, CK_AMDFAM10, FEATURE_SSE4_A, FeaturesAMDFAM10 },
	  { {"barcelona"}, CK_AMDFAM10, FEATURE_SSE4_A, FeaturesAMDFAM10 },
	  // Bobcat architecture processors.
	  { {"btver1"}, CK_BTVER1, FEATURE_SSE4_A, FeaturesBTVER1 },
	  { {"btver2"}, CK_BTVER2, FEATURE_BMI, FeaturesBTVER2 },
	  // Bulldozer architecture processors.
	  { {"bdver1"}, CK_BDVER1, FEATURE_XOP, FeaturesBDVER1 },
	  { {"bdver2"}, CK_BDVER2, FEATURE_FMA, FeaturesBDVER2 },
	  { {"bdver3"}, CK_BDVER3, FEATURE_FMA, FeaturesBDVER3 },
	  { {"bdver4"}, CK_BDVER4, FEATURE_AVX2, FeaturesBDVER4 },
	  // Zen architecture processors.
	  { {"znver1"}, CK_ZNVER1, FEATURE_AVX2, FeaturesZNVER1 },
	  { {"znver2"}, CK_ZNVER2, FEATURE_AVX2, FeaturesZNVER2 },
	  // Generic 64-bit processor.
	  { {"x86-64"}, CK_x86_64, ~0U, FeaturesX86_64 },
	  // Geode processors.
	  { {"geode"}, CK_Geode, ~0U, FeaturesGeode },
	};

	/// Look up \p CPU by name in the Processors table and return its kind.
	/// When \p Only64Bit is set, entries whose feature set lacks FEATURE_64BIT
	/// are ignored. Returns CK_None if no entry qualifies.
	X86::CPUKind llvm::X86::parseArchX86(StringRef CPU, bool Only64Bit) {
	  for (const auto &P : Processors)
	    if (P.Name == CPU && (P.Features[FEATURE_64BIT] || !Only64Bit))
	      return P.Kind;

	  return CK_None;
	}

	/// Append the name of every named entry of the Processors table to
	/// \p Values. The entry with the empty name is skipped, and so are CPUs
	/// without FEATURE_64BIT when \p Only64Bit is set.
	void llvm::X86::fillValidCPUArchList(SmallVectorImpl<StringRef> &Values,
	                                     bool Only64Bit) {
	  for (const auto &P : Processors)
	    if (!P.Name.empty() && (P.Features[FEATURE_64BIT] || !Only64Bit))
	      Values.emplace_back(P.Name);
	}

	/// Return the key feature of the first Processors entry whose kind equals
	/// \p Kind. Asserts that the entry has a key feature (i.e. its value is
	/// not ~0U). An unknown kind is treated as unreachable.
	ProcessorFeatures llvm::X86::getKeyFeature(X86::CPUKind Kind) {
	  // FIXME: Can we avoid a linear search here? The table might be sorted by
	  // CPUKind so we could binary search?
	  for (const auto &P : Processors) {
	    if (P.Kind != Kind)
	      continue;

	    assert(P.KeyFeature != ~0U && "Processor does not have a key feature.");
	    return static_cast<ProcessorFeatures>(P.KeyFeature);
	  }

	  llvm_unreachable("Unable to find CPU kind!");
	}

	// ImpliedFeatures<NAME> holds the features that <NAME> directly depends on.
	// An empty set ({}) means the feature depends on nothing else.

	// Features with no dependencies.
	static constexpr FeatureBitset ImpliedFeatures64BIT = {};
	static constexpr FeatureBitset ImpliedFeaturesADX = {};
	static constexpr FeatureBitset ImpliedFeaturesBMI = {};
	static constexpr FeatureBitset ImpliedFeaturesBMI2 = {};
	static constexpr FeatureBitset ImpliedFeaturesCLDEMOTE = {};
	static constexpr FeatureBitset ImpliedFeaturesCLFLUSHOPT = {};
	static constexpr FeatureBitset ImpliedFeaturesCLWB = {};
	static constexpr FeatureBitset ImpliedFeaturesCLZERO = {};
	static constexpr FeatureBitset ImpliedFeaturesCMOV = {};
	static constexpr FeatureBitset ImpliedFeaturesCMPXCHG16B = {};
	static constexpr FeatureBitset ImpliedFeaturesCMPXCHG8B = {};
	static constexpr FeatureBitset ImpliedFeaturesENQCMD = {};
	static constexpr FeatureBitset ImpliedFeaturesFSGSBASE = {};
	static constexpr FeatureBitset ImpliedFeaturesFXSR = {};
	static constexpr FeatureBitset ImpliedFeaturesINVPCID = {};
	static constexpr FeatureBitset ImpliedFeaturesLWP = {};
	static constexpr FeatureBitset ImpliedFeaturesLZCNT = {};
	static constexpr FeatureBitset ImpliedFeaturesMWAITX = {};
	static constexpr FeatureBitset ImpliedFeaturesMOVBE = {};
	static constexpr FeatureBitset ImpliedFeaturesMOVDIR64B = {};
	static constexpr FeatureBitset ImpliedFeaturesMOVDIRI = {};
	static constexpr FeatureBitset ImpliedFeaturesPCONFIG = {};
	static constexpr FeatureBitset ImpliedFeaturesPOPCNT = {};
	static constexpr FeatureBitset ImpliedFeaturesPKU = {};
	static constexpr FeatureBitset ImpliedFeaturesPREFETCHWT1 = {};
	static constexpr FeatureBitset ImpliedFeaturesPRFCHW = {};
	static constexpr FeatureBitset ImpliedFeaturesPTWRITE = {};
	static constexpr FeatureBitset ImpliedFeaturesRDPID = {};
	static constexpr FeatureBitset ImpliedFeaturesRDRND = {};
	static constexpr FeatureBitset ImpliedFeaturesRDSEED = {};
	static constexpr FeatureBitset ImpliedFeaturesRTM = {};
	static constexpr FeatureBitset ImpliedFeaturesSAHF = {};
	static constexpr FeatureBitset ImpliedFeaturesSERIALIZE = {};
	static constexpr FeatureBitset ImpliedFeaturesSGX = {};
	static constexpr FeatureBitset ImpliedFeaturesSHSTK = {};
	static constexpr FeatureBitset ImpliedFeaturesTBM = {};
	static constexpr FeatureBitset ImpliedFeaturesTSXLDTRK = {};
	static constexpr FeatureBitset ImpliedFeaturesWAITPKG = {};
	static constexpr FeatureBitset ImpliedFeaturesWBNOINVD = {};
	static constexpr FeatureBitset ImpliedFeaturesVZEROUPPER = {};
	static constexpr FeatureBitset ImpliedFeaturesX87 = {};
	static constexpr FeatureBitset ImpliedFeaturesXSAVE = {};

	// Not really CPU features, but need to be in the table because clang uses
	// target features to communicate them to the backend.
	static constexpr FeatureBitset ImpliedFeaturesRETPOLINE_EXTERNAL_THUNK = {};
	static constexpr FeatureBitset ImpliedFeaturesRETPOLINE_INDIRECT_BRANCHES = {};
	static constexpr FeatureBitset ImpliedFeaturesRETPOLINE_INDIRECT_CALLS = {};
	static constexpr FeatureBitset ImpliedFeaturesLVI_CFI = {};
	static constexpr FeatureBitset ImpliedFeaturesLVI_LOAD_HARDENING = {};

	// XSAVE features are dependent on basic XSAVE.
	static constexpr FeatureBitset ImpliedFeaturesXSAVEC = FeatureXSAVE;
	static constexpr FeatureBitset ImpliedFeaturesXSAVEOPT = FeatureXSAVE;
	static constexpr FeatureBitset ImpliedFeaturesXSAVES = FeatureXSAVE;

	// MMX->3DNOW->3DNOWA chain: each entry lists only its direct predecessor.
	static constexpr FeatureBitset ImpliedFeaturesMMX = {};
	static constexpr FeatureBitset ImpliedFeatures3DNOW = FeatureMMX;
	static constexpr FeatureBitset ImpliedFeatures3DNOWA = Feature3DNOW;

	// SSE/AVX/AVX512F chain. Each entry names only its direct predecessor(s);
	// transitive implications are resolved when the table is queried.
	static constexpr FeatureBitset ImpliedFeaturesSSE = {};
	static constexpr FeatureBitset ImpliedFeaturesSSE2 = FeatureSSE;
	static constexpr FeatureBitset ImpliedFeaturesSSE3 = FeatureSSE2;
	static constexpr FeatureBitset ImpliedFeaturesSSSE3 = FeatureSSE3;
	static constexpr FeatureBitset ImpliedFeaturesSSE4_1 = FeatureSSSE3;
	static constexpr FeatureBitset ImpliedFeaturesSSE4_2 = FeatureSSE4_1;
	static constexpr FeatureBitset ImpliedFeaturesAVX = FeatureSSE4_2;
	static constexpr FeatureBitset ImpliedFeaturesAVX2 = FeatureAVX;
	// AVX512F additionally pulls in F16C and FMA.
	static constexpr FeatureBitset ImpliedFeaturesAVX512F =
	    FeatureAVX2 | FeatureF16C | FeatureFMA;

	// Vector extensions that build on SSE or AVX.
	static constexpr FeatureBitset ImpliedFeaturesAES = FeatureSSE2;
	static constexpr FeatureBitset ImpliedFeaturesF16C = FeatureAVX;
	static constexpr FeatureBitset ImpliedFeaturesFMA = FeatureAVX;
	static constexpr FeatureBitset ImpliedFeaturesGFNI = FeatureSSE2;
	static constexpr FeatureBitset ImpliedFeaturesPCLMUL = FeatureSSE2;
	static constexpr FeatureBitset ImpliedFeaturesSHA = FeatureSSE2;
	static constexpr FeatureBitset ImpliedFeaturesVAES = FeatureAES | FeatureAVX;
	static constexpr FeatureBitset ImpliedFeaturesVPCLMULQDQ =
	    FeatureAVX | FeaturePCLMUL;

	// AVX512 features. The first group depends directly on the AVX512F
	// foundation.
	static constexpr FeatureBitset ImpliedFeaturesAVX512CD = FeatureAVX512F;
	static constexpr FeatureBitset ImpliedFeaturesAVX512BW = FeatureAVX512F;
	static constexpr FeatureBitset ImpliedFeaturesAVX512DQ = FeatureAVX512F;
	static constexpr FeatureBitset ImpliedFeaturesAVX512ER = FeatureAVX512F;
	static constexpr FeatureBitset ImpliedFeaturesAVX512PF = FeatureAVX512F;
	static constexpr FeatureBitset ImpliedFeaturesAVX512VL = FeatureAVX512F;

	// Second group: some of these depend on AVX512BW rather than on AVX512F
	// directly (AVX512F is then reached transitively through AVX512BW).
	static constexpr FeatureBitset ImpliedFeaturesAVX512BF16 = FeatureAVX512BW;
	static constexpr FeatureBitset ImpliedFeaturesAVX512BITALG = FeatureAVX512BW;
	static constexpr FeatureBitset ImpliedFeaturesAVX512IFMA = FeatureAVX512F;
	static constexpr FeatureBitset ImpliedFeaturesAVX512VNNI = FeatureAVX512F;
	static constexpr FeatureBitset ImpliedFeaturesAVX512VPOPCNTDQ = FeatureAVX512F;
	static constexpr FeatureBitset ImpliedFeaturesAVX512VBMI = FeatureAVX512BW;
	static constexpr FeatureBitset ImpliedFeaturesAVX512VBMI2 = FeatureAVX512BW;
	static constexpr FeatureBitset ImpliedFeaturesAVX512VP2INTERSECT =
	FeatureAVX512F;

	// FIXME: These two aren't really implemented and just exist in the feature
	// list for __builtin_cpu_supports. So omit their dependencies.
	static constexpr FeatureBitset ImpliedFeaturesAVX5124FMAPS = {};
	static constexpr FeatureBitset ImpliedFeaturesAVX5124VNNIW = {};

	// SSE4_A->FMA4->XOP chain. FMA4 needs both AVX and SSE4_A; XOP builds on
	// FMA4.
	static constexpr FeatureBitset ImpliedFeaturesSSE4_A = FeatureSSSE3;
	static constexpr FeatureBitset ImpliedFeaturesFMA4 = FeatureAVX | FeatureSSE4_A;
	static constexpr FeatureBitset ImpliedFeaturesXOP = FeatureFMA4;

	// AMX Features: the BF16 and INT8 extensions both depend on AMX_TILE.
	static constexpr FeatureBitset ImpliedFeaturesAMX_TILE = {};
	static constexpr FeatureBitset ImpliedFeaturesAMX_BF16 = FeatureAMX_TILE;
	static constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;

	// Feature table with X86::CPU_FEATURE_MAX entries. Each X86_FEATURE(ENUM,
	// STR) line in X86TargetParser.def expands to one initializer pairing the
	// feature's name STR with the matching ImpliedFeatures<ENUM> constant above,
	// so every feature listed in the .def file needs such a constant.
	static constexpr FeatureInfo FeatureInfos[X86::CPU_FEATURE_MAX] = {
	#define X86_FEATURE(ENUM, STR) {{STR}, ImpliedFeatures##ENUM},
	#include "llvm/Support/X86TargetParser.def"
	};

	// Append to Features the name of every feature whose bit is set in Bits.
	// Table entries with an empty name are skipped.
	static void getFeatureBitsAsStrings(const FeatureBitset &Bits,
	                                    SmallVectorImpl<StringRef> &Features) {
	  for (unsigned Idx = 0; Idx < CPU_FEATURE_MAX; ++Idx) {
	    if (!Bits[Idx])
	      continue;
	    const auto &Info = FeatureInfos[Idx];
	    if (Info.Name.empty())
	      continue;
	    Features.push_back(Info.Name);
	  }
	}

	// Append to EnabledFeatures the names of all features enabled for the
	// processor called CPU. The processor must exist in the Processors table;
	// this is only checked by an assertion.
	void llvm::X86::getFeaturesForCPU(StringRef CPU,
	                                  SmallVectorImpl<StringRef> &EnabledFeatures) {
	  const auto HasRequestedName = [&](const ProcInfo &P) {
	    return P.Name == CPU;
	  };
	  auto Entry = llvm::find_if(Processors, HasRequestedName);
	  assert(Entry != std::end(Processors) && "Processor not found!");

	  // Start from the processor's feature set, then drop the 64-bit feature:
	  // it is only used to validate whether a CPU can be used with 64-bit mode.
	  FeatureBitset Enabled = Entry->Features;
	  Enabled &= ~Feature64BIT;

	  // Report the remaining set bits by name.
	  getFeatureBitsAsStrings(Enabled, EnabledFeatures);
	}

	// For each feature that is (transitively) implied by this feature, set it.
	// Bits is updated in place; Implies holds the directly implied features.
	static void getImpliedEnabledFeatures(FeatureBitset &Bits,
	const FeatureBitset &Implies) {
	+ // Fast path: Implies is often empty.
	+ if (!Implies.any())
	+ return;
	+ FeatureBitset Prev;
	Bits |= Implies;
	- for (unsigned i = 0; i != CPU_FEATURE_MAX; ++i) {
	- if (Implies[i])
	- getImpliedEnabledFeatures(Bits, FeatureInfos[i].ImpliedFeatures);
	- }
	+ // Iterate to a fixed point instead of recursing: keep OR-ing in the
	+ // implications of every bit currently set in Bits until a full pass adds
	+ // nothing new. Note that this expands all bits already present in Bits,
	+ // not only those that came from Implies.
	+ do {
	+ Prev = Bits;
	+ for (unsigned i = CPU_FEATURE_MAX; i;)
	+ if (Bits[--i])
	+ Bits |= FeatureInfos[i].ImpliedFeatures;
	+ } while (Prev != Bits);
	}

	/// Create bit vector of features that are implied disabled if the feature
	/// passed in Value is disabled.
	static void getImpliedDisabledFeatures(FeatureBitset &Bits, unsigned Value) {
	// Check all features looking for any dependent on this feature. If we find
	// one, mark it and keep going until every feature that (transitively)
	// depends on it has been marked.
	- for (unsigned i = 0; i != CPU_FEATURE_MAX; ++i) {
	- if (FeatureInfos[i].ImpliedFeatures[Value]) {
	- Bits.set(i);
	- getImpliedDisabledFeatures(Bits, i);
	- }
	- }
	+ FeatureBitset Prev;
	+ // Seed the set with the disabled feature itself, so Value's own bit is
	+ // part of the result.
	+ Bits.set(Value);
	+ // Fixed-point iteration: a feature is marked as soon as any feature it
	+ // directly implies is already marked; stop when a pass changes nothing.
	+ do {
	+ Prev = Bits;
	+ for (unsigned i = 0; i != CPU_FEATURE_MAX; ++i)
	+ if ((FeatureInfos[i].ImpliedFeatures & Bits).any())
	+ Bits.set(i);
	+ } while (Prev != Bits);
	}

	// Append to ImpliedFeatures the names of the features implied by turning
	// Feature on (Enabled == true) or off (Enabled == false). Unknown feature
	// names leave ImpliedFeatures untouched.
	void llvm::X86::getImpliedFeatures(
	    StringRef Feature, bool Enabled,
	    SmallVectorImpl<StringRef> &ImpliedFeatures) {
	  auto Match = llvm::find_if(
	      FeatureInfos, [&](const FeatureInfo &FI) { return FI.Name == Feature; });
	  // FIXME: This shouldn't happen, but may not have all features in the table
	  // yet.
	  if (Match == std::end(FeatureInfos))
	    return;

	  FeatureBitset Implied;
	  if (!Enabled)
	    getImpliedDisabledFeatures(Implied,
	                               std::distance(std::begin(FeatureInfos), Match));
	  else
	    getImpliedEnabledFeatures(Implied, Match->ImpliedFeatures);

	  // Translate the collected bits back into feature names.
	  getFeatureBitsAsStrings(Implied, ImpliedFeatures);
	}
	diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
	index 4789a9f02937..83653dcbb8cf 100644
	--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
	+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
	@@ -1,3139 +1,3255 @@
	//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the AArch64 implementation of TargetFrameLowering class.
	//
	// On AArch64, stack frames are structured as follows:
	//
	// The stack grows downward.
	//
	// All of the individual frame areas on the frame below are optional, i.e. it's
	// possible to create a function so that the particular area isn't present
	// in the frame.
	//
	// At function entry, the "frame" looks as follows:
	//
	// |                                   | Higher address
	// |-----------------------------------|
	// |                                   |
	// | arguments passed on the stack     |
	// |                                   |
	// |-----------------------------------| <- sp
	// |                                   | Lower address
	//
	//
	// After the prologue has run, the frame has the following general structure.
	// Note that this doesn't depict the case where a red-zone is used. Also,
	// technically the last frame area (VLAs) doesn't get created until in the
	// main function body, after the prologue is run. However, it's depicted here
	// for completeness.
	//
	// |                                   | Higher address
	// |-----------------------------------|
	// |                                   |
	// | arguments passed on the stack     |
	// |                                   |
	// |-----------------------------------|
	// |                                   |
	// | (Win64 only) varargs from reg     |
	// |                                   |
	// |-----------------------------------|
	// |                                   |
	// | callee-saved gpr registers        | <--.
	// |                                   |    | On Darwin platforms these
	// |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
	// |                                   |    | (frame record first)
	// | prev_fp, prev_lr                  | <--'
	// | (a.k.a. "frame record")           |
	// |-----------------------------------| <- fp(=x29)
	// |                                   |
	// | callee-saved fp/simd/SVE regs     |
	// |                                   |
	// |-----------------------------------|
	// |                                   |
	// |        SVE stack objects          |
	// |                                   |
	// |-----------------------------------|
	// |.empty.space.to.make.part.below....|
	// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
	// |.the.standard.16-byte.alignment....|  compile time; if present)
	// |-----------------------------------|
	// |                                   |
	// | local variables of fixed size     |
	// | including spill slots             |
	// |-----------------------------------| <- bp(not defined by ABI,
	// |.variable-sized.local.variables....|       LLVM chooses X19)
	// |.(VLAs)............................| (size of this area is unknown at
	// |...................................|  compile time)
	// |-----------------------------------| <- sp
	// |                                   | Lower address
	//
	//
	// To access the data in a frame, at-compile time, a constant offset must be
	// computable from one of the pointers (fp, bp, sp) to access it. The size
	// of the areas with a dotted background cannot be computed at compile-time
	// if they are present, making it required to have all three of fp, bp and
	// sp to be set up to be able to access all contents in the frame areas,
	// assuming all of the frame areas are non-empty.
	//
	// For most functions, some of the frame areas are empty. For those functions,
	// it may not be necessary to set up fp or bp:
	// * A base pointer is definitely needed when there are both VLAs and local
	// variables with more-than-default alignment requirements.
	// * A frame pointer is definitely needed when there are local variables with
	// more-than-default alignment requirements.
	//
	// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
	// callee-saved area, since the unwind encoding does not allow for encoding
	// this dynamically and existing tools depend on this layout. For other
	// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
	// area to allow SVE stack objects (allocated directly below the callee-saves,
	// if available) to be accessed directly from the framepointer.
	// The SVE spill/fill instructions have VL-scaled addressing modes such
	// as:
	// ldr z8, [fp, #-7 mul vl]
	// For SVE the size of the vector length (VL) is not known at compile-time, so
	// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
	// layout, we don't need to add an unscaled offset to the framepointer before
	// accessing the SVE object in the frame.
	//
	// In some cases when a base pointer is not strictly needed, it is generated
	// anyway when offsets from the frame pointer to access local variables become
	// so large that the offset can't be encoded in the immediate fields of loads
	// or stores.
	//
	// FIXME: also explain the redzone concept.
	// FIXME: also explain the concept of reserved call frames.
	//
	//===----------------------------------------------------------------------===//

	#include "AArch64FrameLowering.h"
	#include "AArch64InstrInfo.h"
	#include "AArch64MachineFunctionInfo.h"
	#include "AArch64RegisterInfo.h"
	#include "AArch64StackOffset.h"
	#include "AArch64Subtarget.h"
	#include "AArch64TargetMachine.h"
	#include "MCTargetDesc/AArch64AddressingModes.h"
	#include "llvm/ADT/ScopeExit.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/CodeGen/LivePhysRegs.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/RegisterScavenging.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/CodeGen/WinEHFuncInfo.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/Function.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCDwarf.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	+#include "llvm/Support/LEB128.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <cassert>
	#include <cstdint>
	#include <iterator>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "frame-info"

	// Hidden command-line flag, off by default. canUseRedZone() refuses to use
	// the red zone unless this is set.
	static cl::opt<bool> EnableRedZone("aarch64-redzone",
	cl::desc("enable use of redzone on AArch64"),
	cl::init(false), cl::Hidden);

	// Hidden command-line flag, off by default, to reverse the order in which
	// callee-saved registers are restored.
	static cl::opt<bool>
	ReverseCSRRestoreSeq("reverse-csr-restore-seq",
	cl::desc("reverse the CSR restore sequence"),
	cl::init(false), cl::Hidden);

	// Hidden command-line flag, on by default, controlling whether settag
	// instructions are merged in the function epilog.
	static cl::opt<bool> StackTaggingMergeSetTag(
	"stack-tagging-merge-settag",
	cl::desc("merge settag instruction in function epilog"), cl::init(true),
	cl::Hidden);

	// Statistic counter for functions that end up using the red zone.
	STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");

	/// Returns the argument pop size for the return in \p MBB.
	///
	/// If the last non-debug instruction of \p MBB is one of the TCRETURN
	/// pseudos, the size is read from that instruction's stack-adjust operand
	/// (operand 1). Otherwise it is the value recorded in the function's
	/// AArch64FunctionInfo.
	static uint64_t getArgumentPopSize(MachineFunction &MF,
	                                   MachineBasicBlock &MBB) {
	  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
	  bool IsTailCallReturn = false;
	  if (MBB.end() != MBBI) {
	    unsigned RetOpcode = MBBI->getOpcode();
	    IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
	                       RetOpcode == AArch64::TCRETURNri ||
	                       RetOpcode == AArch64::TCRETURNriBTI;
	  }
	  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();

	  uint64_t ArgumentPopSize = 0;
	  if (IsTailCallReturn) {
	    MachineOperand &StackAdjust = MBBI->getOperand(1);

	    // For a tail-call in a callee-pops-arguments environment, some or all of
	    // the stack may actually be in use for the call's arguments, this is
	    // calculated during LowerCall and consumed here...
	    ArgumentPopSize = StackAdjust.getImm();
	  } else {
	    // ... otherwise the amount to pop is all of the argument space,
	    // conveniently stored in the MachineFunctionInfo by
	    // LowerFormalArguments. This will, of course, be zero for the C calling
	    // convention.
	    ArgumentPopSize = AFI->getArgumentStackToRestore();
	  }

	  return ArgumentPopSize;
	}

	/// This is the biggest offset to the stack pointer we can encode in aarch64
	/// instructions (without using a separate calculation and a temp register).
	/// Note that the exceptions here are vector stores/loads, which cannot encode
	/// any displacements (see estimateRSStackSizeLimit(),
	/// isAArch64FrameOffsetLegal()).
	static const unsigned DefaultSafeSPDisplacement = 255;

	/// Look at each instruction that references stack frames and return the stack
	/// size limit beyond which some of these instructions will require a scratch
	/// register during their expansion later.
	///
	/// Returns 0 as soon as a frame-index user is found whose offset cannot be
	/// updated at all, and DefaultSafeSPDisplacement otherwise.
	static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
	  // FIXME: For now, just conservatively guesstimate based on unscaled indexing
	  // range. We'll end up allocating an unnecessary spill slot a lot, but
	  // realistically that's not a big deal at this stage of the game.
	  for (MachineBasicBlock &MBB : MF) {
	    for (MachineInstr &MI : MBB) {
	      // Debug instructions, pseudos and the ADDXri/ADDSXri forms are not
	      // considered.
	      if (MI.isDebugInstr() || MI.isPseudo() ||
	          MI.getOpcode() == AArch64::ADDXri ||
	          MI.getOpcode() == AArch64::ADDSXri)
	        continue;

	      for (const MachineOperand &MO : MI.operands()) {
	        if (!MO.isFI())
	          continue;

	        StackOffset Offset;
	        if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
	            AArch64FrameOffsetCannotUpdate)
	          return 0;
	      }
	    }
	  }
	  return DefaultSafeSPDisplacement;
	}

	/// Returns the stack ID used for scalable-vector stack objects on AArch64,
	/// which is always TargetStackID::SVEVector.
	TargetStackID::Value
	AArch64FrameLowering::getStackIDForScalableVectors() const {
	return TargetStackID::SVEVector;
	}

	/// Returns the size of the fixed object area (allocated next to sp on entry)
	/// On Win64 this may include a var args area and an UnwindHelp object for EH.
	///
	/// The result is 0 unless \p IsWin64 is set and \p IsFunclet is clear;
	/// otherwise it is the combined size rounded up to a multiple of 16.
	static unsigned getFixedObjectSize(const MachineFunction &MF,
	                                   const AArch64FunctionInfo *AFI, bool IsWin64,
	                                   bool IsFunclet) {
	  // Only Win64 uses fixed objects, and then only for the function (not
	  // funclets)
	  if (!IsWin64 || IsFunclet)
	    return 0;

	  // Var args are stored here in the primary function.
	  const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
	  // To support EH funclets we allocate an UnwindHelp object
	  const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
	  return alignTo(VarArgsArea + UnwindHelpObject, 16);
	}

	/// Returns the size of the entire SVE stackframe (calleesaves + spills),
	/// expressed as a StackOffset in units of MVT::nxv1i8.
	static StackOffset getSVEStackSize(const MachineFunction &MF) {
	  const auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	  const auto SVEBytes = static_cast<int64_t>(FuncInfo->getStackSizeSVE());
	  return {SVEBytes, MVT::nxv1i8};
	}

	/// Returns true if \p MF may use the red zone. This requires the
	/// -aarch64-redzone flag, no NoRedZone attribute, no calls, no frame pointer,
	/// at most 128 bytes of local stack and no SVE stack area.
	bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
	  if (!EnableRedZone)
	    return false;
	  // Don't use the red zone if the function explicitly asks us not to.
	  // This is typically used for kernel code.
	  if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
	    return false;

	  const MachineFrameInfo &MFI = MF.getFrameInfo();
	  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	  uint64_t NumBytes = AFI->getLocalStackSize();

	  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
	           getSVEStackSize(MF));
	}

	/// hasFP - Return true if the specified function should have a dedicated frame
	/// pointer register.
	bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
	  const MachineFrameInfo &MFI = MF.getFrameInfo();
	  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
	  // Win64 EH requires a frame pointer if funclets are present, as the locals
	  // are accessed off the frame pointer in both the parent function and the
	  // funclets.
	  if (MF.hasEHFunclets())
	    return true;
	  // Retain behavior of always omitting the FP for leaf functions when possible.
	  if (MF.getTarget().Options.DisableFramePointerElim(MF))
	    return true;
	  // Variable-sized objects, a taken frame address, stack maps, patch points
	  // and stack realignment all force a frame pointer.
	  if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
	      MFI.hasStackMap() || MFI.hasPatchPoint() ||
	      RegInfo->needsStackRealignment(MF))
	    return true;
	  // With large callframes around we may need to use FP to access the scavenging
	  // emergency spillslot.
	  //
	  // Unfortunately some calls to hasFP() like machine verifier ->
	  // getReservedReg() -> hasFP in the middle of global isel are too early
	  // to know the max call frame size. Hopefully conservatively returning "true"
	  // in those cases is fine.
	  // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
	  if (!MFI.isMaxCallFrameSizeComputed() ||
	      MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
	    return true;

	  return false;
	}

	/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
	/// not required, we reserve argument space for call sites in the function
	/// immediately on entry to the current function. This eliminates the need for
	/// add/sub sp brackets around call sites. Returns true if the call frame is
	/// included as part of the stack frame.
	bool
	AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
	return !MF.getFrameInfo().hasVarSizedObjects();
	}

	/// Replace the call-frame setup/destroy pseudo at \p I with the SP adjustment
	/// it stands for (if any), erase the pseudo and return the iterator produced
	/// by the erase.
	MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
	    MachineFunction &MF, MachineBasicBlock &MBB,
	    MachineBasicBlock::iterator I) const {
	  const AArch64InstrInfo *TII =
	      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
	  DebugLoc DL = I->getDebugLoc();
	  const bool IsDestroy = I->getOpcode() == TII->getCallFrameDestroyOpcode();

	  // Only the destroy pseudo carries a callee-pop amount (operand 1).
	  uint64_t CalleePopAmount = 0;
	  if (IsDestroy)
	    CalleePopAmount = I->getOperand(1).getImm();

	  if (hasReservedCallFrame(MF)) {
	    if (CalleePopAmount != 0) {
	      // If the calling convention demands that the callee pops arguments from
	      // the stack, we want to add it back if we have a reserved call frame.
	      assert(CalleePopAmount < 0xffffff && "call frame too large");
	      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
	                      {-(int64_t)CalleePopAmount, MVT::i8}, TII);
	    }
	  } else {
	    // Setup moves SP down by the aligned amount, destroy moves it back up.
	    int64_t Amount = I->getOperand(0).getImm();
	    Amount = alignTo(Amount, getStackAlign());
	    if (!IsDestroy)
	      Amount = -Amount;

	    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but
	    // it doesn't have to pop anything), then the first operand will be zero
	    // too so this adjustment is a no-op.
	    if (CalleePopAmount == 0) {
	      // FIXME: in-function stack adjustment for calls is limited to 24-bits
	      // because there's no guaranteed temporary register available.
	      //
	      // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
	      // 1) For offset <= 12-bit, we use LSL #0
	      // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
	      // LSL #0, and the other uses LSL #12.
	      //
	      // Most call frames will be allocated at the start of a function so
	      // this is OK, but it is a limitation that needs dealing with.
	      assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
	      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, {Amount, MVT::i8},
	                      TII);
	    }
	  }
	  return MBB.erase(I);
	}

	/// Decide from the "sign-return-address" function attribute whether the
	/// return address of \p MF should be signed:
	///  - attribute absent or "none": no;
	///  - "all": yes;
	///  - "non-leaf": only if LR is among the callee-saved registers recorded in
	///    the frame info, i.e. the function spills LR.
	static bool ShouldSignReturnAddress(MachineFunction &MF) {
	  const Function &F = MF.getFunction();
	  if (!F.hasFnAttribute("sign-return-address"))
	    return false;

	  const StringRef Scope =
	      F.getFnAttribute("sign-return-address").getValueAsString();
	  if (Scope.equals("none"))
	    return false;
	  if (Scope.equals("all"))
	    return true;

	  assert(Scope.equals("non-leaf") && "Expected all, none or non-leaf");

	  bool SpillsLR = false;
	  for (const auto &Saved : MF.getFrameInfo().getCalleeSavedInfo()) {
	    if (Saved.getReg() == AArch64::LR) {
	      SpillsLR = true;
	      break;
	    }
	  }
	  return SpillsLR;
	}

	+// Convenience function to create a DWARF expression for
	+// Expr + NumBytes + NumVGScaledBytes * AArch64::VG
	+//
	+// The operations are appended to Expr, so Expr must already push the base
	+// value being offset. VG is the DWARF register number to scale by. A matching
	+// human-readable " +/- N" / " +/- N * VG" suffix is streamed to Comment.
	+// Terms whose coefficient is zero are omitted from both.
	+static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr,
	+ int NumBytes, int NumVGScaledBytes, unsigned VG,
	+ llvm::raw_string_ostream &Comment) {
	+ // Scratch space for the LEB128 encodings below.
	+ uint8_t buffer[16];
	+
	+ if (NumBytes) {
	+ // consts <NumBytes>; plus
	+ Expr.push_back(dwarf::DW_OP_consts);
	+ Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
	+ Expr.push_back((uint8_t)dwarf::DW_OP_plus);
	+ Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
	+ }
	+
	+ if (NumVGScaledBytes) {
	+ // consts <NumVGScaledBytes>; bregx <VG>, 0; mul; plus
	+ Expr.push_back((uint8_t)dwarf::DW_OP_consts);
	+ Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
	+
	+ Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
	+ Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
	+ // Offset operand of DW_OP_bregx: zero.
	+ Expr.push_back(0);
	+
	+ Expr.push_back((uint8_t)dwarf::DW_OP_mul);
	+ Expr.push_back((uint8_t)dwarf::DW_OP_plus);
	+
	+ Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
	+ << std::abs(NumVGScaledBytes) << " * VG";
	+ }
	+}
	+
	+// Creates an MCCFIInstruction:
	+// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
	+// where expr computes SP + OffsetFromSP, including any VG-scaled part of the
	+// offset. The instruction is emitted as an escape with a readable comment of
	+// the form "sp + N + M * VG".
	+MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP(
	+ const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const {
	+ int64_t NumBytes, NumVGScaledBytes;
	+ OffsetFromSP.getForDwarfOffset(NumBytes, NumVGScaledBytes);
	+
	+ std::string CommentBuffer = "sp";
	+ llvm::raw_string_ostream Comment(CommentBuffer);
	+
	+ // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG)
	+ SmallString<64> Expr;
	+ Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + /*SP*/ 31));
	+ // Offset operand of the DW_OP_breg: zero.
	+ Expr.push_back(0);
	+ appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
	+ TRI.getDwarfRegNum(AArch64::VG, true), Comment);
	+
	+ // Wrap this into DW_CFA_def_cfa_expression.
	+ SmallString<64> DefCfaExpr;
	+ DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
	+ uint8_t buffer[16];
	+ DefCfaExpr.append(buffer,
	+ buffer + encodeULEB128(Expr.size(), buffer));
	+ DefCfaExpr.append(Expr.str());
	+ return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(),
	+ Comment.str());
	+}
	+
	+// Creates an MCCFIInstruction describing where Reg is saved, at
	+// OffsetFromDefCFA from the CFA. Offsets without a VG-scaled part become a
	+// plain offset instruction; otherwise an escape is built as
	+// { DW_CFA_expression, ULEB128 (reg), ULEB128 (sizeof expr), expr }.
	+MCCFIInstruction AArch64FrameLowering::createCfaOffset(
	+ const TargetRegisterInfo &TRI, unsigned Reg,
	+ const StackOffset &OffsetFromDefCFA) const {
	+ int64_t NumBytes, NumVGScaledBytes;
	+ OffsetFromDefCFA.getForDwarfOffset(NumBytes, NumVGScaledBytes);
	+
	+ unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
	+
	+ // Non-scalable offsets can use DW_CFA_offset directly.
	+ if (!NumVGScaledBytes)
	+ return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
	+
	+ // Readable comment attached to the escape: "<reg> @ cfa +/- ...".
	+ std::string CommentBuffer;
	+ llvm::raw_string_ostream Comment(CommentBuffer);
	+ Comment << printReg(Reg, &TRI) << " @ cfa";
	+
	+ // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
	+ SmallString<64> OffsetExpr;
	+ appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
	+ TRI.getDwarfRegNum(AArch64::VG, true), Comment);
	+
	+ // Wrap this into DW_CFA_expression
	+ SmallString<64> CfaExpr;
	+ CfaExpr.push_back(dwarf::DW_CFA_expression);
	+ uint8_t buffer[16];
	+ CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
	+ CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
	+ CfaExpr.append(OffsetExpr.str());
	+
	+ return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str());
	+}
	+
	// Emit, before MBBI, one CFI_INSTRUCTION (flagged FrameSetup) per
	// callee-saved register that needs CFI, describing where it was saved.
	void AArch64FrameLowering::emitCalleeSavedFrameMoves(
	MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
	MachineFunction &MF = *MBB.getParent();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	const TargetSubtargetInfo &STI = MF.getSubtarget();
	- const MCRegisterInfo *MRI = STI.getRegisterInfo();
	+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
	const TargetInstrInfo *TII = STI.getInstrInfo();
	DebugLoc DL = MBB.findDebugLoc(MBBI);

	// Add callee saved registers to move list.
	const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
	if (CSI.empty())
	return;

	for (const auto &Info : CSI) {
	unsigned Reg = Info.getReg();
	- int64_t Offset =
	- MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
	- unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
	- unsigned CFIIndex = MF.addFrameInst(
	- MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
	+
	+ // Not all unwinders may know about SVE registers, so assume the lowest
	+ // common denominator.
	+ // regNeedsCFI may substitute a different register (NewReg) to describe;
	+ // registers it rejects get no CFI at all.
	+ unsigned NewReg;
	+ if (static_cast<const AArch64RegisterInfo *>(TRI)->regNeedsCFI(Reg, NewReg))
	+ Reg = NewReg;
	+ else
	+ continue;
	+
	+ // Objects in the SVE stack region get a scalable offset with the
	+ // callee-saved stack size subtracted as fixed bytes; all other objects
	+ // get a fixed byte offset relative to the local area.
	+ StackOffset Offset;
	+ if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::SVEVector) {
	+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	+ Offset = StackOffset(MFI.getObjectOffset(Info.getFrameIdx()), MVT::nxv1i8) -
	+ StackOffset(AFI->getCalleeSavedStackSize(MFI), MVT::i8);
	+ } else {
	+ Offset = {MFI.getObjectOffset(Info.getFrameIdx()) -
	+ getOffsetOfLocalArea(),
	+ MVT::i8};
	+ }
	+ unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset));
	BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
	.addCFIIndex(CFIIndex)
	.setMIFlags(MachineInstr::FrameSetup);
	}
	}

	// Pick a scratch register usable at the start of the prologue to re-align
	// the stack pointer, or AArch64::NoRegister if none is free.
	//
	// Callee-save registers are never chosen: they may look free when this is
	// called from canUseAsPrologue (during shrink wrapping) and then no longer
	// be free when it is called again from emitPrologue.
	//
	// FIXME: This is a bit conservative, since in the above case we could use one
	// of the callee-save registers as a scratch temp to re-align the stack pointer,
	// but we would then have to make sure that we were in fact saving at least one
	// callee-save register in the prologue, which is additional complexity that
	// doesn't seem worth the benefit.
	static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
	  MachineFunction *MF = MBB->getParent();

	  // The entry block always uses X9.
	  if (MBB == &MF->front())
	    return AArch64::X9;

	  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
	  const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();

	  // Start from the block's live-ins ...
	  LivePhysRegs LiveRegs(TRI);
	  LiveRegs.addLiveIns(*MBB);

	  // ... and treat every callee-saved register as used so that none of them
	  // is picked. The list is terminated by a zero entry.
	  for (const MCPhysReg *CSReg = MF->getRegInfo().getCalleeSavedRegs(); *CSReg;
	       ++CSReg)
	    LiveRegs.addReg(*CSReg);

	  // Prefer X9 since it was historically used for the prologue scratch reg.
	  const MachineRegisterInfo &MRI = MF->getRegInfo();
	  if (LiveRegs.available(MRI, AArch64::X9))
	    return AArch64::X9;

	  // Otherwise take the first available 64-bit GPR, if any.
	  for (unsigned Candidate : AArch64::GPR64RegClass)
	    if (LiveRegs.available(MRI, Candidate))
	      return Candidate;

	  return AArch64::NoRegister;
	}

	bool AArch64FrameLowering::canUseAsPrologue(
	const MachineBasicBlock &MBB) const {
	const MachineFunction *MF = MBB.getParent();
	MachineBasicBlock TmpMBB = const_cast<MachineBasicBlock >(&MBB);
	const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
	const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

	// Don't need a scratch register if we're not going to re-align the stack.
	if (!RegInfo->needsStackRealignment(*MF))
	return true;
	// Otherwise, we can use any block as long as it has a scratch register
	// available.
	return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
	}

	/// Returns true if a stack of \p StackSizeInBytes needs a stack probe: the
	/// target must be Windows, the size must reach the probe threshold (4096
	/// unless overridden by the "stack-probe-size" attribute) and the function
	/// must not carry "no-stack-arg-probe".
	static bool windowsRequiresStackProbe(MachineFunction &MF,
	                                      uint64_t StackSizeInBytes) {
	  if (!MF.getSubtarget<AArch64Subtarget>().isTargetWindows())
	    return false;

	  const Function &F = MF.getFunction();
	  // TODO: When implementing stack protectors, take that into account
	  // for the probe threshold.
	  unsigned StackProbeSize = 4096;
	  if (F.hasFnAttribute("stack-probe-size")) {
	    StringRef Requested =
	        F.getFnAttribute("stack-probe-size").getValueAsString();
	    Requested.getAsInteger(0, StackProbeSize);
	  }

	  if (StackSizeInBytes < StackProbeSize)
	    return false;
	  return !F.hasFnAttribute("no-stack-arg-probe");
	}

	// Returns true when the callee-save area and the local area of \p MF can be
	// allocated with one combined SP adjustment of \p StackBumpBytes bytes.
	// Each guard below rules out one situation in which they must stay separate.
	bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
	    MachineFunction &MF, uint64_t StackBumpBytes) const {
	  // No locals: nothing to combine.
	  if (MF.getInfo<AArch64FunctionInfo>()->getLocalStackSize() == 0)
	    return false;

	  // 512 is the maximum immediate for stp/ldp that will be used for
	  // callee-save save/restores
	  if (StackBumpBytes >= 512)
	    return false;

	  // A bump that needs a Windows stack probe is not combined.
	  if (windowsRequiresStackProbe(MF, StackBumpBytes))
	    return false;

	  // Variable sized objects on the stack.
	  if (MF.getFrameInfo().hasVarSizedObjects())
	    return false;

	  // The stack has to be re-aligned.
	  const AArch64RegisterInfo *RegInfo =
	      MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
	  if (RegInfo->needsStackRealignment(MF))
	    return false;

	  // This isn't strictly necessary, but it simplifies things a bit since the
	  // current RedZone handling code assumes the SP is adjusted by the
	  // callee-save save/restore code.
	  if (canUseRedZone(MF))
	    return false;

	  // When there is an SVE area on the stack, always allocate the
	  // callee-saves and spills/locals separately.
	  if (getSVEStackSize(MF))
	    return false;

	  return true;
	}

	// Epilogue variant of shouldCombineCSRLocalStackBump(): on top of the
	// generic checks, the combined bump is rejected when the instruction found
	// by the backwards scan below is an MTE tag store.
	bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
	    MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
	  if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
	    return false;

	  if (MBB.empty())
	    return true;

	  // Disable combined SP bump if the last instruction is an MTE tag store. It
	  // is almost always better to merge SP adjustment into those instructions.
	  //
	  // Walk backwards from the first terminator, stepping over transient and
	  // FrameDestroy instructions, and stop at the first other instruction (or
	  // at the start of the block).
	  const MachineBasicBlock::iterator First = MBB.begin();
	  MachineBasicBlock::iterator Cursor = MBB.getFirstTerminator();
	  while (Cursor != First) {
	    --Cursor;
	    const bool KeepScanning = Cursor->isTransient() ||
	                              Cursor->getFlag(MachineInstr::FrameDestroy);
	    if (!KeepScanning)
	      break;
	  }

	  const unsigned Opc = Cursor->getOpcode();
	  const bool IsTagStore =
	      Opc == AArch64::STGloop || Opc == AArch64::STZGloop ||
	      Opc == AArch64::STGOffset || Opc == AArch64::STZGOffset ||
	      Opc == AArch64::ST2GOffset || Opc == AArch64::STZ2GOffset;
	  return !IsTagStore;
	}

	// Given a load or a store instruction, generate an appropriate unwinding SEH
	// code on Windows.
	//
	// \p MBBI is the save/restore instruction, \p TII supplies the SEH
	// pseudo-instruction descriptors and \p Flag is set on the new instruction.
	// The SEH pseudo is inserted right after \p MBBI and an iterator to it is
	// returned. Opcodes that are not listed below are a programming error.
	static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
	                                             const TargetInstrInfo &TII,
	                                             MachineInstr::MIFlag Flag) {
	  const unsigned Opcode = MBBI->getOpcode();
	  MachineBasicBlock *ParentBB = MBBI->getParent();
	  MachineFunction &MF = *ParentBB->getParent();
	  DebugLoc Loc = MBBI->getDebugLoc();
	  const AArch64RegisterInfo *RegInfo =
	      MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();

	  // The offset immediate is the last operand of the instruction.
	  int Offset = MBBI->getOperand(MBBI->getNumOperands() - 1).getImm();

	  // Post-indexed loads record the negated immediate, so that they share the
	  // SEH encoding of their pre-indexed store counterparts.
	  switch (Opcode) {
	  case AArch64::LDPDpost:
	  case AArch64::LDPXpost:
	  case AArch64::LDRDpost:
	  case AArch64::LDRXpost:
	    Offset = -Offset;
	    break;
	  default:
	    break;
	  }

	  MachineInstrBuilder SEH;
	  switch (Opcode) {
	  default:
	    llvm_unreachable("No SEH Opcode for this instruction");

	  // Pre/post-indexed FP register pair.
	  case AArch64::LDPDpost:
	  case AArch64::STPDpre: {
	    unsigned First = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
	    unsigned Second = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
	    SEH = BuildMI(MF, Loc, TII.get(AArch64::SEH_SaveFRegP_X))
	              .addImm(First)
	              .addImm(Second)
	              .addImm(Offset * 8)
	              .setMIFlag(Flag);
	    break;
	  }

	  // Pre/post-indexed GPR pair; the FP/LR pair has its own SEH opcode.
	  case AArch64::LDPXpost:
	  case AArch64::STPXpre: {
	    Register First = MBBI->getOperand(1).getReg();
	    Register Second = MBBI->getOperand(2).getReg();
	    if (First == AArch64::FP && Second == AArch64::LR) {
	      SEH = BuildMI(MF, Loc, TII.get(AArch64::SEH_SaveFPLR_X))
	                .addImm(Offset * 8)
	                .setMIFlag(Flag);
	    } else {
	      SEH = BuildMI(MF, Loc, TII.get(AArch64::SEH_SaveRegP_X))
	                .addImm(RegInfo->getSEHRegNum(First))
	                .addImm(RegInfo->getSEHRegNum(Second))
	                .addImm(Offset * 8)
	                .setMIFlag(Flag);
	    }
	    break;
	  }

	  // Pre/post-indexed single FP register; the immediate is used as is.
	  case AArch64::LDRDpost:
	  case AArch64::STRDpre: {
	    unsigned Saved = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
	    SEH = BuildMI(MF, Loc, TII.get(AArch64::SEH_SaveFReg_X))
	              .addImm(Saved)
	              .addImm(Offset)
	              .setMIFlag(Flag);
	    break;
	  }

	  // Pre/post-indexed single GPR; the immediate is used as is.
	  case AArch64::LDRXpost:
	  case AArch64::STRXpre: {
	    unsigned Saved = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
	    SEH = BuildMI(MF, Loc, TII.get(AArch64::SEH_SaveReg_X))
	              .addImm(Saved)
	              .addImm(Offset)
	              .setMIFlag(Flag);
	    break;
	  }

	  // Offset-addressed FP register pair.
	  case AArch64::STPDi:
	  case AArch64::LDPDi: {
	    unsigned First = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
	    unsigned Second = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
	    SEH = BuildMI(MF, Loc, TII.get(AArch64::SEH_SaveFRegP))
	              .addImm(First)
	              .addImm(Second)
	              .addImm(Offset * 8)
	              .setMIFlag(Flag);
	    break;
	  }

	  // Offset-addressed GPR pair; the FP/LR pair has its own SEH opcode.
	  case AArch64::STPXi:
	  case AArch64::LDPXi: {
	    Register First = MBBI->getOperand(0).getReg();
	    Register Second = MBBI->getOperand(1).getReg();
	    if (First == AArch64::FP && Second == AArch64::LR) {
	      SEH = BuildMI(MF, Loc, TII.get(AArch64::SEH_SaveFPLR))
	                .addImm(Offset * 8)
	                .setMIFlag(Flag);
	    } else {
	      SEH = BuildMI(MF, Loc, TII.get(AArch64::SEH_SaveRegP))
	                .addImm(RegInfo->getSEHRegNum(First))
	                .addImm(RegInfo->getSEHRegNum(Second))
	                .addImm(Offset * 8)
	                .setMIFlag(Flag);
	    }
	    break;
	  }

	  // Offset-addressed single GPR.
	  case AArch64::STRXui:
	  case AArch64::LDRXui: {
	    int Saved = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
	    SEH = BuildMI(MF, Loc, TII.get(AArch64::SEH_SaveReg))
	              .addImm(Saved)
	              .addImm(Offset * 8)
	              .setMIFlag(Flag);
	    break;
	  }

	  // Offset-addressed single FP register.
	  case AArch64::STRDui:
	  case AArch64::LDRDui: {
	    unsigned Saved = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
	    SEH = BuildMI(MF, Loc, TII.get(AArch64::SEH_SaveFReg))
	              .addImm(Saved)
	              .addImm(Offset * 8)
	              .setMIFlag(Flag);
	    break;
	  }
	  }

	  return ParentBB->insertAfter(MBBI, SEH);
	}

	// Fix up the SEH opcode associated with the save/restore instruction.
	//
	// Adds \p LocalStackSize to the last operand (the offset) of the SEH
	// pseudo-instruction at \p MBBI. Only the offset-addressed save opcodes
	// listed below are accepted; any other opcode is a programming error.
	static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
	                           unsigned LocalStackSize) {
	  switch (MBBI->getOpcode()) {
	  case AArch64::SEH_SaveFPLR:
	  case AArch64::SEH_SaveRegP:
	  case AArch64::SEH_SaveReg:
	  case AArch64::SEH_SaveFRegP:
	  case AArch64::SEH_SaveFReg:
	    break;
	  default:
	    llvm_unreachable("Fix the offset in the SEH instruction");
	  }

	  MachineOperand &OffsetOp = MBBI->getOperand(MBBI->getNumOperands() - 1);
	  OffsetOp.setImm(OffsetOp.getImm() + LocalStackSize);
	}

	// Convert callee-save register save/restore instruction to do stack pointer
	// decrement/increment to allocate/deallocate the callee-save stack area by
	// converting store/load to use pre/post increment version.
	//
	// \p MBBI          Save/restore instruction to convert. Leading shadow call
	//                  stack instructions and CFI instructions are skipped.
	// \p CSStackSizeInc Byte amount for the SP write-back; it must be a multiple
	//                  of the scale of the chosen opcode.
	// \p NeedsWinCFI   When set, the SEH opcode following the old instruction is
	//                  erased and a new one is emitted for the new instruction.
	// \p HasWinCFI     Set to true whenever a SEH opcode is emitted.
	// \p InProlog      Selects FrameSetup (true) or FrameDestroy (false) as the
	//                  flag on the emitted SEH opcode.
	//
	// Returns an iterator to the instruction just before the erased original:
	// the new SEH opcode if one was inserted, otherwise the new pre/post-indexed
	// instruction.
	static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
	    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
	    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
	    bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
	  // Ignore instructions that do not operate on SP, i.e. shadow call stack
	  // instructions and associated CFI instruction.
	  while (MBBI->getOpcode() == AArch64::STRXpost ||
	         MBBI->getOpcode() == AArch64::LDRXpre ||
	         MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
	    if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
	      assert(MBBI->getOperand(0).getReg() != AArch64::SP);
	    ++MBBI;
	  }

	  // Pick the pre-indexed store / post-indexed load matching the original
	  // opcode. Pair forms take a scaled immediate; for the single-register
	  // forms Scale stays 1.
	  unsigned NewOpc;
	  int Scale = 1;
	  switch (MBBI->getOpcode()) {
	  default:
	    llvm_unreachable("Unexpected callee-save save/restore opcode!");
	  case AArch64::STPXi:
	    NewOpc = AArch64::STPXpre;
	    Scale = 8;
	    break;
	  case AArch64::STPDi:
	    NewOpc = AArch64::STPDpre;
	    Scale = 8;
	    break;
	  case AArch64::STPQi:
	    NewOpc = AArch64::STPQpre;
	    Scale = 16;
	    break;
	  case AArch64::STRXui:
	    NewOpc = AArch64::STRXpre;
	    break;
	  case AArch64::STRDui:
	    NewOpc = AArch64::STRDpre;
	    break;
	  case AArch64::STRQui:
	    NewOpc = AArch64::STRQpre;
	    break;
	  case AArch64::LDPXi:
	    NewOpc = AArch64::LDPXpost;
	    Scale = 8;
	    break;
	  case AArch64::LDPDi:
	    NewOpc = AArch64::LDPDpost;
	    Scale = 8;
	    break;
	  case AArch64::LDPQi:
	    NewOpc = AArch64::LDPQpost;
	    Scale = 16;
	    break;
	  case AArch64::LDRXui:
	    NewOpc = AArch64::LDRXpost;
	    break;
	  case AArch64::LDRDui:
	    NewOpc = AArch64::LDRDpost;
	    break;
	  case AArch64::LDRQui:
	    NewOpc = AArch64::LDRQpost;
	    break;
	  }
	  // Get rid of the SEH code associated with the old instruction.
	  if (NeedsWinCFI) {
	    auto SEH = std::next(MBBI);
	    if (AArch64InstrInfo::isSEHInstruction(*SEH))
	      SEH->eraseFromParent();
	  }

	  // The new instruction goes in front of the old one and additionally
	  // defines SP (the write-back result).
	  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
	  MIB.addReg(AArch64::SP, RegState::Define);

	  // Copy all operands other than the immediate offset.
	  unsigned OpndIdx = 0;
	  for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
	       ++OpndIdx)
	    MIB.add(MBBI->getOperand(OpndIdx));

	  assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
	         "Unexpected immediate offset in first/last callee-save save/restore "
	         "instruction!");
	  assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
	         "Unexpected base register in callee-save save/restore instruction!");
	  assert(CSStackSizeInc % Scale == 0);
	  MIB.addImm(CSStackSizeInc / Scale);

	  MIB.setMIFlags(MBBI->getFlags());
	  MIB.setMemRefs(MBBI->memoperands());

	  // Generate a new SEH code that corresponds to the new instruction.
	  // InsertSEH() takes the instruction descriptor table by reference, so TII
	  // has to be dereferenced here.
	  if (NeedsWinCFI) {
	    *HasWinCFI = true;
	    InsertSEH(*MIB, *TII,
	              InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
	  }

	  return std::prev(MBB.erase(MBBI));
	}

	// Fixup callee-save register save/restore instructions to take into account
	// combined SP bump by adding the local stack size to the stack offsets.
	//
	// \p MI is left untouched when it is a SEH instruction, a CFI instruction or
	// a shadow call stack instruction. Otherwise its (scaled) offset operand is
	// increased by \p LocalStackSize and, if \p NeedsWinCFI is set, the SEH
	// opcode that follows it is adjusted too and \p HasWinCFI is set to true.
	static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
	                                              uint64_t LocalStackSize,
	                                              bool NeedsWinCFI,
	                                              bool *HasWinCFI) {
	  if (AArch64InstrInfo::isSEHInstruction(MI))
	    return;

	  unsigned Scale;
	  switch (MI.getOpcode()) {
	  // Ignore instructions that do not operate on SP, i.e. shadow call stack
	  // instructions and associated CFI instruction.
	  case AArch64::STRXpost:
	  case AArch64::LDRXpre:
	    assert(MI.getOperand(0).getReg() != AArch64::SP);
	    return;
	  case AArch64::CFI_INSTRUCTION:
	    return;

	  // 8-byte scaled offsets.
	  case AArch64::STPXi:
	  case AArch64::STRXui:
	  case AArch64::STPDi:
	  case AArch64::STRDui:
	  case AArch64::LDPXi:
	  case AArch64::LDRXui:
	  case AArch64::LDPDi:
	  case AArch64::LDRDui:
	    Scale = 8;
	    break;

	  // 16-byte scaled offsets.
	  case AArch64::STPQi:
	  case AArch64::STRQui:
	  case AArch64::LDPQi:
	  case AArch64::LDRQui:
	    Scale = 16;
	    break;

	  default:
	    llvm_unreachable("Unexpected callee-save save/restore opcode!");
	  }

	  // Last operand is immediate offset that needs fixing; the operand before
	  // it is expected to be the SP base register.
	  const unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
	  assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
	         "Unexpected base register in callee-save save/restore instruction!");
	  MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
	  // All generated opcodes have scaled offsets.
	  assert(LocalStackSize % Scale == 0);
	  OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);

	  if (!NeedsWinCFI)
	    return;

	  // The instruction right after MI must be its SEH opcode; shift that too.
	  *HasWinCFI = true;
	  auto SEHIt = std::next(MachineBasicBlock::iterator(MI));
	  assert(SEHIt != MI.getParent()->end() && "Expecting a valid instruction");
	  assert(AArch64InstrInfo::isSEHInstruction(*SEHIt) &&
	         "Expecting a SEH instruction");
	  fixupSEHOpcode(SEHIt, LocalStackSize);
	}

	// Rearranges the callee-save restores so that the load-store optimizer can
	// fold the final SP adjustment into a post-indexed load.
	//
	// Sometimes (when we restore in the same order as we save), we can end up
	// with code like this:
	//
	// ldp x26, x25, [sp]
	// ldp x24, x23, [sp, #16]
	// ldp x22, x21, [sp, #32]
	// ldp x20, x19, [sp, #48]
	// add sp, sp, #64
	//
	// In this case, it is always better to put the first ldp at the end:
	//
	// ldp x24, x23, [sp, #16]
	// ldp x22, x21, [sp, #32]
	// ldp x20, x19, [sp, #48]
	// ldp x26, x25, [sp]
	// add sp, sp, #64
	//
	// so that the load-store optimizer can merge the last two instructions into:
	//
	// ldp x26, x25, [sp], #64
	//
	static void adaptForLdStOpt(MachineBasicBlock &MBB,
	                            MachineBasicBlock::iterator FirstSPPopI,
	                            MachineBasicBlock::iterator LastPopI) {
	  // The reordering is only done when ReverseCSRRestoreSeq is set.
	  if (!ReverseCSRRestoreSeq)
	    return;

	  // If we managed to grab the first pop instruction, move it to the end.
	  MBB.splice(FirstSPPopI, &MBB, LastPopI);
	}

	// Returns true when the return address of \p MF is to be signed with the A
	// key. That is the case when the "sign-return-address-key" attribute is
	// absent or equals "a_key" (compared case-insensitively); the only other
	// value expected is "b_key".
	static bool ShouldSignWithAKey(MachineFunction &MF) {
	  const Function &Fn = MF.getFunction();
	  if (Fn.hasFnAttribute("sign-return-address-key")) {
	    const StringRef KeyName =
	        Fn.getFnAttribute("sign-return-address-key").getValueAsString();
	    const bool IsAKey = KeyName.equals_lower("a_key");
	    assert(IsAKey || KeyName.equals_lower("b_key"));
	    return IsAKey;
	  }

	  // No explicit key requested: the A key is the default.
	  return true;
	}

	static bool needsWinCFI(const MachineFunction &MF) {
	const Function &F = MF.getFunction();
	return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
	F.needsUnwindTableEntry();
	}

	static bool isTargetDarwin(const MachineFunction &MF) {
	return MF.getSubtarget<AArch64Subtarget>().isTargetDarwin();
	}

	static bool isTargetWindows(const MachineFunction &MF) {
	return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
	}

	// Convenience function to determine whether I is an SVE callee save.
	// That is: one of the STR_ZXI / STR_PXI / LDR_ZXI / LDR_PXI opcodes that is
	// flagged as FrameSetup or FrameDestroy.
	static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
	  const unsigned Opc = I->getOpcode();
	  const bool IsSVELoadStore =
	      Opc == AArch64::STR_ZXI || Opc == AArch64::STR_PXI ||
	      Opc == AArch64::LDR_ZXI || Opc == AArch64::LDR_PXI;
	  if (!IsSVELoadStore)
	    return false;

	  return I->getFlag(MachineInstr::FrameSetup) ||
	         I->getFlag(MachineInstr::FrameDestroy);
	}

	// Emits the prologue of MF, inserting instructions at the start of MBB.
	//
	// In order, this function:
	//  - signs the return address when ShouldSignReturnAddress(MF) holds;
	//  - returns early for the GHC calling convention;
	//  - handles functions without a stack frame (red zone or one SP adjustment);
	//  - allocates the callee-save area, either combined with the locals or by
	//    turning the first callee-save store into a pre-indexed store;
	//  - sets up the frame pointer (not for funclets);
	//  - emits the Windows stack probe sequence when one is required;
	//  - allocates the SVE area and the remaining local area, re-aligning SP if
	//    needed;
	//  - sets up the base pointer (not for funclets);
	//  - emits the SEH prologue-end marker and, when frame moves are needed, the
	//    CFI instructions describing the frame.
	//
	// HasWinCFI is written back to MF by the scope-exit lambda on every return.
	void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
	MachineBasicBlock &MBB) const {
	MachineBasicBlock::iterator MBBI = MBB.begin();
	const MachineFrameInfo &MFI = MF.getFrameInfo();
	const Function &F = MF.getFunction();
	const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
	const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	MachineModuleInfo &MMI = MF.getMMI();
	AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	// DWARF frame moves are only emitted when Windows CFI is not in use.
	bool needsFrameMoves =
	MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
	bool HasFP = hasFP(MF);
	bool NeedsWinCFI = needsWinCFI(MF);
	bool HasWinCFI = false;
	// Publish the final HasWinCFI value on every exit path of this function.
	auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });

	bool IsFunclet = MBB.isEHFuncletEntry();

	// At this point, we're going to decide whether or not the function uses a
	// redzone. In most cases, the function doesn't have a redzone so let's
	// assume that's false and set it to true in the case that there's a redzone.
	AFI->setHasRedZone(false);

	// Debug location must be unknown since the first debug location is used
	// to determine the end of the prologue.
	DebugLoc DL;

	// Return address signing: PACIASP for the A key, EMITBKEY + PACIBSP for the
	// B key, followed by a CFI instruction negating the RA state.
	if (ShouldSignReturnAddress(MF)) {
	if (ShouldSignWithAKey(MF))
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP))
	.setMIFlag(MachineInstr::FrameSetup);
	else {
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
	.setMIFlag(MachineInstr::FrameSetup);
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIBSP))
	.setMIFlag(MachineInstr::FrameSetup);
	}

	unsigned CFIIndex =
	MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
	BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
	.addCFIIndex(CFIIndex)
	.setMIFlags(MachineInstr::FrameSetup);
	}

	// All calls are tail calls in GHC calling conv, and functions have no
	// prologue/epilogue.
	if (MF.getFunction().getCallingConv() == CallingConv::GHC)
	return;

	// Set tagged base pointer to the bottom of the stack frame.
	// Ideally it should match SP value after prologue.
	AFI->setTaggedBasePointerOffset(MFI.getStackSize());

	const StackOffset &SVEStackSize = getSVEStackSize(MF);

	// getStackSize() includes all the locals in its size calculation. We don't
	// include these locals when computing the stack size of a funclet, as they
	// are allocated in the parent's stack frame and accessed via the frame
	// pointer from the funclet. We only save the callee saved registers in the
	// funclet, which are really the callee saved registers of the parent
	// function, including the funclet.
	int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
	: MFI.getStackSize();
	// Simple case: no stack frame and no Windows stack probe needed. The whole
	// allocation is for locals and this branch always returns.
	if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
	assert(!HasFP && "unexpected function without stack frame but with FP");
	assert(!SVEStackSize &&
	"unexpected function without stack frame but with SVE objects");
	// All of the stack allocation is for locals.
	AFI->setLocalStackSize(NumBytes);
	if (!NumBytes)
	return;
	// REDZONE: If the stack size is less than 128 bytes, we don't need
	// to actually allocate.
	if (canUseRedZone(MF)) {
	AFI->setHasRedZone(true);
	++NumRedZoneFunctions;
	} else {
	emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
	{-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup,
	false, NeedsWinCFI, &HasWinCFI);
	if (!NeedsWinCFI && needsFrameMoves) {
	// Label used to tie together the PROLOG_LABEL and the MachineMoves.
	MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
	// Encode the stack size of the leaf function.
	unsigned CFIIndex = MF.addFrameInst(
	MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
	BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
	.addCFIIndex(CFIIndex)
	.setMIFlags(MachineInstr::FrameSetup);
	}
	}

	if (NeedsWinCFI) {
	HasWinCFI = true;
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
	.setMIFlag(MachineInstr::FrameSetup);
	}

	return;
	}

	bool IsWin64 =
	Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
	unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);

	auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
	// All of the remaining stack allocations are for locals.
	AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
	// Either allocate everything with a single SP bump, or fold the callee-save
	// allocation into the first callee-save store. NumBytes afterwards holds
	// what is still left to allocate.
	bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
	if (CombineSPBump) {
	assert(!SVEStackSize && "Cannot combine SP bump with SVE");
	emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
	{-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false,
	NeedsWinCFI, &HasWinCFI);
	NumBytes = 0;
	} else if (PrologueSaveSize != 0) {
	MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
	MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
	NumBytes -= PrologueSaveSize;
	}
	assert(NumBytes >= 0 && "Negative stack allocation size!?");

	// Move past the saves of the callee-saved registers, fixing up the offsets
	// and pre-inc if we decided to combine the callee-save and local stack
	// pointer bump above.
	MachineBasicBlock::iterator End = MBB.end();
	while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
	!IsSVECalleeSave(MBBI)) {
	if (CombineSPBump)
	fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
	NeedsWinCFI, &HasWinCFI);
	++MBBI;
	}

	// For funclets the FP belongs to the containing function.
	if (!IsFunclet && HasFP) {
	// Only set up FP if we actually need to.
	int64_t FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0;

	if (CombineSPBump)
	FPOffset += AFI->getLocalStackSize();

	// Issue sub fp, sp, FPOffset or
	// mov fp,sp when FPOffset is zero.
	// Note: All stores of callee-saved registers are marked as "FrameSetup".
	// This code marks the instruction(s) that set the FP also.
	emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
	{FPOffset, MVT::i8}, TII, MachineInstr::FrameSetup, false,
	NeedsWinCFI, &HasWinCFI);
	}

	// Windows stack probe: load the allocation size in 16-byte units into X15,
	// call __chkstk, then subtract X15 << 4 from SP.
	if (windowsRequiresStackProbe(MF, NumBytes)) {
	uint64_t NumWords = NumBytes >> 4;
	if (NeedsWinCFI) {
	HasWinCFI = true;
	// alloc_l can hold at most 256MB, so assume that NumBytes doesn't
	// exceed this amount. We need to move at most 2^24 - 1 into x15.
	// This is at most two instructions, MOVZ followed by MOVK.
	// TODO: Fix to use multiple stack alloc unwind codes for stacks
	// exceeding 256MB in size.
	if (NumBytes >= (1 << 28))
	report_fatal_error("Stack size cannot exceed 256MB for stack "
	"unwinding purposes");

	uint32_t LowNumWords = NumWords & 0xFFFF;
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
	.addImm(LowNumWords)
	.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
	.setMIFlag(MachineInstr::FrameSetup);
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
	.setMIFlag(MachineInstr::FrameSetup);
	if ((NumWords & 0xFFFF0000) != 0) {
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
	.addReg(AArch64::X15)
	.addImm((NumWords & 0xFFFF0000) >> 16) // High half
	.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
	.setMIFlag(MachineInstr::FrameSetup);
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
	.setMIFlag(MachineInstr::FrameSetup);
	}
	} else {
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
	.addImm(NumWords)
	.setMIFlags(MachineInstr::FrameSetup);
	}

	// The call to __chkstk is a direct BL for every code model except Large,
	// where the address is first materialized in X16 and called indirectly.
	switch (MF.getTarget().getCodeModel()) {
	case CodeModel::Tiny:
	case CodeModel::Small:
	case CodeModel::Medium:
	case CodeModel::Kernel:
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
	.addExternalSymbol("__chkstk")
	.addReg(AArch64::X15, RegState::Implicit)
	.addReg(AArch64::X16, RegState::Implicit \| RegState::Define \| RegState::Dead)
	.addReg(AArch64::X17, RegState::Implicit \| RegState::Define \| RegState::Dead)
	.addReg(AArch64::NZCV, RegState::Implicit \| RegState::Define \| RegState::Dead)
	.setMIFlags(MachineInstr::FrameSetup);
	if (NeedsWinCFI) {
	HasWinCFI = true;
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
	.setMIFlag(MachineInstr::FrameSetup);
	}
	break;
	case CodeModel::Large:
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
	.addReg(AArch64::X16, RegState::Define)
	.addExternalSymbol("__chkstk")
	.addExternalSymbol("__chkstk")
	.setMIFlags(MachineInstr::FrameSetup);
	if (NeedsWinCFI) {
	HasWinCFI = true;
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
	.setMIFlag(MachineInstr::FrameSetup);
	}

	BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
	.addReg(AArch64::X16, RegState::Kill)
	.addReg(AArch64::X15, RegState::Implicit \| RegState::Define)
	.addReg(AArch64::X16, RegState::Implicit \| RegState::Define \| RegState::Dead)
	.addReg(AArch64::X17, RegState::Implicit \| RegState::Define \| RegState::Dead)
	.addReg(AArch64::NZCV, RegState::Implicit \| RegState::Define \| RegState::Dead)
	.setMIFlags(MachineInstr::FrameSetup);
	if (NeedsWinCFI) {
	HasWinCFI = true;
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
	.setMIFlag(MachineInstr::FrameSetup);
	}
	break;
	}

	BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
	.addReg(AArch64::SP, RegState::Kill)
	.addReg(AArch64::X15, RegState::Kill)
	.addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
	.setMIFlags(MachineInstr::FrameSetup);
	if (NeedsWinCFI) {
	HasWinCFI = true;
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
	.addImm(NumBytes)
	.setMIFlag(MachineInstr::FrameSetup);
	}
	// The probe sequence allocated everything that was left.
	NumBytes = 0;
	}

	// By default the whole SVE area is allocated in one step before MBBI; when
	// there are SVE callee-saves it is split around them below.
	StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
	MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;

	// Process the SVE callee-saves to determine what space needs to be
	// allocated.
	if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
	// Find callee save instructions in frame.
	CalleeSavesBegin = MBBI;
	assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
	while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
	++MBBI;
	CalleeSavesEnd = MBBI;

	AllocateBefore = {CalleeSavedSize, MVT::nxv1i8};
	AllocateAfter = SVEStackSize - AllocateBefore;
	}

	// Allocate space for the callee saves (if any).
	emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
	-AllocateBefore, TII,
	MachineInstr::FrameSetup);

	// Finally allocate remaining SVE stack space.
	emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
	-AllocateAfter, TII,
	MachineInstr::FrameSetup);

	// Allocate space for the rest of the frame.
	if (NumBytes) {
	// Alignment is required for the parent frame, not the funclet
	const bool NeedsRealignment =
	!IsFunclet && RegInfo->needsStackRealignment(MF);
	// Destination of the SP adjustment: SP itself, or a scratch register when
	// the result still has to be aligned before it becomes the new SP.
	unsigned scratchSPReg = AArch64::SP;

	if (NeedsRealignment) {
	scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
	assert(scratchSPReg != AArch64::NoRegister);
	}

	// If we're a leaf function, try using the red zone.
	if (!canUseRedZone(MF))
	// FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
	// the correct value here, as NumBytes also includes padding bytes,
	// which shouldn't be counted here.
	emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
	{-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup,
	false, NeedsWinCFI, &HasWinCFI);

	if (NeedsRealignment) {
	const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
	assert(NrBitsToZero > 1);
	assert(scratchSPReg != AArch64::SP);

	// SUB X9, SP, NumBytes
	// -- X9 is temporary register, so shouldn't contain any live data here,
	// -- free to use. This is already produced by emitFrameOffset above.
	// AND SP, X9, 0b11111...0000
	// The logical immediates have a non-trivial encoding. The following
	// formula computes the encoded immediate with all ones but
	// NrBitsToZero zero bits as least significant bits.
	uint32_t andMaskEncoded = (1 << 12) // = N
	\| ((64 - NrBitsToZero) << 6) // immr
	\| ((64 - NrBitsToZero - 1) << 0); // imms

	BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
	.addReg(scratchSPReg, RegState::Kill)
	.addImm(andMaskEncoded);
	AFI->setStackRealigned(true);
	if (NeedsWinCFI) {
	HasWinCFI = true;
	// NOTE(review): the SEH allocation size is NumBytes masked with the
	// *encoded* logical immediate rather than a plain alignment mask --
	// confirm this is intended.
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
	.addImm(NumBytes & andMaskEncoded)
	.setMIFlag(MachineInstr::FrameSetup);
	}
	}
	}

	// If we need a base pointer, set it up here. It's whatever the value of the
	// stack pointer is at this point. Any variable size objects will be allocated
	// after this, so we can still use the base pointer to reference locals.
	//
	// FIXME: Clarify FrameSetup flags here.
	// Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
	// needed.
	// For funclets the BP belongs to the containing function.
	if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
	TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
	false);
	if (NeedsWinCFI) {
	HasWinCFI = true;
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
	.setMIFlag(MachineInstr::FrameSetup);
	}
	}

	// The very last FrameSetup instruction indicates the end of prologue. Emit a
	// SEH opcode indicating the prologue end.
	if (NeedsWinCFI && HasWinCFI) {
	BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
	.setMIFlag(MachineInstr::FrameSetup);
	}

	// SEH funclets are passed the frame pointer in X1. If the parent
	// function uses the base register, then the base register is used
	// directly, and is not retrieved from X1.
	if (IsFunclet && F.hasPersonalityFn()) {
	EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
	if (isAsynchronousEHPersonality(Per)) {
	BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
	.addReg(AArch64::X1)
	.setMIFlag(MachineInstr::FrameSetup);
	MBB.addLiveIn(AArch64::X1);
	}
	}

	// Emit the DWARF CFI describing the frame: first the CFA definition, then
	// the locations of the callee-saved registers.
	if (needsFrameMoves) {
	const DataLayout &TD = MF.getDataLayout();
	const int StackGrowth = isTargetDarwin(MF)
	? (2 * -TD.getPointerSize(0))
	: -AFI->getCalleeSavedStackSize();
	Register FramePtr = RegInfo->getFrameRegister(MF);
	// An example of the prologue:
	//
	// .globl __foo
	// .align 2
	// __foo:
	// Ltmp0:
	// .cfi_startproc
	// .cfi_personality 155, ___gxx_personality_v0
	// Leh_func_begin:
	// .cfi_lsda 16, Lexception33
	//
	// stp xa,bx, [sp, -#offset]!
	// ...
	// stp x28, x27, [sp, #offset-32]
	// stp fp, lr, [sp, #offset-16]
	// add fp, sp, #offset - 16
	// sub sp, sp, #1360
	//
	// The Stack:
	// +-------------------------------------------+
	// 10000 \| ........ \| ........ \| ........ \| ........ \|
	// 10004 \| ........ \| ........ \| ........ \| ........ \|
	// +-------------------------------------------+
	// 10008 \| ........ \| ........ \| ........ \| ........ \|
	// 1000c \| ........ \| ........ \| ........ \| ........ \|
	// +===========================================+
	// 10010 \| X28 Register \|
	// 10014 \| X28 Register \|
	// +-------------------------------------------+
	// 10018 \| X27 Register \|
	// 1001c \| X27 Register \|
	// +===========================================+
	// 10020 \| Frame Pointer \|
	// 10024 \| Frame Pointer \|
	// +-------------------------------------------+
	// 10028 \| Link Register \|
	// 1002c \| Link Register \|
	// +===========================================+
	// 10030 \| ........ \| ........ \| ........ \| ........ \|
	// 10034 \| ........ \| ........ \| ........ \| ........ \|
	// +-------------------------------------------+
	// 10038 \| ........ \| ........ \| ........ \| ........ \|
	// 1003c \| ........ \| ........ \| ........ \| ........ \|
	// +-------------------------------------------+
	//
	// [sp] = 10030 :: >>initial value<<
	// sp = 10020 :: stp fp, lr, [sp, #-16]!
	// fp = sp == 10020 :: mov fp, sp
	// [sp] == 10020 :: stp x28, x27, [sp, #-16]!
	// sp == 10010 :: >>final value<<
	//
	// The frame pointer (w29) points to address 10020. If we use an offset of
	// '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
	// for w27, and -32 for w28:
	//
	// Ltmp1:
	// .cfi_def_cfa w29, 16
	// Ltmp2:
	// .cfi_offset w30, -8
	// Ltmp3:
	// .cfi_offset w29, -16
	// Ltmp4:
	// .cfi_offset w27, -24
	// Ltmp5:
	// .cfi_offset w28, -32

	if (HasFP) {
	// Define the current CFA rule to use the provided FP.
	unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
	unsigned CFIIndex = MF.addFrameInst(
	MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - StackGrowth));
	BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
	.addCFIIndex(CFIIndex)
	.setMIFlags(MachineInstr::FrameSetup);
	} else {
	- // Encode the stack size of the leaf function.
	- unsigned CFIIndex = MF.addFrameInst(
	- MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
	+ unsigned CFIIndex;
	+ if (SVEStackSize) {
	+ const TargetSubtargetInfo &STI = MF.getSubtarget();
	+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
	+ StackOffset TotalSize =
	+ SVEStackSize + StackOffset((int64_t)MFI.getStackSize(), MVT::i8);
	+ CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize));
	+ } else {
	+ // Encode the stack size of the leaf function.
	+ CFIIndex = MF.addFrameInst(
	+ MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
	+ }
	BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
	.addCFIIndex(CFIIndex)
	.setMIFlags(MachineInstr::FrameSetup);
	}

	// Now emit the moves for whatever callee saved regs we have (including FP,
	// LR if those are saved).
	emitCalleeSavedFrameMoves(MBB, MBBI);
	}
	}

	// Inserts the return address authentication for \p MBB when \p MF signs its
	// return address; otherwise does nothing.
	//
	// With v8.3a and a RET_ReallyLR as first terminator, that return is replaced
	// by the combined RETAA/RETAB. In every other case an AUTIASP/AUTIBSP flagged
	// FrameDestroy is inserted in front of the first terminator. The A or B
	// variant is picked by ShouldSignWithAKey().
	static void InsertReturnAddressAuth(MachineFunction &MF,
	                                    MachineBasicBlock &MBB) {
	  if (!ShouldSignReturnAddress(MF))
	    return;

	  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

	  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
	  const bool HasTerminator = MBBI != MBB.end();
	  DebugLoc DL;
	  if (HasTerminator)
	    DL = MBBI->getDebugLoc();

	  const bool UseAKey = ShouldSignWithAKey(MF);

	  // The AUTIASP instruction assembles to a hint instruction before v8.3a so
	  // this instruction can safely used for any v8a architecture.
	  // From v8.3a onwards there are optimised authenticate LR and return
	  // instructions, namely RETA{A,B}, that can be used instead.
	  const bool CanFoldIntoReturn = Subtarget.hasV8_3aOps() && HasTerminator &&
	                                 MBBI->getOpcode() == AArch64::RET_ReallyLR;
	  if (!CanFoldIntoReturn) {
	    BuildMI(MBB, MBBI, DL,
	            TII->get(UseAKey ? AArch64::AUTIASP : AArch64::AUTIBSP))
	        .setMIFlag(MachineInstr::FrameDestroy);
	    return;
	  }

	  // Replace the plain return, keeping its implicit operands.
	  BuildMI(MBB, MBBI, DL, TII->get(UseAKey ? AArch64::RETAA : AArch64::RETAB))
	      .copyImplicitOps(*MBBI);
	  MBB.erase(MBBI);
	}

	// Returns true when \p MI is a funclet return, i.e. a CATCHRET or a
	// CLEANUPRET.
	static bool isFuncletReturnInstr(const MachineInstr &MI) {
	  const unsigned Opc = MI.getOpcode();
	  return Opc == AArch64::CATCHRET || Opc == AArch64::CLEANUPRET;
	}

	/// Emit the epilogue for \p MBB.
	///
	/// The function adjusts the callee-save restore sequence that is already in
	/// the block (instructions flagged FrameDestroy), emits the SP adjustments
	/// that release the local, SVE and callee-save areas, adds the Windows
	/// SEH_EpilogStart/SEH_EpilogEnd markers when WinCFI is needed, and, via a
	/// scope-exit, runs InsertReturnAddressAuth() on every path that leaves the
	/// function after the GHC early return.
	void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
	                                        MachineBasicBlock &MBB) const {
	  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL;
	  bool NeedsWinCFI = needsWinCFI(MF);
	  bool HasWinCFI = false;
	  bool IsFunclet = false;
	  // On every exit, record HasWinCFI on the function unless it is already set.
	  auto WinCFI = make_scope_exit([&]() {
	    if (!MF.hasWinCFI())
	      MF.setHasWinCFI(HasWinCFI);
	  });

	  if (MBB.end() != MBBI) {
	    DL = MBBI->getDebugLoc();
	    IsFunclet = isFuncletReturnInstr(*MBBI);
	  }

	  int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
	                               : MFI.getStackSize();
	  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();

	  // All calls are tail calls in GHC calling conv, and functions have no
	  // prologue/epilogue.
	  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
	    return;

	  // Initial and residual are named for consistency with the prologue. Note that
	  // in the epilogue, the residual adjustment is executed first.
	  uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB);

	  // The stack frame should be like below,
	  //
	  //      ----------------------                     ---
	  //      |                    |                      |
	  //      | BytesInStackArgArea|              CalleeArgStackSize
	  //      | (NumReusableBytes) |                (of tail call)
	  //      |                    |                     ---
	  //      |                    |                      |
	  //      ---------------------|        ---           |
	  //      |                    |         |            |
	  //      |   CalleeSavedReg   |         |            |
	  //      | (CalleeSavedStackSize)|      |            |
	  //      |                    |         |            |
	  //      ---------------------|         |         NumBytes
	  //      |                    |     StackSize  (StackAdjustUp)
	  //      |   LocalStackSize   |         |            |
	  //      | (covering callee   |         |            |
	  //      |   args)            |         |            |
	  //      |                    |         |            |
	  //      ----------------------        ---          ---
	  //
	  //      So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
	  //                  = StackSize + ArgumentPopSize
	  //
	  // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
	  // it as the 2nd argument of AArch64ISD::TC_RETURN.

	  // Runs on every return below this point.
	  auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); });

	  bool IsWin64 =
	      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
	  unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);

	  uint64_t AfterCSRPopSize = ArgumentPopSize;
	  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
	  // We cannot rely on the local stack size set in emitPrologue if the function
	  // has funclets, as funclets have different local stack size requirements, and
	  // the current value set in emitPrologue may be that of the containing
	  // function.
	  if (MF.hasEHFunclets())
	    AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
	  bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
	  // Assume we can't combine the last pop with the sp restore.

	  if (!CombineSPBump && PrologueSaveSize != 0) {
	    MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
	    // Skip backwards over SEH pseudo-instructions to reach the real restore.
	    while (AArch64InstrInfo::isSEHInstruction(*Pop))
	      Pop = std::prev(Pop);
	    // Converting the last ldp to a post-index ldp is valid only if the last
	    // ldp's offset is 0.
	    const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
	    // If the offset is 0, convert it to a post-index ldp.
	    if (OffsetOp.getImm() == 0)
	      convertCalleeSaveRestoreToSPPrePostIncDec(
	          MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
	    else {
	      // If not, make sure to emit an add after the last ldp.
	      // We're doing this by transferring the size to be restored from the
	      // adjustment *before* the CSR pops to the adjustment *after* the CSR
	      // pops.
	      AfterCSRPopSize += PrologueSaveSize;
	    }
	  }

	  // Move past the restores of the callee-saved registers.
	  // If we plan on combining the sp bump of the local stack size and the callee
	  // save stack size, we might need to adjust the CSR save and restore offsets.
	  MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
	  MachineBasicBlock::iterator Begin = MBB.begin();
	  while (LastPopI != Begin) {
	    --LastPopI;
	    if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
	        IsSVECalleeSave(LastPopI)) {
	      ++LastPopI;
	      break;
	    } else if (CombineSPBump)
	      fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
	                                        NeedsWinCFI, &HasWinCFI);
	  }

	  if (NeedsWinCFI) {
	    HasWinCFI = true;
	    BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
	        .setMIFlag(MachineInstr::FrameDestroy);
	  }

	  const StackOffset &SVEStackSize = getSVEStackSize(MF);

	  // If there is a single SP update, insert it before the ret and we're done.
	  if (CombineSPBump) {
	    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
	    emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
	                    {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII,
	                    MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
	    if (NeedsWinCFI && HasWinCFI)
	      BuildMI(MBB, MBB.getFirstTerminator(), DL,
	              TII->get(AArch64::SEH_EpilogEnd))
	          .setMIFlag(MachineInstr::FrameDestroy);
	    return;
	  }

	  NumBytes -= PrologueSaveSize;
	  assert(NumBytes >= 0 && "Negative stack allocation size!?");

	  // Process the SVE callee-saves to determine what space needs to be
	  // deallocated.
	  StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
	  MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
	  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
	    RestoreBegin = std::prev(RestoreEnd);
	    while (IsSVECalleeSave(RestoreBegin) &&
	           RestoreBegin != MBB.begin())
	      --RestoreBegin;
	    ++RestoreBegin;

	    assert(IsSVECalleeSave(RestoreBegin) &&
	           IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");

	    StackOffset CalleeSavedSizeAsOffset = {CalleeSavedSize, MVT::nxv1i8};
	    DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
	    DeallocateAfter = CalleeSavedSizeAsOffset;
	  }

	  // Deallocate the SVE area.
	  if (SVEStackSize) {
	    if (AFI->isStackRealigned()) {
	      if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
	        // Set SP to start of SVE callee-save area from which they can
	        // be reloaded. The code below will deallocate the stack
	        // space by moving FP -> SP.
	        emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
	                        {-CalleeSavedSize, MVT::nxv1i8}, TII,
	                        MachineInstr::FrameDestroy);
	    } else {
	      if (AFI->getSVECalleeSavedStackSize()) {
	        // Deallocate the non-SVE locals first before we can deallocate (and
	        // restore callee saves) from the SVE area.
	        emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
	                        {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy);
	        NumBytes = 0;
	      }

	      emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
	                      DeallocateBefore, TII, MachineInstr::FrameDestroy);

	      emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
	                      DeallocateAfter, TII, MachineInstr::FrameDestroy);
	    }
	  }

	  if (!hasFP(MF)) {
	    bool RedZone = canUseRedZone(MF);
	    // If this was a redzone leaf function, we don't need to restore the
	    // stack pointer (but we may need to pop stack args for fastcc).
	    if (RedZone && AfterCSRPopSize == 0)
	      return;

	    bool NoCalleeSaveRestore = PrologueSaveSize == 0;
	    int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
	    if (NoCalleeSaveRestore)
	      StackRestoreBytes += AfterCSRPopSize;

	    // If we were able to combine the local stack pop with the argument pop,
	    // then we're done.
	    bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;

	    // If we're done after this, make sure to help the load store optimizer.
	    if (Done)
	      adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);

	    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
	                    {StackRestoreBytes, MVT::i8}, TII,
	                    MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
	    if (Done) {
	      if (NeedsWinCFI) {
	        HasWinCFI = true;
	        BuildMI(MBB, MBB.getFirstTerminator(), DL,
	                TII->get(AArch64::SEH_EpilogEnd))
	            .setMIFlag(MachineInstr::FrameDestroy);
	      }
	      return;
	    }

	    NumBytes = 0;
	  }

	  // Restore the original stack pointer.
	  // FIXME: Rather than doing the math here, we should instead just use
	  // non-post-indexed loads for the restores if we aren't actually going to
	  // be able to save any instructions.
	  if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
	    int64_t OffsetToFrameRecord =
	        isTargetDarwin(MF) ? (-(int64_t)AFI->getCalleeSavedStackSize() + 16) : 0;
	    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
	                    {OffsetToFrameRecord, MVT::i8},
	                    TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
	  } else if (NumBytes)
	    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
	                    {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy, false,
	                    NeedsWinCFI);

	  // This must be placed after the callee-save restore code because that code
	  // assumes the SP is at the same location as it was after the callee-save save
	  // code in the prologue.
	  if (AfterCSRPopSize) {
	    // Find an insertion point for the first ldp so that it goes before the
	    // shadow call stack epilog instruction. This ensures that the restore of
	    // lr from x18 is placed after the restore from sp.
	    auto FirstSPPopI = MBB.getFirstTerminator();
	    while (FirstSPPopI != Begin) {
	      auto Prev = std::prev(FirstSPPopI);
	      if (Prev->getOpcode() != AArch64::LDRXpre ||
	          Prev->getOperand(0).getReg() == AArch64::SP)
	        break;
	      FirstSPPopI = Prev;
	    }

	    adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);

	    emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
	                    {(int64_t)AfterCSRPopSize, MVT::i8}, TII,
	                    MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
	  }
	  if (NeedsWinCFI && HasWinCFI)
	    BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
	        .setMIFlag(MachineInstr::FrameDestroy);

	  MF.setHasWinCFI(HasWinCFI);
	}

	/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
	/// debug info. It's the same as what we use for resolving the code-gen
	/// references for now. FIXME: This can go wrong when references are
	/// SP-relative and simple call frames aren't used.
	///
	/// \param MF       Function that owns the frame index.
	/// \param FI       Frame index to resolve.
	/// \param FrameReg Set (by resolveFrameIndexReference) to the base register.
	/// \returns the fixed byte part of the resolved offset.
	int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
	                                                 int FI,
	                                                 Register &FrameReg) const {
	  // The FP is preferred when the function carries the SanitizeHWAddress
	  // attribute; the offset is not required to fit a signed immediate.
	  return resolveFrameIndexReference(
	             MF, FI, FrameReg,
	             /*PreferFP=*/
	             MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
	             /*ForSimm=*/false)
	      .getBytes();
	}

	/// Resolve \p FI for a non-local reference; this simply forwards to
	/// getSEHFrameIndexOffset() and returns its result.
	int AArch64FrameLowering::getNonLocalFrameIndexReference(
	    const MachineFunction &MF, int FI) const {
	  const int SEHOffset = getSEHFrameIndexOffset(MF, FI);
	  return SEHOffset;
	}

	/// Translate \p ObjectOffset into a byte offset relative to the frame pointer.
	///
	/// The result is ObjectOffset plus the fixed-object size (computed for the
	/// non-funclet case) plus an FP adjustment: 16 on Darwin targets, otherwise
	/// the callee-saved stack size.
	static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset) {
	  const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
	  const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
	  bool IsWin64 =
	      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());

	  unsigned FixedObject =
	      getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
	  unsigned FPAdjust = isTargetDarwin(MF)
	      ? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo());
	  return {ObjectOffset + FixedObject + FPAdjust, MVT::i8};
	}

	/// Translate \p ObjectOffset into a byte offset relative to the stack
	/// pointer by adding the function's total stack size.
	static StackOffset getStackOffset(const MachineFunction &MF, int64_t ObjectOffset) {
	  const int64_t FrameSize = (int64_t)MF.getFrameInfo().getStackSize();
	  return {ObjectOffset + FrameSize, MVT::i8};
	}

	int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
	int FI) const {
	const auto RegInfo = static_cast<const AArch64RegisterInfo >(
	MF.getSubtarget().getRegisterInfo());
	int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
	return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
	? getFPOffset(MF, ObjectOffset).getBytes()
	: getStackOffset(MF, ObjectOffset).getBytes();
	}

	/// Resolve frame index \p FI to a base register plus offset.
	///
	/// Looks up the object's offset, whether it is a fixed object, and whether
	/// its stack ID is SVEVector, then defers to resolveFrameOffsetReference().
	StackOffset AArch64FrameLowering::resolveFrameIndexReference(
	    const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
	    bool ForSimm) const {
	  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
	  const int64_t OffsetOfObject = FrameInfo.getObjectOffset(FI);
	  const bool IsFixedObject = FrameInfo.isFixedObjectIndex(FI);
	  const bool IsSVEObject =
	      FrameInfo.getStackID(FI) == TargetStackID::SVEVector;
	  return resolveFrameOffsetReference(MF, OffsetOfObject, IsFixedObject,
	                                     IsSVEObject, FrameReg, PreferFP,
	                                     ForSimm);
	}

	/// Pick the base register for an object located at \p ObjectOffset and return
	/// the offset from that register.
	///
	/// \param ObjectOffset Offset of the object as recorded in the frame info.
	/// \param isFixed      True if the object is a fixed object.
	/// \param isSVE        True if the object lives in the SVE area.
	/// \param FrameReg     Set to the chosen base: the frame register, the base
	///                     pointer, or SP.
	/// \param PreferFP     Caller's hint to favour the frame pointer; refined
	///                     below.
	/// \param ForSimm      True if the offset is intended for a signed-immediate
	///                     form (FP offsets below -256 are then treated as not
	///                     fitting).
	StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
	    const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
	    Register &FrameReg, bool PreferFP, bool ForSimm) const {
	  const auto &MFI = MF.getFrameInfo();
	  const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
	      MF.getSubtarget().getRegisterInfo());
	  const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
	  const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();

	  int64_t FPOffset = getFPOffset(MF, ObjectOffset).getBytes();
	  int64_t Offset = getStackOffset(MF, ObjectOffset).getBytes();
	  // A non-fixed object whose offset falls within the callee-save area.
	  bool isCSR =
	      !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));

	  const StackOffset &SVEStackSize = getSVEStackSize(MF);

	  // Use frame pointer to reference fixed objects. Use it for locals if
	  // there are VLAs or a dynamically realigned SP (and thus the SP isn't
	  // reliable as a base). Make sure useFPForScavengingIndex() does the
	  // right thing for the emergency spill slot.
	  bool UseFP = false;
	  if (AFI->hasStackFrame() && !isSVE) {
	    // We shouldn't prefer using the FP when there is an SVE area
	    // in between the FP and the non-SVE locals/spills.
	    PreferFP &= !SVEStackSize;

	    // Note: Keeping the following as multiple 'if' statements rather than
	    // merging to a single expression for readability.
	    //
	    // Argument access should always use the FP.
	    if (isFixed) {
	      UseFP = hasFP(MF);
	    } else if (isCSR && RegInfo->needsStackRealignment(MF)) {
	      // References to the CSR area must use FP if we're re-aligning the stack
	      // since the dynamically-sized alignment padding is between the SP/BP and
	      // the CSR area.
	      assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
	      UseFP = true;
	    } else if (hasFP(MF) && !RegInfo->needsStackRealignment(MF)) {
	      // If the FPOffset is negative and we're producing a signed immediate, we
	      // have to keep in mind that the available offset range for negative
	      // offsets is smaller than for positive ones. If an offset is available
	      // via the FP and the SP, use whichever is closest.
	      bool FPOffsetFits = !ForSimm || FPOffset >= -256;
	      PreferFP |= Offset > -FPOffset;

	      if (MFI.hasVarSizedObjects()) {
	        // If we have variable sized objects, we can use either FP or BP, as the
	        // SP offset is unknown. We can use the base pointer if we have one and
	        // FP is not preferred. If not, we're stuck with using FP.
	        bool CanUseBP = RegInfo->hasBasePointer(MF);
	        if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
	          UseFP = PreferFP;
	        else if (!CanUseBP) // Can't use BP. Forced to use FP.
	          UseFP = true;
	        // else we can use BP and FP, but the offset from FP won't fit.
	        // That will make us scavenge registers which we can probably avoid by
	        // using BP. If it won't fit for BP either, we'll scavenge anyway.
	      } else if (FPOffset >= 0) {
	        // Use SP or FP, whichever gives us the best chance of the offset
	        // being in range for direct access. If the FPOffset is positive,
	        // that'll always be best, as the SP will be even further away.
	        UseFP = true;
	      } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
	        // Funclets access the locals contained in the parent's stack frame
	        // via the frame pointer, so we have to use the FP in the parent
	        // function.
	        (void) Subtarget;
	        assert(
	            Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
	            "Funclets should only be present on Win64");
	        UseFP = true;
	      } else {
	        // We have the choice between FP and (SP or BP).
	        if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
	          UseFP = true;
	      }
	    }
	  }

	  assert(((isFixed || isCSR) || !RegInfo->needsStackRealignment(MF) || !UseFP) &&
	         "In the presence of dynamic stack pointer realignment, "
	         "non-argument/CSR objects cannot be accessed through the frame pointer");

	  if (isSVE) {
	    int64_t OffsetToSVEArea =
	        MFI.getStackSize() - AFI->getCalleeSavedStackSize();
	    StackOffset FPOffset = {ObjectOffset, MVT::nxv1i8};
	    StackOffset SPOffset = SVEStackSize +
	                           StackOffset(ObjectOffset, MVT::nxv1i8) +
	                           StackOffset(OffsetToSVEArea, MVT::i8);
	    // Always use the FP for SVE spills if available and beneficial.
	    if (hasFP(MF) &&
	        (SPOffset.getBytes() ||
	         FPOffset.getScalableBytes() < SPOffset.getScalableBytes() ||
	         RegInfo->needsStackRealignment(MF))) {
	      FrameReg = RegInfo->getFrameRegister(MF);
	      return FPOffset;
	    }

	    FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
	                                           : (unsigned)AArch64::SP;
	    return SPOffset;
	  }

	  // Account for the SVE area when the chosen base register is on the other
	  // side of it from the object being addressed.
	  StackOffset ScalableOffset = {};
	  if (UseFP && !(isFixed || isCSR))
	    ScalableOffset = -SVEStackSize;
	  if (!UseFP && (isFixed || isCSR))
	    ScalableOffset = SVEStackSize;

	  if (UseFP) {
	    FrameReg = RegInfo->getFrameRegister(MF);
	    return StackOffset(FPOffset, MVT::i8) + ScalableOffset;
	  }

	  // Use the base pointer if we have one.
	  if (RegInfo->hasBasePointer(MF))
	    FrameReg = RegInfo->getBaseRegister();
	  else {
	    assert(!MFI.hasVarSizedObjects() &&
	           "Can't use SP when we have var sized objects.");
	    FrameReg = AArch64::SP;
	    // If we're using the red zone for this function, the SP won't actually
	    // be adjusted, so the offsets will be negative. They're also all
	    // within range of the signed 9-bit immediate instructions.
	    if (canUseRedZone(MF))
	      Offset -= AFI->getLocalStackSize();
	  }

	  return StackOffset(Offset, MVT::i8) + ScalableOffset;
	}

	/// Return the kill-state flag to attach to \p Reg when it is spilled in the
	/// prologue: the kill flag is requested only if \p Reg is not a live-in.
	static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
	  // A value that is also marked live-in must not be killed. This happens with
	  // the @llvm-returnaddress intrinsic and with arguments passed in callee
	  // saved registers. Leaving the kill flag off is conservatively correct even
	  // if the live-in turns out to be unused.
	  const bool MayKill = !MF.getRegInfo().isLiveIn(Reg);
	  return getKillRegState(MayKill);
	}

	/// Return true when the target is MachO, except when the target lowering
	/// supports swifterror and the function has a SwiftError attribute somewhere
	/// in its attribute list.
	static bool produceCompactUnwindFrame(MachineFunction &MF) {
	  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
	  if (!Subtarget.isTargetMachO())
	    return false;

	  if (!Subtarget.getTargetLowering()->supportSwiftError())
	    return true;

	  AttributeList FnAttrs = MF.getFunction().getAttributes();
	  return !FnAttrs.hasAttrSomewhere(Attribute::SwiftError);
	}

	/// Return true if \p Reg1 and \p Reg2 must not be paired on a Windows target.
	///
	/// Pairing is always rejected when \p Reg2 is FP. Otherwise it is rejected
	/// only when \p NeedsWinCFI is set and the registers are not consecutive
	/// (Reg2 != Reg1 + 1).
	static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
	                                             bool NeedsWinCFI) {
	  // If we are generating register pairs for a Windows function that requires
	  // EH support, then pair consecutive registers only. There are no unwind
	  // opcodes for saves/restores of non-consecutive register pairs.
	  // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x.
	  // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling

	  // TODO: LR can be paired with any register. We don't support this yet in
	  // the MCLayer. We need to add support for the save_lrpair unwind code.
	  if (Reg2 == AArch64::FP)
	    return true;

	  return NeedsWinCFI && Reg2 != Reg1 + 1;
	}

	/// Tells whether \p Reg1 and \p Reg2 are forbidden from sharing one ldp/stp.
	/// With the Windows AAPCS the decision is delegated to
	/// invalidateWindowsRegisterPairing() (WindowsCFI only allows consecutive
	/// registers to be paired). Elsewhere, when a frame record is needed, LR and
	/// FP have to be allocated together, so a pair whose second register is LR is
	/// rejected.
	static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
	                                      bool UsesWinAAPCS, bool NeedsWinCFI,
	                                      bool NeedsFrameRecord) {
	  // When the frame record has to be stored, nothing but FP may be paired
	  // with LR.
	  return UsesWinAAPCS
	             ? invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI)
	             : (NeedsFrameRecord && Reg2 == AArch64::LR);
	}

	namespace {

	/// Describes one callee-save spill/fill: a single register or a pair, the
	/// frame index of the first register, and the scaled immediate offset used
	/// by the load/store instruction.
	struct RegPairInfo {
	  unsigned Reg1 = AArch64::NoRegister;
	  unsigned Reg2 = AArch64::NoRegister; // NoRegister when unpaired.
	  int FrameIdx;
	  int Offset; // In units of getScale().
	  enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;

	  RegPairInfo() = default;

	  /// True when a second register has been assigned.
	  bool isPaired() const { return Reg2 != AArch64::NoRegister; }

	  /// Scale factor that Offset is expressed in: 2 for PPR, 8 for GPR/FPR64,
	  /// 16 for ZPR/FPR128.
	  unsigned getScale() const {
	    switch (Type) {
	    case PPR:
	      return 2;
	    case GPR:
	    case FPR64:
	      return 8;
	    case ZPR:
	    case FPR128:
	      return 16;
	    }
	    llvm_unreachable("Unsupported type");
	  }

	  /// True for the SVE register kinds (PPR and ZPR).
	  bool isScalable() const { return Type == PPR || Type == ZPR; }
	};

	} // end anonymous namespace

	/// Group the callee-saved registers in \p CSI into single registers or pairs
	/// and append one RegPairInfo per group to \p RegPairs.
	///
	/// For each entry the register class is determined, the following register
	/// is paired with it when the class and the pairing rules allow, and the
	/// scaled offset within the callee-save area is computed.
	///
	/// \param NeedShadowCallStackProlog Set to true when LR is among the saved
	///        registers and the function has the ShadowCallStack attribute; it is
	///        never cleared here.
	/// \param NeedsFrameRecord Forwarded to invalidateRegisterPairing() and
	///        checked by the frame-record assertions.
	static void computeCalleeSaveRegisterPairs(
	    MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
	    const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
	    bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {

	  if (CSI.empty())
	    return;

	  bool IsWindows = isTargetWindows(MF);
	  bool NeedsWinCFI = needsWinCFI(MF);
	  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  CallingConv::ID CC = MF.getFunction().getCallingConv();
	  unsigned Count = CSI.size();
	  (void)CC;
	  // MachO's compact unwind format relies on all registers being stored in
	  // pairs.
	  assert((!produceCompactUnwindFrame(MF) ||
	          CC == CallingConv::PreserveMost ||
	          (Count & 1) == 0) &&
	         "Odd number of callee-saved regs to spill!");
	  // Offsets start at the end of each area and are decremented per entry.
	  int ByteOffset = AFI->getCalleeSavedStackSize();
	  int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
	  // On Linux, we will have either one or zero non-paired register. On Windows
	  // with CFI, we can have multiple unpaired registers in order to utilize the
	  // available unwind codes. This flag assures that the alignment fixup is done
	  // only once, as intended.
	  bool FixupDone = false;

	  for (unsigned i = 0; i < Count; ++i) {
	    RegPairInfo RPI;
	    RPI.Reg1 = CSI[i].getReg();

	    if (AArch64::GPR64RegClass.contains(RPI.Reg1))
	      RPI.Type = RegPairInfo::GPR;
	    else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
	      RPI.Type = RegPairInfo::FPR64;
	    else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
	      RPI.Type = RegPairInfo::FPR128;
	    else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
	      RPI.Type = RegPairInfo::ZPR;
	    else if (AArch64::PPRRegClass.contains(RPI.Reg1))
	      RPI.Type = RegPairInfo::PPR;
	    else
	      llvm_unreachable("Unsupported register class.");

	    // Add the next reg to the pair if it is in the same register class.
	    if (i + 1 < Count) {
	      unsigned NextReg = CSI[i + 1].getReg();
	      switch (RPI.Type) {
	      case RegPairInfo::GPR:
	        if (AArch64::GPR64RegClass.contains(NextReg) &&
	            !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows, NeedsWinCFI,
	                                       NeedsFrameRecord))
	          RPI.Reg2 = NextReg;
	        break;
	      case RegPairInfo::FPR64:
	        if (AArch64::FPR64RegClass.contains(NextReg) &&
	            !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI))
	          RPI.Reg2 = NextReg;
	        break;
	      case RegPairInfo::FPR128:
	        if (AArch64::FPR128RegClass.contains(NextReg))
	          RPI.Reg2 = NextReg;
	        break;
	      case RegPairInfo::PPR:
	      case RegPairInfo::ZPR:
	        // SVE registers are never paired.
	        break;
	      }
	    }

	    // If either of the registers to be saved is the lr register, it means that
	    // we also need to save lr in the shadow call stack.
	    if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
	        MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
	      if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
	        report_fatal_error("Must reserve x18 to use shadow call stack");
	      NeedShadowCallStackProlog = true;
	    }

	    // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
	    // list to come in sorted by frame index so that we can issue the store
	    // pair instructions directly. Assert if we see anything otherwise.
	    //
	    // The order of the registers in the list is controlled by
	    // getCalleeSavedRegs(), so they will always be in-order, as well.
	    assert((!RPI.isPaired() ||
	            (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) &&
	           "Out of order callee saved regs!");

	    assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
	            RPI.Reg1 == AArch64::LR) &&
	           "FrameRecord must be allocated together with LR");

	    // Windows AAPCS has FP and LR reversed.
	    assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
	            RPI.Reg2 == AArch64::LR) &&
	           "FrameRecord must be allocated together with LR");

	    // MachO's compact unwind format relies on all registers being stored in
	    // adjacent register pairs.
	    assert((!produceCompactUnwindFrame(MF) ||
	            CC == CallingConv::PreserveMost ||
	            (RPI.isPaired() &&
	             ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
	              RPI.Reg1 + 1 == RPI.Reg2))) &&
	           "Callee-save registers not saved as adjacent register pair!");

	    RPI.FrameIdx = CSI[i].getFrameIdx();

	    int Scale = RPI.getScale();
	    if (RPI.isScalable())
	      ScalableByteOffset -= Scale;
	    else
	      ByteOffset -= RPI.isPaired() ? 2 * Scale : Scale;

	    assert(!(RPI.isScalable() && RPI.isPaired()) &&
	           "Paired spill/fill instructions don't exist for SVE vectors");

	    // Round up size of non-pair to pair size if we need to pad the
	    // callee-save area to ensure 16-byte alignment.
	    if (AFI->hasCalleeSaveStackFreeSpace() && !FixupDone &&
	        !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
	        !RPI.isPaired()) {
	      FixupDone = true;
	      ByteOffset -= 8;
	      assert(ByteOffset % 16 == 0);
	      assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
	      MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
	    }

	    // Store the offset in units of the access size.
	    int Offset = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
	    assert(Offset % Scale == 0);
	    RPI.Offset = Offset / Scale;

	    assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
	            (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
	           "Offset out of bounds for LDP/STP immediate");

	    RegPairs.push_back(RPI);
	    // A pair consumed two CSI entries.
	    if (RPI.isPaired())
	      ++i;
	  }
	}

	/// Emit the stores that spill the callee-saved registers in \p CSI before
	/// \p MI in \p MBB.
	///
	/// The registers are first grouped by computeCalleeSaveRegisterPairs(); the
	/// groups are then emitted in reverse order, each as an SP-relative store
	/// flagged FrameSetup. When a shadow call stack prolog is required, LR is
	/// additionally pushed to the x18 stack first. Always returns true.
	bool AArch64FrameLowering::spillCalleeSavedRegisters(
	    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
	    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
	  MachineFunction &MF = *MBB.getParent();
	  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
	  bool NeedsWinCFI = needsWinCFI(MF);
	  DebugLoc DL;
	  SmallVector<RegPairInfo, 8> RegPairs;

	  bool NeedShadowCallStackProlog = false;
	  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
	                                 NeedShadowCallStackProlog, hasFP(MF));
	  const MachineRegisterInfo &MRI = MF.getRegInfo();

	  if (NeedShadowCallStackProlog) {
	    // Shadow call stack prolog: str x30, [x18], #8
	    BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
	        .addReg(AArch64::X18, RegState::Define)
	        .addReg(AArch64::LR)
	        .addReg(AArch64::X18)
	        .addImm(8)
	        .setMIFlag(MachineInstr::FrameSetup);

	    if (NeedsWinCFI)
	      BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
	          .setMIFlag(MachineInstr::FrameSetup);

	    if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) {
	      // Emit a CFI instruction that causes 8 to be subtracted from the value of
	      // x18 when unwinding past this frame.
	      static const char CFIInst[] = {
	          dwarf::DW_CFA_val_expression,
	          18, // register
	          2,  // length
	          static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
	          static_cast<char>(-8) & 0x7f, // addend (sleb128)
	      };
	      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
	          nullptr, StringRef(CFIInst, sizeof(CFIInst))));
	      BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
	          .addCFIIndex(CFIIndex)
	          .setMIFlag(MachineInstr::FrameSetup);
	    }

	    // This instruction also makes x18 live-in to the entry block.
	    MBB.addLiveIn(AArch64::X18);
	  }

	  for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
	       ++RPII) {
	    RegPairInfo RPI = *RPII;
	    unsigned Reg1 = RPI.Reg1;
	    unsigned Reg2 = RPI.Reg2;
	    unsigned StrOpc;

	    // Issue sequence of spills for cs regs.  The first spill may be converted
	    // to a pre-decrement store later by emitPrologue if the callee-save stack
	    // area allocation can't be combined with the local stack area allocation.
	    // For example:
	    //    stp     x22, x21, [sp, #0]     // addImm(+0)
	    //    stp     x20, x19, [sp, #16]    // addImm(+2)
	    //    stp     fp, lr, [sp, #32]      // addImm(+4)
	    // Rationale: This sequence saves uop updates compared to a sequence of
	    //            pre-increment spills like stp xi,xj,[sp,#-16]!
	    // Note: Similar rationale and sequence for restores in epilog.
	    unsigned Size;
	    Align Alignment;
	    switch (RPI.Type) {
	    case RegPairInfo::GPR:
	      StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
	      Size = 8;
	      Alignment = Align(8);
	      break;
	    case RegPairInfo::FPR64:
	      StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
	      Size = 8;
	      Alignment = Align(8);
	      break;
	    case RegPairInfo::FPR128:
	      StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
	      Size = 16;
	      Alignment = Align(16);
	      break;
	    case RegPairInfo::ZPR:
	      StrOpc = AArch64::STR_ZXI;
	      Size = 16;
	      Alignment = Align(16);
	      break;
	    case RegPairInfo::PPR:
	      StrOpc = AArch64::STR_PXI;
	      Size = 2;
	      Alignment = Align(2);
	      break;
	    }
	    LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
	               if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
	               dbgs() << ") -> fi#(" << RPI.FrameIdx;
	               if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
	               dbgs() << ")\n");

	    assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
	           "Windows unwdinding requires a consecutive (FP,LR) pair");
	    // Windows unwind codes require consecutive registers if registers are
	    // paired.  Make the switch here, so that the code below will save (x,x+1)
	    // and not (x+1,x).
	    unsigned FrameIdxReg1 = RPI.FrameIdx;
	    unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
	    if (NeedsWinCFI && RPI.isPaired()) {
	      std::swap(Reg1, Reg2);
	      std::swap(FrameIdxReg1, FrameIdxReg2);
	    }
	    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
	    if (!MRI.isReserved(Reg1))
	      MBB.addLiveIn(Reg1);
	    if (RPI.isPaired()) {
	      if (!MRI.isReserved(Reg2))
	        MBB.addLiveIn(Reg2);
	      MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
	      MIB.addMemOperand(MF.getMachineMemOperand(
	          MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
	          MachineMemOperand::MOStore, Size, Alignment));
	    }
	    MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
	        .addReg(AArch64::SP)
	        .addImm(RPI.Offset) // [sp, #offset*scale],
	                            // where factor*scale is implicit
	        .setMIFlag(MachineInstr::FrameSetup);
	    MIB.addMemOperand(MF.getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
	        MachineMemOperand::MOStore, Size, Alignment));
	    if (NeedsWinCFI)
	      InsertSEH(MIB, TII, MachineInstr::FrameSetup);

	    // Update the StackIDs of the SVE stack slots.
	    MachineFrameInfo &MFI = MF.getFrameInfo();
	    if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
	      MFI.setStackID(RPI.FrameIdx, TargetStackID::SVEVector);

	  }
	  return true;
	}

	/// Emit the loads that restore the callee-saved registers in \p CSI before
	/// \p MI in \p MBB.
	///
	/// The registers are grouped by computeCalleeSaveRegisterPairs(). Scalable
	/// (SVE) entries are emitted first, in reverse order; the remaining entries
	/// follow in reverse or forward order depending on ReverseCSRRestoreSeq.
	/// When a shadow call stack prolog was required, LR is finally reloaded from
	/// the x18 stack. Always returns true.
	bool AArch64FrameLowering::restoreCalleeSavedRegisters(
	MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
	MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
	MachineFunction &MF = *MBB.getParent();
	const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
	DebugLoc DL;
	SmallVector<RegPairInfo, 8> RegPairs;
	bool NeedsWinCFI = needsWinCFI(MF);

	// Reuse the debug location of the insertion point when there is one.
	if (MI != MBB.end())
	DL = MI->getDebugLoc();

	bool NeedShadowCallStackProlog = false;
	computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
	NeedShadowCallStackProlog, hasFP(MF));

	// Emits the load (single or pair) for one RegPairInfo entry.
	auto EmitMI = [&](const RegPairInfo &RPI) {
	unsigned Reg1 = RPI.Reg1;
	unsigned Reg2 = RPI.Reg2;

	// Issue sequence of restores for cs regs. The last restore may be converted
	// to a post-increment load later by emitEpilogue if the callee-save stack
	// area allocation can't be combined with the local stack area allocation.
	// For example:
	// ldp fp, lr, [sp, #32] // addImm(+4)
	// ldp x20, x19, [sp, #16] // addImm(+2)
	// ldp x22, x21, [sp, #0] // addImm(+0)
	// Note: see comment in spillCalleeSavedRegisters()
	unsigned LdrOpc;
	unsigned Size;
	Align Alignment;
	switch (RPI.Type) {
	case RegPairInfo::GPR:
	LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
	Size = 8;
	Alignment = Align(8);
	break;
	case RegPairInfo::FPR64:
	LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
	Size = 8;
	Alignment = Align(8);
	break;
	case RegPairInfo::FPR128:
	LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
	Size = 16;
	Alignment = Align(16);
	break;
	case RegPairInfo::ZPR:
	LdrOpc = AArch64::LDR_ZXI;
	Size = 16;
	Alignment = Align(16);
	break;
	case RegPairInfo::PPR:
	LdrOpc = AArch64::LDR_PXI;
	Size = 2;
	Alignment = Align(2);
	break;
	}
	LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
	if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
	dbgs() << ") -> fi#(" << RPI.FrameIdx;
	if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
	dbgs() << ")\n");

	// Windows unwind codes require consecutive registers if registers are
	// paired. Make the switch here, so that the code below will restore (x,x+1)
	// and not (x+1,x).
	unsigned FrameIdxReg1 = RPI.FrameIdx;
	unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
	if (NeedsWinCFI && RPI.isPaired()) {
	std::swap(Reg1, Reg2);
	std::swap(FrameIdxReg1, FrameIdxReg2);
	}
	MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
	if (RPI.isPaired()) {
	MIB.addReg(Reg2, getDefRegState(true));
	MIB.addMemOperand(MF.getMachineMemOperand(
	MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
	MachineMemOperand::MOLoad, Size, Alignment));
	}
	MIB.addReg(Reg1, getDefRegState(true))
	.addReg(AArch64::SP)
	.addImm(RPI.Offset) // [sp, #offset*scale]
	// where factor*scale is implicit
	.setMIFlag(MachineInstr::FrameDestroy);
	MIB.addMemOperand(MF.getMachineMemOperand(
	MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
	MachineMemOperand::MOLoad, Size, Alignment));
	if (NeedsWinCFI)
	InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
	};

	// SVE objects are always restored in reverse order.
	for (const RegPairInfo &RPI : reverse(RegPairs))
	if (RPI.isScalable())
	EmitMI(RPI);

	// Non-scalable entries: order is selected by ReverseCSRRestoreSeq.
	if (ReverseCSRRestoreSeq) {
	for (const RegPairInfo &RPI : reverse(RegPairs))
	if (!RPI.isScalable())
	EmitMI(RPI);
	} else
	for (const RegPairInfo &RPI : RegPairs)
	if (!RPI.isScalable())
	EmitMI(RPI);

	if (NeedShadowCallStackProlog) {
	// Shadow call stack epilog: ldr x30, [x18, #-8]!
	BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
	.addReg(AArch64::X18, RegState::Define)
	.addReg(AArch64::LR, RegState::Define)
	.addReg(AArch64::X18)
	.addImm(-8)
	.setMIFlag(MachineInstr::FrameDestroy);
	}

	return true;
	}

	/// Determine the set of callee-saved registers this function must save and
	/// restore, and record the resulting callee-save area sizes.
	///
	/// Starts from TargetFrameLowering::determineCalleeSaves and then adds the
	/// AArch64-specific extras visible below: the base pointer, pair partners when
	/// a compact unwind frame is produced, X18 for Win64-convention functions on
	/// non-Windows targets, FP/LR when a frame record is needed, and (for big
	/// stacks) either one additional scratch GPR or an emergency spill slot that is
	/// registered with \p RS. The computed sizes are stored in the function's
	/// AArch64FunctionInfo.
	///
	/// Does nothing for functions using the GHC calling convention.
	void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
	                                                BitVector &SavedRegs,
	                                                RegScavenger *RS) const {
	  // All calls are tail calls in GHC calling conv, and functions have no
	  // prologue/epilogue.
	  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
	    return;

	  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
	  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
	      MF.getSubtarget().getRegisterInfo());
	  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
	  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	  // Last unused, non-reserved callee-saved GPR seen (and its pair partner);
	  // candidate for the extra scratch-register spill in the BigStack case.
	  unsigned UnspilledCSGPR = AArch64::NoRegister;
	  unsigned UnspilledCSGPRPaired = AArch64::NoRegister;

	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();

	  unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
	                                ? RegInfo->getBaseRegister()
	                                : (unsigned)AArch64::NoRegister;

	  unsigned ExtraCSSpill = 0;
	  // Figure out which callee-saved registers to save/restore.
	  for (unsigned i = 0; CSRegs[i]; ++i) {
	    const unsigned Reg = CSRegs[i];

	    // Add the base pointer register to SavedRegs if it is callee-save.
	    if (Reg == BasePointerReg)
	      SavedRegs.set(Reg);

	    bool RegUsed = SavedRegs.test(Reg);
	    unsigned PairedReg = AArch64::NoRegister;
	    // The pair partner is the neighbouring entry (index with bit 0 flipped)
	    // in the callee-saved register list.
	    if (AArch64::GPR64RegClass.contains(Reg) ||
	        AArch64::FPR64RegClass.contains(Reg) ||
	        AArch64::FPR128RegClass.contains(Reg))
	      PairedReg = CSRegs[i ^ 1];

	    if (!RegUsed) {
	      if (AArch64::GPR64RegClass.contains(Reg) &&
	          !RegInfo->isReservedReg(MF, Reg)) {
	        UnspilledCSGPR = Reg;
	        UnspilledCSGPRPaired = PairedReg;
	      }
	      continue;
	    }

	    // MachO's compact unwind format relies on all registers being stored in
	    // pairs.
	    // FIXME: the usual format is actually better if unwinding isn't needed.
	    if (produceCompactUnwindFrame(MF) && PairedReg != AArch64::NoRegister &&
	        !SavedRegs.test(PairedReg)) {
	      SavedRegs.set(PairedReg);
	      if (AArch64::GPR64RegClass.contains(PairedReg) &&
	          !RegInfo->isReservedReg(MF, PairedReg))
	        ExtraCSSpill = PairedReg;
	    }
	  }

	  if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
	      !Subtarget.isTargetWindows()) {
	    // For Windows calling convention on a non-windows OS, where X18 is treated
	    // as reserved, back up X18 when entering non-windows code (marked with the
	    // Windows calling convention) and restore when returning regardless of
	    // whether the individual function uses it - it might call other functions
	    // that clobber it.
	    SavedRegs.set(AArch64::X18);
	  }

	  // Calculates the callee saved stack size.
	  unsigned CSStackSize = 0;
	  unsigned SVECSStackSize = 0;
	  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
	  const MachineRegisterInfo &MRI = MF.getRegInfo();
	  for (unsigned Reg : SavedRegs.set_bits()) {
	    auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
	    if (AArch64::PPRRegClass.contains(Reg) ||
	        AArch64::ZPRRegClass.contains(Reg))
	      SVECSStackSize += RegSize;
	    else
	      CSStackSize += RegSize;
	  }

	  // Save number of saved regs, so we can easily update CSStackSize later.
	  unsigned NumSavedRegs = SavedRegs.count();

	  // The frame record needs to be created by saving the appropriate registers
	  uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
	  if (hasFP(MF) ||
	      windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
	    SavedRegs.set(AArch64::FP);
	    SavedRegs.set(AArch64::LR);
	  }

	  LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
	             for (unsigned Reg : SavedRegs.set_bits())
	               dbgs() << ' ' << printReg(Reg, RegInfo);
	             dbgs() << "\n";);

	  // If any callee-saved registers are used, the frame cannot be eliminated.
	  int64_t SVEStackSize =
	      alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
	  bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;

	  // The CSR spill slots have not been allocated yet, so estimateStackSize
	  // won't include them.
	  unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);

	  // Conservatively always assume BigStack when there are SVE spills.
	  bool BigStack = SVEStackSize ||
	                  (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
	  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
	    AFI->setHasStackFrame(true);

	  // Estimate if we might need to scavenge a register at some point in order
	  // to materialize a stack offset. If so, either spill one additional
	  // callee-saved register or reserve a special spill slot to facilitate
	  // register scavenging. If we already spilled an extra callee-saved register
	  // above to keep the number of spills even, we don't need to do anything else
	  // here.
	  if (BigStack) {
	    if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
	      LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
	                        << " to get a scratch register.\n");
	      SavedRegs.set(UnspilledCSGPR);
	      // MachO's compact unwind format relies on all registers being stored in
	      // pairs, so if we need to spill one extra for BigStack, then we need to
	      // store the pair.
	      if (produceCompactUnwindFrame(MF))
	        SavedRegs.set(UnspilledCSGPRPaired);
	      ExtraCSSpill = UnspilledCSGPR;
	    }

	    // If we didn't find an extra callee-saved register to spill, create
	    // an emergency spill slot.
	    if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
	      const TargetRegisterClass &RC = AArch64::GPR64RegClass;
	      unsigned Size = TRI->getSpillSize(RC);
	      Align Alignment = TRI->getSpillAlign(RC);
	      int FI = MFI.CreateStackObject(Size, Alignment, false);
	      RS->addScavengingFrameIndex(FI);
	      LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
	                        << " as the emergency spill slot.\n");
	    }
	  }

	  // Adding the size of additional 64bit GPR saves.
	  CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
	  uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
	  LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
	                    << EstimatedStackSize + AlignedCSStackSize
	                    << " bytes.\n");

	  assert((!MFI.isCalleeSavedInfoValid() ||
	          AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
	         "Should not invalidate callee saved info");

	  // Round up to register pair alignment to avoid additional SP adjustment
	  // instructions.
	  AFI->setCalleeSavedStackSize(AlignedCSStackSize);
	  AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
	  AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
	}

	bool AArch64FrameLowering::enableStackSlotScavenging(
	const MachineFunction &MF) const {
	const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	return AFI->hasCalleeSaveStackFreeSpace();
	}

	/// Find the range of frame indices used by SVE (ZPR/PPR) callee saves.
	///
	/// \p Min and \p Max are always written: on success they hold the smallest and
	/// largest frame index among the SVE callee saves. If the callee-saved info is
	/// not valid yet, or it contains no ZPR/PPR registers, they are left at
	/// numeric_limits<int>::max() / numeric_limits<int>::min() respectively.
	///
	/// \returns true if there are any SVE callee saves.
	static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
	                                      int &Min, int &Max) {
	  Min = std::numeric_limits<int>::max();
	  Max = std::numeric_limits<int>::min();

	  if (!MFI.isCalleeSavedInfoValid())
	    return false;

	  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
	  for (auto &CS : CSI) {
	    if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
	        AArch64::PPRRegClass.contains(CS.getReg())) {
	      // Each SVE callee save must directly follow the previous one.
	      assert((Max == std::numeric_limits<int>::min() ||
	              Max + 1 == CS.getFrameIdx()) &&
	             "SVE CalleeSaves are not consecutive");

	      Min = std::min(Min, CS.getFrameIdx());
	      Max = std::max(Max, CS.getFrameIdx());
	    }
	  }
	  return Min != std::numeric_limits<int>::max();
	}

	/// Lay out every SVE stack object and report how much SVE stack is needed.
	///
	/// The SVE callee-save slots are placed first, the running size is then padded
	/// to 16 bytes, and the remaining live SVE objects follow. Each object's offset
	/// is the negated running size; offsets are only written into \p MFI when
	/// \p AssignOffsets is set. \p MinCSFrameIndex and \p MaxCSFrameIndex receive
	/// the callee-save frame index range found by getSVECalleeSaveSlotRange().
	///
	/// \returns the accumulated size of the SVE stack area.
	static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
	                                              int &MinCSFrameIndex,
	                                              int &MaxCSFrameIndex,
	                                              bool AssignOffsets) {
	#ifndef NDEBUG
	  // Sanity check over the fixed stack objects: none may be an SVE vector.
	  for (int FixedIdx = MFI.getObjectIndexBegin(); FixedIdx != 0; ++FixedIdx)
	    assert(MFI.getStackID(FixedIdx) != TargetStackID::SVEVector &&
	           "SVE vectors should never be passed on the stack by value, only by "
	           "reference.");
	#endif

	  // Commits an object's offset, but only if the caller asked for offsets to
	  // be assigned.
	  auto Place = [&MFI, AssignOffsets](int FI, int64_t Offset) {
	    if (!AssignOffsets)
	      return;
	    LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
	    MFI.setObjectOffset(FI, Offset);
	  };

	  int64_t StackSize = 0;

	  // The callee-save slots are laid out before anything else.
	  const bool HasSVECalleeSaves =
	      getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex);
	  if (HasSVECalleeSaves) {
	    for (int Slot = MinCSFrameIndex; Slot <= MaxCSFrameIndex; ++Slot) {
	      StackSize = alignTo(StackSize + MFI.getObjectSize(Slot),
	                          MFI.getObjectAlign(Slot));
	      Place(Slot, -StackSize);
	    }
	  }

	  // Pad the callee-save area out to a 16-byte boundary.
	  StackSize = alignTo(StackSize, Align(16U));

	  // Gather the remaining objects: on the SVE stack, outside the callee-save
	  // range, and not dead.
	  SmallVector<int, 8> Pending;
	  const int NumObjects = MFI.getObjectIndexEnd();
	  for (int Idx = 0; Idx != NumObjects; ++Idx) {
	    const bool IsSVE = MFI.getStackID(Idx) == TargetStackID::SVEVector;
	    const bool IsCalleeSave = MinCSFrameIndex <= Idx && Idx <= MaxCSFrameIndex;
	    if (IsSVE && !IsCalleeSave && !MFI.isDeadObjectIndex(Idx))
	      Pending.push_back(Idx);
	  }

	  // Place all SVE locals and spills.
	  for (unsigned FI : Pending) {
	    const Align ObjAlign = MFI.getObjectAlign(FI);
	    // FIXME: Given that the length of SVE vectors is not necessarily a power of
	    // two, we'd need to align every object dynamically at runtime if the
	    // alignment is larger than 16. This is not yet supported.
	    if (ObjAlign > Align(16))
	      report_fatal_error(
	          "Alignment of scalable vectors > 16 bytes is not yet supported");

	    StackSize = alignTo(StackSize + MFI.getObjectSize(FI), ObjAlign);
	    Place(FI, -StackSize);
	  }

	  return StackSize;
	}

	/// Compute the SVE stack size without assigning any object offsets; the
	/// callee-save frame index range that falls out of the computation is
	/// discarded.
	int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
	    MachineFrameInfo &MFI) const {
	  int IgnoredMinCSFI, IgnoredMaxCSFI;
	  const int64_t SVEStackSize = determineSVEStackObjectOffsets(
	      MFI, IgnoredMinCSFI, IgnoredMaxCSFI, /*AssignOffsets=*/false);
	  return SVEStackSize;
	}

	/// Compute the SVE stack size and assign the offsets of all SVE stack objects.
	/// The SVE callee-save frame index range is passed back through
	/// \p MinCSFrameIndex / \p MaxCSFrameIndex.
	int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
	    MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
	  const int64_t SVEStackSize = determineSVEStackObjectOffsets(
	      MFI, MinCSFrameIndex, MaxCSFrameIndex, /*AssignOffsets=*/true);
	  return SVEStackSize;
	}

	/// Finalize SVE stack object offsets and, for functions with Win64-style EH
	/// funclets, create and initialize the UnwindHelp slot.
	///
	/// The SVE stack size (rounded up to 16) and the min/max SVE callee-save frame
	/// indices are recorded in AArch64FunctionInfo. When MF.hasEHFunclets() is
	/// true, a fixed 8-byte UnwindHelp object is created and a store of -2 to it
	/// is inserted after the leading FrameSetup instructions of the entry block,
	/// using a free register found through \p RS.
	void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
	    MachineFunction &MF, RegScavenger *RS) const {
	  MachineFrameInfo &MFI = MF.getFrameInfo();

	  assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
	         "Upwards growing stack unsupported");

	  int MinCSFrameIndex, MaxCSFrameIndex;
	  int64_t SVEStackSize =
	      assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);

	  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	  AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
	  AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);

	  // If this function isn't doing Win64-style C++ EH, we don't need to do
	  // anything.
	  if (!MF.hasEHFunclets())
	    return;
	  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
	  WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();

	  // Skip past the FrameSetup-flagged instructions at the start of the entry
	  // block; the UnwindHelp initialization is inserted right after them.
	  MachineBasicBlock &MBB = MF.front();
	  auto MBBI = MBB.begin();
	  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
	    ++MBBI;

	  // Create an UnwindHelp object.
	  // The UnwindHelp object is allocated at the start of the fixed object area
	  int64_t FixedObject =
	      getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
	  int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
	                                           /*SPOffset*/ -FixedObject,
	                                           /*IsImmutable=*/false);
	  EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;

	  // We need to store -2 into the UnwindHelp object at the start of the
	  // function.
	  DebugLoc DL;
	  RS->enterBasicBlockEnd(MBB);
	  RS->backward(std::prev(MBBI));
	  unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
	  assert(DstReg && "There must be a free register after frame setup");
	  BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
	  BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
	      .addReg(DstReg, getKillRegState(true))
	      .addFrameIndex(UnwindHelpFI)
	      .addImm(0);
	}

	namespace {
	/// A tag-store instruction queued for replacement, together with the Offset
	/// and Size values its creator computed for it.
	struct TagStoreInstr {
	  MachineInstr *MI;
	  int64_t Offset;
	  int64_t Size;

	  explicit TagStoreInstr(MachineInstr *Instr, int64_t Off, int64_t Sz)
	      : MI(Instr), Offset(Off), Size(Sz) {}
	};

	/// Collects a run of adjacent tag-store instructions in one basic block and
	/// replaces them with an equivalent sequence: either unrolled tag stores or a
	/// tag-store loop (see emitUnrolled / emitLoop / emitCode).
	class TagStoreEdit {
	  MachineFunction *MF;
	  MachineBasicBlock *MBB;
	  MachineRegisterInfo *MRI;
	  // Tag store instructions that are being replaced.
	  SmallVector<TagStoreInstr, 8> TagStores;
	  // Combined memref arguments of the above instructions.
	  SmallVector<MachineMemOperand *, 8> CombinedMemRefs;

	  // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
	  // FrameRegOffset + Size) with the address tag of SP.
	  Register FrameReg;
	  StackOffset FrameRegOffset;
	  int64_t Size;
	  // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
	  Optional<int64_t> FrameRegUpdate;
	  // MIFlags for any FrameReg updating instructions.
	  unsigned FrameRegUpdateFlags;

	  // Use zeroing instruction variants.
	  bool ZeroData;
	  DebugLoc DL;

	  void emitUnrolled(MachineBasicBlock::iterator InsertI);
	  void emitLoop(MachineBasicBlock::iterator InsertI);

	public:
	  /// \p MBB is the block being edited; \p ZeroData selects the zeroing
	  /// instruction variants for the emitted code.
	  TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
	      : MBB(MBB), ZeroData(ZeroData) {
	    MF = MBB->getParent();
	    MRI = &MF->getRegInfo();
	  }
	  // Add an instruction to be replaced. Instructions must be added in the
	  // ascending order of Offset, and have to be adjacent.
	  void addInstruction(TagStoreInstr I) {
	    assert((TagStores.empty() ||
	            TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
	           "Non-adjacent tag store instructions.");
	    TagStores.push_back(I);
	  }
	  // Forget the instructions collected so far.
	  void clear() { TagStores.clear(); }
	  // Emit equivalent code at the given location, and erase the current set of
	  // instructions. May skip if the replacement is not profitable. May invalidate
	  // the input iterator and replace it with a valid one.
	  void emitCode(MachineBasicBlock::iterator &InsertI,
	                const AArch64FrameLowering *TFI, bool IsLast);
	};

	/// Emit the replacement as a straight-line sequence of 16- and 32-byte tag
	/// stores covering [FrameReg + FrameRegOffset, ... + Size), inserted before
	/// \p InsertI. Size is consumed (counted down to zero) in the process.
	void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
	  const AArch64InstrInfo *TII =
	      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();

	  // Byte offset limits used to decide whether the stores can address the
	  // region directly from FrameReg.
	  const int64_t kMinOffset = -256 * 16;
	  const int64_t kMaxOffset = 255 * 16;

	  Register BaseReg = FrameReg;
	  int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes();
	  // If the region does not fit within the limits, materialize its start
	  // address in a scratch register and address everything relative to that.
	  if (BaseRegOffsetBytes < kMinOffset ||
	      BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
	    Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
	    emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
	                    {BaseRegOffsetBytes, MVT::i8}, TII);
	    BaseReg = ScratchReg;
	    BaseRegOffsetBytes = 0;
	  }

	  MachineInstr *LastI = nullptr;
	  while (Size) {
	    // Use a 32-byte store while more than 16 bytes remain.
	    int64_t InstrSize = (Size > 16) ? 32 : 16;
	    unsigned Opcode =
	        InstrSize == 16
	            ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
	            : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
	    MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
	                          .addReg(AArch64::SP)
	                          .addReg(BaseReg)
	                          .addImm(BaseRegOffsetBytes / 16)
	                          .setMemRefs(CombinedMemRefs);
	    // A store to [BaseReg, #0] should go last for an opportunity to fold the
	    // final SP adjustment in the epilogue.
	    if (BaseRegOffsetBytes == 0)
	      LastI = I;
	    BaseRegOffsetBytes += InstrSize;
	    Size -= InstrSize;
	  }

	  // Move the [BaseReg, #0] store (if any) to the end of the emitted sequence.
	  if (LastI)
	    MBB->splice(InsertI, MBB, LastI);
	}

	/// Emit the replacement as a tag-store loop pseudo (STGloop_wback or
	/// STZGloop_wback) inserted before \p InsertI. When FrameRegUpdate is set, the
	/// loop works directly on FrameReg and the requested FrameReg adjustment is
	/// folded into the emitted code, which then carries FrameRegUpdateFlags.
	void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
	  const AArch64InstrInfo *TII =
	      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();

	  // Without a FrameReg update, use a fresh register for the loop's base.
	  Register BaseReg = FrameRegUpdate
	                         ? FrameReg
	                         : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
	  Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);

	  emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);

	  int64_t LoopSize = Size;
	  // If the loop size is not a multiple of 32, split off one 16-byte store at
	  // the end to fold BaseReg update into.
	  if (FrameRegUpdate && *FrameRegUpdate)
	    LoopSize -= LoopSize % 32;
	  MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
	                                TII->get(ZeroData ? AArch64::STZGloop_wback
	                                                  : AArch64::STGloop_wback))
	                            .addDef(SizeReg)
	                            .addDef(BaseReg)
	                            .addImm(LoopSize)
	                            .addReg(BaseReg)
	                            .setMemRefs(CombinedMemRefs);
	  if (FrameRegUpdate)
	    LoopI->setFlags(FrameRegUpdateFlags);

	  // Adjustment still owed to BaseReg beyond the end of the tagged region.
	  int64_t ExtraBaseRegUpdate =
	      FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0;
	  if (LoopSize < Size) {
	    assert(FrameRegUpdate);
	    assert(Size - LoopSize == 16);
	    // Tag 16 more bytes at BaseReg and update BaseReg.
	    BuildMI(*MBB, InsertI, DL,
	            TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
	        .addDef(BaseReg)
	        .addReg(BaseReg)
	        .addReg(BaseReg)
	        .addImm(1 + ExtraBaseRegUpdate / 16)
	        .setMemRefs(CombinedMemRefs)
	        .setMIFlags(FrameRegUpdateFlags);
	  } else if (ExtraBaseRegUpdate) {
	    // Update BaseReg.
	    BuildMI(
	        *MBB, InsertI, DL,
	        TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
	        .addDef(BaseReg)
	        .addReg(BaseReg)
	        .addImm(std::abs(ExtraBaseRegUpdate))
	        .addImm(0)
	        .setMIFlags(FrameRegUpdateFlags);
	  }
	}

	// Check if *II is a register update (ADDXri / SUBXri of Reg into Reg) that can
	// be merged into an STGloop that ends at (Reg + Size).
	//
	// The update is mergeable when the adjustment left over after the end of the
	// loop (the instruction's signed offset minus Size) is a multiple of 16 whose
	// magnitude fits an unshifted ADDXri / SUBXri immediate. On success,
	// *TotalOffset receives the signed offset that *II applies to Reg and true is
	// returned; otherwise *TotalOffset is left untouched and false is returned.
	bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
	                       int64_t Size, int64_t *TotalOffset) {
	  MachineInstr &MI = *II;
	  if ((MI.getOpcode() == AArch64::ADDXri ||
	       MI.getOpcode() == AArch64::SUBXri) &&
	      MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
	    unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
	    int64_t Offset = MI.getOperand(2).getImm() << Shift;
	    if (MI.getOpcode() == AArch64::SUBXri)
	      Offset = -Offset;
	    int64_t AbsPostOffset = std::abs(Offset - Size);
	    const int64_t kMaxOffset =
	        0xFFF; // Max encoding for unshifted ADDXri / SUBXri
	    if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
	      *TotalOffset = Offset;
	      return true;
	    }
	  }
	  return false;
	}

	void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
	SmallVectorImpl<MachineMemOperand *> &MemRefs) {
	MemRefs.clear();
	for (auto &TS : TSE) {
	MachineInstr *MI = TS.MI;
	// An instruction without memory operands may access anything. Be
	// conservative and return an empty list.
	if (MI->memoperands_empty()) {
	MemRefs.clear();
	return;
	}
	MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
	}
	}

	/// Replace the collected tag stores with equivalent code inserted at
	/// \p InsertI, then erase the originals.
	///
	/// Regions smaller than kSetTagLoopThreshold bytes are emitted unrolled;
	/// larger ones as a loop. When \p IsLast is set and the instruction at
	/// \p InsertI is a mergeable update of the frame register, that update is
	/// folded into the loop and erased (advancing \p InsertI past it). Nothing is
	/// changed when no instructions were collected, or when only a single
	/// instruction was collected and there is no register update to fold.
	/// \p TFI is used to resolve the region's frame offset.
	void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
	                            const AArch64FrameLowering *TFI, bool IsLast) {
	  if (TagStores.empty())
	    return;
	  TagStoreInstr &FirstTagStore = TagStores[0];
	  TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
	  Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
	  DL = TagStores[0].MI->getDebugLoc();

	  Register Reg;
	  FrameRegOffset = TFI->resolveFrameOffsetReference(
	      *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
	      /*PreferFP=*/false, /*ForSimm=*/true);
	  FrameReg = Reg;
	  FrameRegUpdate = None;

	  mergeMemRefs(TagStores, CombinedMemRefs);

	  LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
	             for (const auto &Instr : TagStores) {
	               dbgs() << " " << *Instr.MI;
	             });

	  // Size threshold where a loop becomes shorter than a linear sequence of
	  // tagging instructions.
	  const int kSetTagLoopThreshold = 176;
	  if (Size < kSetTagLoopThreshold) {
	    if (TagStores.size() < 2)
	      return;
	    emitUnrolled(InsertI);
	  } else {
	    MachineInstr *UpdateInstr = nullptr;
	    int64_t TotalOffset;
	    if (IsLast) {
	      // See if we can merge base register update into the STGloop.
	      // This is done in AArch64LoadStoreOptimizer for "normal" stores,
	      // but STGloop is way too unusual for that, and also it only
	      // realistically happens in function epilogue. Also, STGloop is expanded
	      // before that pass.
	      if (InsertI != MBB->end() &&
	          canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size,
	                            &TotalOffset)) {
	        UpdateInstr = &*InsertI++;
	        LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
	                          << *UpdateInstr);
	      }
	    }

	    if (!UpdateInstr && TagStores.size() < 2)
	      return;

	    if (UpdateInstr) {
	      FrameRegUpdate = TotalOffset;
	      FrameRegUpdateFlags = UpdateInstr->getFlags();
	    }
	    emitLoop(InsertI);
	    if (UpdateInstr)
	      UpdateInstr->eraseFromParent();
	  }

	  for (auto &TS : TagStores)
	    TS.MI->eraseFromParent();
	}

	/// Decide whether \p MI is a stack tag-store instruction that can take part in
	/// merging, and if so describe the region it covers.
	///
	/// Accepted forms are STGloop / STZGloop whose two leading operands are dead
	/// and whose remaining operands are an immediate size and a frame index, and
	/// STGOffset / STZGOffset / ST2GOffset / STZ2GOffset whose operand 0 is SP and
	/// whose operand 1 is a frame index.
	///
	/// \p ZeroData is always written and is true for the zeroing (STZ*) opcodes.
	/// \p Offset is only meaningful when true is returned; \p Size may be written
	/// even on a false return.
	bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
	                                        int64_t &Size, bool &ZeroData) {
	  MachineFunction &MF = *MI.getParent()->getParent();
	  const MachineFrameInfo &MFI = MF.getFrameInfo();

	  unsigned Opcode = MI.getOpcode();
	  ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
	              Opcode == AArch64::STZ2GOffset);

	  if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
	    if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
	      return false;
	    if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
	      return false;
	    Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
	    Size = MI.getOperand(2).getImm();
	    return true;
	  }

	  if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
	    Size = 16;
	  else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
	    Size = 32;
	  else
	    return false;

	  if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
	    return false;

	  // The immediate operand is scaled by 16 bytes.
	  Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
	           16 * MI.getOperand(2).getImm();
	  return true;
	}

	// Detect a run of memory tagging instructions for adjacent stack frame slots,
	// and replace them with a shorter instruction sequence:
	// * replace STG + STG with ST2G
	// * replace STGloop + STGloop with STGloop
	// This code needs to run when stack slot offsets are already known, but before
	// FrameIndex operands in STG instructions are eliminated.
	//
	// II is the instruction to start from. Returns the iterator the caller should
	// continue scanning from: the instruction after II when II starts no
	// mergeable run, the point where the scan stopped when the collected stores
	// overlap, or the position just past the last collected store otherwise.
	// RS is not used by this function.
	MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
	                                                const AArch64FrameLowering *TFI,
	                                                RegScavenger *RS) {
	  bool FirstZeroData;
	  int64_t Size, Offset;
	  MachineInstr &MI = *II;
	  MachineBasicBlock *MBB = MI.getParent();
	  MachineBasicBlock::iterator NextI = ++II;
	  if (&MI == &MBB->instr_back())
	    return II;
	  if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
	    return II;

	  SmallVector<TagStoreInstr, 4> Instrs;
	  Instrs.emplace_back(&MI, Offset, Size);

	  constexpr int kScanLimit = 10;
	  int Count = 0;
	  for (MachineBasicBlock::iterator E = MBB->end();
	       NextI != E && Count < kScanLimit; ++NextI) {
	    MachineInstr &MI = *NextI;
	    bool ZeroData;
	    int64_t Size, Offset;
	    // Collect instructions that update memory tags with a FrameIndex operand
	    // and (when applicable) constant size, and whose output registers are dead
	    // (the latter is almost always the case in practice). Since these
	    // instructions effectively have no inputs or outputs, we are free to skip
	    // any non-aliasing instructions in between without tracking used registers.
	    if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
	      if (ZeroData != FirstZeroData)
	        break;
	      Instrs.emplace_back(&MI, Offset, Size);
	      continue;
	    }

	    // Only count non-transient, non-tagging instructions toward the scan
	    // limit.
	    if (!MI.isTransient())
	      ++Count;

	    // Just in case, stop before the epilogue code starts.
	    if (MI.getFlag(MachineInstr::FrameSetup) ||
	        MI.getFlag(MachineInstr::FrameDestroy))
	      break;

	    // Reject anything that may alias the collected instructions.
	    if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
	      break;
	  }

	  // New code will be inserted after the last tagging instruction we've found.
	  MachineBasicBlock::iterator InsertI = Instrs.back().MI;
	  InsertI++;

	  llvm::stable_sort(Instrs,
	                    [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
	                      return Left.Offset < Right.Offset;
	                    });

	  // Make sure that we don't have any overlapping stores.
	  int64_t CurOffset = Instrs[0].Offset;
	  for (auto &Instr : Instrs) {
	    if (CurOffset > Instr.Offset)
	      return NextI;
	    CurOffset = Instr.Offset + Instr.Size;
	  }

	  // Find contiguous runs of tagged memory and emit shorter instruction
	  // sequences for them when possible.
	  TagStoreEdit TSE(MBB, FirstZeroData);
	  Optional<int64_t> EndOffset;
	  for (auto &Instr : Instrs) {
	    if (EndOffset && *EndOffset != Instr.Offset) {
	      // Found a gap.
	      TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
	      TSE.clear();
	    }

	    TSE.addInstruction(Instr);
	    EndOffset = Instr.Offset + Instr.Size;
	  }

	  TSE.emitCode(InsertI, TFI, /*IsLast = */ true);

	  return InsertI;
	}
	} // namespace

	/// When StackTaggingMergeSetTag is enabled, walk every basic block of \p MF
	/// and let tryMergeAdjacentSTG() process the block, resuming each time from
	/// the iterator it hands back.
	void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
	    MachineFunction &MF, RegScavenger *RS = nullptr) const {
	  if (!StackTaggingMergeSetTag)
	    return;

	  for (auto &Block : MF) {
	    MachineBasicBlock::iterator Cursor = Block.begin();
	    while (Cursor != Block.end())
	      Cursor = tryMergeAdjacentSTG(Cursor, this, RS);
	  }
	}

	/// Resolve frame index \p FI to a register plus offset.
	///
	/// With \p IgnoreSPUpdates set, \p FrameReg becomes SP and the object's
	/// recorded frame offset is returned as is. For Win64 AArch64 EH the offset to
	/// the Unwind object is taken from the SP before the update, which is exactly
	/// the offset set in processFunctionBeforeFrameFinalized. Otherwise the query
	/// is forwarded to getFrameIndexReference().
	int AArch64FrameLowering::getFrameIndexReferencePreferSP(
	    const MachineFunction &MF, int FI, Register &FrameReg,
	    bool IgnoreSPUpdates) const {
	  if (!IgnoreSPUpdates)
	    return getFrameIndexReference(MF, FI, FrameReg);

	  const MachineFrameInfo &MFI = MF.getFrameInfo();
	  const auto SPOffset = MFI.getObjectOffset(FI);
	  LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
	                    << SPOffset << "\n");
	  FrameReg = AArch64::SP;
	  return SPOffset;
	}

	/// Always reports a parent frame offset of zero: the parent frame offset (aka
	/// dispFrame) is only used on X86_64, to retrieve the parent's frame pointer.
	unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
	    const MachineFunction &MF) const {
	  constexpr unsigned NoParentFrameOffset = 0;
	  return NoParentFrameOffset;
	}

	/// Funclets only need to account for space for the callee saved registers,
	/// as the locals are accounted for in the parent's stack frame.
	unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
	const MachineFunction &MF) const {
	// This is the size of the pushed CSRs.
	unsigned CSSize =
	MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
	// This is the amount of stack a funclet needs to allocate.
	return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
	getStackAlign());
	}
	diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h
	index 444740cb50ab..1ca8c3e9e2bf 100644
	--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h
	+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h
	@@ -1,128 +1,135 @@
	//==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	//
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
	#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H

	#include "AArch64StackOffset.h"
	#include "llvm/CodeGen/TargetFrameLowering.h"

	namespace llvm {

	+class MCCFIInstruction;
	+
	/// AArch64 implementation of the TargetFrameLowering interface.
	class AArch64FrameLowering : public TargetFrameLowering {
	public:
	// Configures the base class for a downward-growing stack; the remaining
	// arguments are passed through to the TargetFrameLowering constructor.
	explicit AArch64FrameLowering()
	: TargetFrameLowering(StackGrowsDown, Align(16), 0, Align(16),
	true /*StackRealignable*/) {}

	void
	emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MBBI) const override;

	MachineBasicBlock::iterator
	eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
	MachineBasicBlock::iterator I) const override;

	/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
	/// the function.
	void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
	void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;

	bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;

	int getFrameIndexReference(const MachineFunction &MF, int FI,
	Register &FrameReg) const override;
	StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI,
	Register &FrameReg, bool PreferFP,
	bool ForSimm) const;
	StackOffset resolveFrameOffsetReference(const MachineFunction &MF,
	int64_t ObjectOffset, bool isFixed,
	bool isSVE, Register &FrameReg,
	bool PreferFP, bool ForSimm) const;
	bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI,
	ArrayRef<CalleeSavedInfo> CSI,
	const TargetRegisterInfo *TRI) const override;

	bool
	restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI,
	MutableArrayRef<CalleeSavedInfo> CSI,
	const TargetRegisterInfo *TRI) const override;

	/// Can this function use the red zone for local allocations.
	bool canUseRedZone(const MachineFunction &MF) const;

	bool hasFP(const MachineFunction &MF) const override;
	bool hasReservedCallFrame(const MachineFunction &MF) const override;

	/// Determines the callee-saved registers to spill for \p MF (into
	/// \p SavedRegs) and records the callee-save area sizes.
	void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
	RegScavenger *RS) const override;

	/// Returns true if the target will correctly handle shrink wrapping.
	bool enableShrinkWrapping(const MachineFunction &MF) const override {
	return true;
	}

	bool enableStackSlotScavenging(const MachineFunction &MF) const override;
	TargetStackID::Value getStackIDForScalableVectors() const override;

	void processFunctionBeforeFrameFinalized(MachineFunction &MF,
	RegScavenger *RS) const override;

	void
	processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
	RegScavenger *RS) const override;

	unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;

	unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;

	int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
	Register &FrameReg,
	bool IgnoreSPUpdates) const override;
	int getNonLocalFrameIndexReference(const MachineFunction &MF,
	int FI) const override;
	int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const;

	/// Only the Default, SVEVector and NoAlloc stack IDs are supported.
	bool isSupportedStackID(TargetStackID::Value ID) const override {
	switch (ID) {
	default:
	return false;
	case TargetStackID::Default:
	case TargetStackID::SVEVector:
	case TargetStackID::NoAlloc:
	return true;
	}
	}

	bool isStackIdSafeForLocalArea(unsigned StackId) const override {
	// We don't support putting SVE objects into the pre-allocated local
	// frame block at the moment.
	return StackId != TargetStackID::SVEVector;
	}

	private:
	bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
	uint64_t StackBumpBytes) const;

	// Both return the size of the SVE stack area; only the "assign" variant
	// writes object offsets and reports the SVE callee-save frame index range.
	int64_t estimateSVEStackObjectOffsets(MachineFrameInfo &MF) const;
	int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF,
	int &MinCSFrameIndex,
	int &MaxCSFrameIndex) const;
	+ MCCFIInstruction
	+ createDefCFAExpressionFromSP(const TargetRegisterInfo &TRI,
	+ const StackOffset &OffsetFromSP) const;
	+ MCCFIInstruction createCfaOffset(const TargetRegisterInfo &MRI, unsigned DwarfReg,
	+ const StackOffset &OffsetFromDefCFA) const;
	bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB,
	unsigned StackBumpBytes) const;
	};

	} // End llvm namespace

	#endif
	diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
	index 1500da2fdfc7..45bfa85bdc07 100644
	--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
	+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
	@@ -1,15177 +1,15187 @@
	//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the AArch64TargetLowering class.
	//
	//===----------------------------------------------------------------------===//

	#include "AArch64ISelLowering.h"
	#include "AArch64CallingConvention.h"
	#include "AArch64ExpandImm.h"
	#include "AArch64MachineFunctionInfo.h"
	#include "AArch64PerfectShuffle.h"
	#include "AArch64RegisterInfo.h"
	#include "AArch64Subtarget.h"
	#include "MCTargetDesc/AArch64AddressingModes.h"
	#include "Utils/AArch64BaseInfo.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/Analysis/VectorUtils.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetCallingConv.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GetElementPtrTypeIterator.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/IntrinsicsAArch64.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/OperandTraits.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Use.h"
	#include "llvm/IR/Value.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MachineValueType.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <bitset>
	#include <cassert>
	#include <cctype>
	#include <cstdint>
	#include <cstdlib>
	#include <iterator>
	#include <limits>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;
	using namespace llvm::PatternMatch;

	#define DEBUG_TYPE "aarch64-lower"

	STATISTIC(NumTailCalls, "Number of tail calls");
	STATISTIC(NumShiftInserts, "Number of vector shift inserts");
	STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

	// FIXME: The necessary dtprel relocations don't seem to be supported
	// well in the GNU bfd and gold linkers at the moment. Therefore, by
	// default, for now, fall back to GeneralDynamic code generation.
	//
	// Hidden command-line flag "aarch64-elf-ldtls-generation"; it is off unless
	// explicitly enabled (cl::init(false)).
	cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
	"aarch64-elf-ldtls-generation", cl::Hidden,
	cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
	cl::init(false));

	// Hidden command-line flag "aarch64-enable-logical-imm"; it is on unless
	// explicitly disabled (cl::init(true)). Per its description it gates the
	// AArch64 logical immediate instruction optimization.
	static cl::opt<bool>
	EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
	cl::desc("Enable AArch64 logical imm instruction "
	"optimization"),
	cl::init(true));

	/// Value type used for condition codes.
	static const MVT MVT_CC = MVT::i32;

	/// Returns true if VT's elements occupy the lowest bit positions of its
	/// associated register class without any intervening space.
	///
	/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
	/// same register class, but only nxv8f16 can be treated as a packed vector.
	///
	/// Every fixed-length vector is reported as packed. Any other vector is
	/// packed only when its known minimum size in bits equals
	/// AArch64::SVEBitsPerBlock.
	///
	/// \param VT  The type to test; must be a vector type that is legal for the
	///            target (checked by the assertion below).
	/// \param DAG Used only to reach the target lowering info for the legality
	///            check.
	static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
	  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
	         "Expected legal vector type!");
	  // The logical-or operator had been mangled into an escaped "\|\|" sequence,
	  // which is not valid C++; it is restored here.
	  return VT.isFixedLengthVector() ||
	         VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
	}

	AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
	const AArch64Subtarget &STI)
	: TargetLowering(TM), Subtarget(&STI) {
	// AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
	// we have to make something up. Arbitrarily, choose ZeroOrOne.
	setBooleanContents(ZeroOrOneBooleanContent);
	// When comparing vectors the result sets the different elements in the
	// vector to all-one or all-zero.
	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

	// Set up the register classes.
	addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
	addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

	if (Subtarget->hasFPARMv8()) {
	addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
	addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
	addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
	addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
	addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
	}

	if (Subtarget->hasNEON()) {
	addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
	addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
	// Someone set us up the NEON.
	addDRTypeForNEON(MVT::v2f32);
	addDRTypeForNEON(MVT::v8i8);
	addDRTypeForNEON(MVT::v4i16);
	addDRTypeForNEON(MVT::v2i32);
	addDRTypeForNEON(MVT::v1i64);
	addDRTypeForNEON(MVT::v1f64);
	addDRTypeForNEON(MVT::v4f16);
	addDRTypeForNEON(MVT::v4bf16);

	addQRTypeForNEON(MVT::v4f32);
	addQRTypeForNEON(MVT::v2f64);
	addQRTypeForNEON(MVT::v16i8);
	addQRTypeForNEON(MVT::v8i16);
	addQRTypeForNEON(MVT::v4i32);
	addQRTypeForNEON(MVT::v2i64);
	addQRTypeForNEON(MVT::v8f16);
	addQRTypeForNEON(MVT::v8bf16);
	}

	if (Subtarget->hasSVE()) {
	// Add legal sve predicate types
	addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
	addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
	addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
	addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);

	// Add legal sve data types
	addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
	addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
	addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
	addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);

	addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
	addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
	addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
	addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
	addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
	addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);

	if (Subtarget->hasBF16()) {
	addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
	addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
	addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
	}

	if (useSVEForFixedLengthVectors()) {
	for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
	if (useSVEForFixedLengthVectorVT(VT))
	addRegisterClass(VT, &AArch64::ZPRRegClass);

	for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
	if (useSVEForFixedLengthVectorVT(VT))
	addRegisterClass(VT, &AArch64::ZPRRegClass);
	}

	for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
	setOperationAction(ISD::SADDSAT, VT, Legal);
	setOperationAction(ISD::UADDSAT, VT, Legal);
	setOperationAction(ISD::SSUBSAT, VT, Legal);
	setOperationAction(ISD::USUBSAT, VT, Legal);
	setOperationAction(ISD::UREM, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::SDIVREM, VT, Expand);
	setOperationAction(ISD::UDIVREM, VT, Expand);
	}

	for (auto VT :
	{ MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
	MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
	setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);

	for (auto VT :
	{ MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32,
	MVT::nxv2f64 }) {
	setCondCodeAction(ISD::SETO, VT, Expand);
	setCondCodeAction(ISD::SETOLT, VT, Expand);
	setCondCodeAction(ISD::SETOLE, VT, Expand);
	setCondCodeAction(ISD::SETULT, VT, Expand);
	setCondCodeAction(ISD::SETULE, VT, Expand);
	setCondCodeAction(ISD::SETUGE, VT, Expand);
	setCondCodeAction(ISD::SETUGT, VT, Expand);
	setCondCodeAction(ISD::SETUEQ, VT, Expand);
	setCondCodeAction(ISD::SETUNE, VT, Expand);
	}
	}

	// Compute derived properties from the register classes
	computeRegisterProperties(Subtarget->getRegisterInfo());

	// Provide all sorts of operation actions
	setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
	setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
	setOperationAction(ISD::SETCC, MVT::i32, Custom);
	setOperationAction(ISD::SETCC, MVT::i64, Custom);
	setOperationAction(ISD::SETCC, MVT::f16, Custom);
	setOperationAction(ISD::SETCC, MVT::f32, Custom);
	setOperationAction(ISD::SETCC, MVT::f64, Custom);
	setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
	setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
	setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
	setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
	setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
	setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
	setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
	setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
	setOperationAction(ISD::BRCOND, MVT::Other, Expand);
	setOperationAction(ISD::BR_CC, MVT::i32, Custom);
	setOperationAction(ISD::BR_CC, MVT::i64, Custom);
	setOperationAction(ISD::BR_CC, MVT::f16, Custom);
	setOperationAction(ISD::BR_CC, MVT::f32, Custom);
	setOperationAction(ISD::BR_CC, MVT::f64, Custom);
	setOperationAction(ISD::SELECT, MVT::i32, Custom);
	setOperationAction(ISD::SELECT, MVT::i64, Custom);
	setOperationAction(ISD::SELECT, MVT::f16, Custom);
	setOperationAction(ISD::SELECT, MVT::f32, Custom);
	setOperationAction(ISD::SELECT, MVT::f64, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
	setOperationAction(ISD::BR_JT, MVT::Other, Custom);
	setOperationAction(ISD::JumpTable, MVT::i64, Custom);

	setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
	setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
	setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

	setOperationAction(ISD::FREM, MVT::f32, Expand);
	setOperationAction(ISD::FREM, MVT::f64, Expand);
	setOperationAction(ISD::FREM, MVT::f80, Expand);

	setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

	// Custom lowering hooks are needed for XOR
	// to fold it into CSINC/CSINV.
	setOperationAction(ISD::XOR, MVT::i32, Custom);
	setOperationAction(ISD::XOR, MVT::i64, Custom);

	// Virtually no operation on f128 is legal, but LLVM can't expand them when
	// there's a valid register class, so we need custom operations in most cases.
	setOperationAction(ISD::FABS, MVT::f128, Expand);
	setOperationAction(ISD::FADD, MVT::f128, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
	setOperationAction(ISD::FCOS, MVT::f128, Expand);
	setOperationAction(ISD::FDIV, MVT::f128, Custom);
	setOperationAction(ISD::FMA, MVT::f128, Expand);
	setOperationAction(ISD::FMUL, MVT::f128, Custom);
	setOperationAction(ISD::FNEG, MVT::f128, Expand);
	setOperationAction(ISD::FPOW, MVT::f128, Expand);
	setOperationAction(ISD::FREM, MVT::f128, Expand);
	setOperationAction(ISD::FRINT, MVT::f128, Expand);
	setOperationAction(ISD::FSIN, MVT::f128, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
	setOperationAction(ISD::FSQRT, MVT::f128, Expand);
	setOperationAction(ISD::FSUB, MVT::f128, Custom);
	setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
	setOperationAction(ISD::SETCC, MVT::f128, Custom);
	setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
	setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
	setOperationAction(ISD::BR_CC, MVT::f128, Custom);
	setOperationAction(ISD::SELECT, MVT::f128, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
	setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);

	// Lowering for many of the conversions is actually specified by the non-f128
	// type. The LowerXXX function will be trivial when f128 isn't involved.
	setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
	setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
	setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
	setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
	setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
	setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
	setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
	setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
	setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
	setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);

	// Variable arguments.
	setOperationAction(ISD::VASTART, MVT::Other, Custom);
	setOperationAction(ISD::VAARG, MVT::Other, Custom);
	setOperationAction(ISD::VACOPY, MVT::Other, Custom);
	setOperationAction(ISD::VAEND, MVT::Other, Expand);

	// Variable-sized objects.
	setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

	if (Subtarget->isTargetWindows())
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
	else
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

	// Constant pool entries
	setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

	// BlockAddress
	setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

	// Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
	setOperationAction(ISD::ADDC, MVT::i32, Custom);
	setOperationAction(ISD::ADDE, MVT::i32, Custom);
	setOperationAction(ISD::SUBC, MVT::i32, Custom);
	setOperationAction(ISD::SUBE, MVT::i32, Custom);
	setOperationAction(ISD::ADDC, MVT::i64, Custom);
	setOperationAction(ISD::ADDE, MVT::i64, Custom);
	setOperationAction(ISD::SUBC, MVT::i64, Custom);
	setOperationAction(ISD::SUBE, MVT::i64, Custom);

	// AArch64 lacks both left-rotate and popcount instructions.
	setOperationAction(ISD::ROTL, MVT::i32, Expand);
	setOperationAction(ISD::ROTL, MVT::i64, Expand);
	for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
	setOperationAction(ISD::ROTL, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);
	}

	// AArch64 doesn't have i32 MULH{S|U}.
	setOperationAction(ISD::MULHU, MVT::i32, Expand);
	setOperationAction(ISD::MULHS, MVT::i32, Expand);

	// AArch64 doesn't have {U|S}MUL_LOHI.
	setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
	setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

	setOperationAction(ISD::CTPOP, MVT::i32, Custom);
	setOperationAction(ISD::CTPOP, MVT::i64, Custom);
	setOperationAction(ISD::CTPOP, MVT::i128, Custom);

	setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
	setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
	for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
	setOperationAction(ISD::SDIVREM, VT, Expand);
	setOperationAction(ISD::UDIVREM, VT, Expand);
	}
	setOperationAction(ISD::SREM, MVT::i32, Expand);
	setOperationAction(ISD::SREM, MVT::i64, Expand);
	setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
	setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
	setOperationAction(ISD::UREM, MVT::i32, Expand);
	setOperationAction(ISD::UREM, MVT::i64, Expand);

	// Custom lower Add/Sub/Mul with overflow.
	setOperationAction(ISD::SADDO, MVT::i32, Custom);
	setOperationAction(ISD::SADDO, MVT::i64, Custom);
	setOperationAction(ISD::UADDO, MVT::i32, Custom);
	setOperationAction(ISD::UADDO, MVT::i64, Custom);
	setOperationAction(ISD::SSUBO, MVT::i32, Custom);
	setOperationAction(ISD::SSUBO, MVT::i64, Custom);
	setOperationAction(ISD::USUBO, MVT::i32, Custom);
	setOperationAction(ISD::USUBO, MVT::i64, Custom);
	setOperationAction(ISD::SMULO, MVT::i32, Custom);
	setOperationAction(ISD::SMULO, MVT::i64, Custom);
	setOperationAction(ISD::UMULO, MVT::i32, Custom);
	setOperationAction(ISD::UMULO, MVT::i64, Custom);

	setOperationAction(ISD::FSIN, MVT::f32, Expand);
	setOperationAction(ISD::FSIN, MVT::f64, Expand);
	setOperationAction(ISD::FCOS, MVT::f32, Expand);
	setOperationAction(ISD::FCOS, MVT::f64, Expand);
	setOperationAction(ISD::FPOW, MVT::f32, Expand);
	setOperationAction(ISD::FPOW, MVT::f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
	if (Subtarget->hasFullFP16())
	setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
	else
	setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);

	setOperationAction(ISD::FREM, MVT::f16, Promote);
	setOperationAction(ISD::FREM, MVT::v4f16, Expand);
	setOperationAction(ISD::FREM, MVT::v8f16, Expand);
	setOperationAction(ISD::FPOW, MVT::f16, Promote);
	setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
	setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
	setOperationAction(ISD::FPOWI, MVT::f16, Promote);
	setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
	setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
	setOperationAction(ISD::FCOS, MVT::f16, Promote);
	setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
	setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
	setOperationAction(ISD::FSIN, MVT::f16, Promote);
	setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
	setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
	setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
	setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
	setOperationAction(ISD::FEXP, MVT::f16, Promote);
	setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
	setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
	setOperationAction(ISD::FEXP2, MVT::f16, Promote);
	setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
	setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
	setOperationAction(ISD::FLOG, MVT::f16, Promote);
	setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
	setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
	setOperationAction(ISD::FLOG2, MVT::f16, Promote);
	setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
	setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
	setOperationAction(ISD::FLOG10, MVT::f16, Promote);
	setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
	setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);

	if (!Subtarget->hasFullFP16()) {
	setOperationAction(ISD::SELECT, MVT::f16, Promote);
	setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
	setOperationAction(ISD::SETCC, MVT::f16, Promote);
	setOperationAction(ISD::BR_CC, MVT::f16, Promote);
	setOperationAction(ISD::FADD, MVT::f16, Promote);
	setOperationAction(ISD::FSUB, MVT::f16, Promote);
	setOperationAction(ISD::FMUL, MVT::f16, Promote);
	setOperationAction(ISD::FDIV, MVT::f16, Promote);
	setOperationAction(ISD::FMA, MVT::f16, Promote);
	setOperationAction(ISD::FNEG, MVT::f16, Promote);
	setOperationAction(ISD::FABS, MVT::f16, Promote);
	setOperationAction(ISD::FCEIL, MVT::f16, Promote);
	setOperationAction(ISD::FSQRT, MVT::f16, Promote);
	setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
	setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
	setOperationAction(ISD::FRINT, MVT::f16, Promote);
	setOperationAction(ISD::FROUND, MVT::f16, Promote);
	setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
	setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
	setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
	setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
	setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);

	// promote v4f16 to v4f32 when that is known to be safe.
	setOperationAction(ISD::FADD, MVT::v4f16, Promote);
	setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
	setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
	setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
	AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);

	setOperationAction(ISD::FABS, MVT::v4f16, Expand);
	setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
	setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
	setOperationAction(ISD::FMA, MVT::v4f16, Expand);
	setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
	setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
	setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
	setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
	setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
	setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);

	setOperationAction(ISD::FABS, MVT::v8f16, Expand);
	setOperationAction(ISD::FADD, MVT::v8f16, Expand);
	setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
	setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
	setOperationAction(ISD::FMA, MVT::v8f16, Expand);
	setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
	setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
	setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
	setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
	setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
	setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
	setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
	setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
	setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
	setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
	}

	// AArch64 has implementations of a lot of rounding-like FP operations.
	for (MVT Ty : {MVT::f32, MVT::f64}) {
	setOperationAction(ISD::FFLOOR, Ty, Legal);
	setOperationAction(ISD::FNEARBYINT, Ty, Legal);
	setOperationAction(ISD::FCEIL, Ty, Legal);
	setOperationAction(ISD::FRINT, Ty, Legal);
	setOperationAction(ISD::FTRUNC, Ty, Legal);
	setOperationAction(ISD::FROUND, Ty, Legal);
	setOperationAction(ISD::FMINNUM, Ty, Legal);
	setOperationAction(ISD::FMAXNUM, Ty, Legal);
	setOperationAction(ISD::FMINIMUM, Ty, Legal);
	setOperationAction(ISD::FMAXIMUM, Ty, Legal);
	setOperationAction(ISD::LROUND, Ty, Legal);
	setOperationAction(ISD::LLROUND, Ty, Legal);
	setOperationAction(ISD::LRINT, Ty, Legal);
	setOperationAction(ISD::LLRINT, Ty, Legal);
	}

	if (Subtarget->hasFullFP16()) {
	setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
	setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
	setOperationAction(ISD::FCEIL, MVT::f16, Legal);
	setOperationAction(ISD::FRINT, MVT::f16, Legal);
	setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
	setOperationAction(ISD::FROUND, MVT::f16, Legal);
	setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
	setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
	setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
	}

	setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

	setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

	setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

	// 128-bit loads and stores can be done without expanding
	setOperationAction(ISD::LOAD, MVT::i128, Custom);
	setOperationAction(ISD::STORE, MVT::i128, Custom);

	// 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
	// custom lowering, as there are no un-paired non-temporal stores and
	// legalization will break up 256 bit inputs.
	setOperationAction(ISD::STORE, MVT::v32i8, Custom);
	setOperationAction(ISD::STORE, MVT::v16i16, Custom);
	setOperationAction(ISD::STORE, MVT::v16f16, Custom);
	setOperationAction(ISD::STORE, MVT::v8i32, Custom);
	setOperationAction(ISD::STORE, MVT::v8f32, Custom);
	setOperationAction(ISD::STORE, MVT::v4f64, Custom);
	setOperationAction(ISD::STORE, MVT::v4i64, Custom);

	// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
	// This requires the Performance Monitors extension.
	if (Subtarget->hasPerfMon())
	setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

	if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
	getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
	// Issue __sincos_stret if available.
	setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
	setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
	} else {
	setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
	}

	if (Subtarget->getTargetTriple().isOSMSVCRT()) {
	// MSVCRT doesn't have powi; fall back to pow
	setLibcallName(RTLIB::POWI_F32, nullptr);
	setLibcallName(RTLIB::POWI_F64, nullptr);
	}

	// Make floating-point constants legal for the large code model, so they don't
	// become loads from the constant pool.
	if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
	setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
	setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
	}

	// AArch64 does not have floating-point extending loads, i1 sign-extending
	// load, floating-point truncating stores, or v2i32->v2i16 truncating store.
	for (MVT VT : MVT::fp_valuetypes()) {
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
	}
	for (MVT VT : MVT::integer_valuetypes())
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

	setTruncStoreAction(MVT::f32, MVT::f16, Expand);
	setTruncStoreAction(MVT::f64, MVT::f32, Expand);
	setTruncStoreAction(MVT::f64, MVT::f16, Expand);
	setTruncStoreAction(MVT::f128, MVT::f80, Expand);
	setTruncStoreAction(MVT::f128, MVT::f64, Expand);
	setTruncStoreAction(MVT::f128, MVT::f32, Expand);
	setTruncStoreAction(MVT::f128, MVT::f16, Expand);

	setOperationAction(ISD::BITCAST, MVT::i16, Custom);
	setOperationAction(ISD::BITCAST, MVT::f16, Custom);
	setOperationAction(ISD::BITCAST, MVT::bf16, Custom);

	// Indexed loads and stores are supported.
	for (unsigned im = (unsigned)ISD::PRE_INC;
	im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
	setIndexedLoadAction(im, MVT::i8, Legal);
	setIndexedLoadAction(im, MVT::i16, Legal);
	setIndexedLoadAction(im, MVT::i32, Legal);
	setIndexedLoadAction(im, MVT::i64, Legal);
	setIndexedLoadAction(im, MVT::f64, Legal);
	setIndexedLoadAction(im, MVT::f32, Legal);
	setIndexedLoadAction(im, MVT::f16, Legal);
	setIndexedLoadAction(im, MVT::bf16, Legal);
	setIndexedStoreAction(im, MVT::i8, Legal);
	setIndexedStoreAction(im, MVT::i16, Legal);
	setIndexedStoreAction(im, MVT::i32, Legal);
	setIndexedStoreAction(im, MVT::i64, Legal);
	setIndexedStoreAction(im, MVT::f64, Legal);
	setIndexedStoreAction(im, MVT::f32, Legal);
	setIndexedStoreAction(im, MVT::f16, Legal);
	setIndexedStoreAction(im, MVT::bf16, Legal);
	}

	// Trap.
	setOperationAction(ISD::TRAP, MVT::Other, Legal);
	if (Subtarget->isTargetWindows())
	setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

	// We combine OR nodes for bitfield operations.
	setTargetDAGCombine(ISD::OR);
	// Try to create BICs for vector ANDs.
	setTargetDAGCombine(ISD::AND);

	// Vector add and sub nodes may conceal a high-half opportunity.
	// Also, try to fold ADD into CSINC/CSINV.
	setTargetDAGCombine(ISD::ADD);
	setTargetDAGCombine(ISD::SUB);
	setTargetDAGCombine(ISD::SRL);
	setTargetDAGCombine(ISD::XOR);
	setTargetDAGCombine(ISD::SINT_TO_FP);
	setTargetDAGCombine(ISD::UINT_TO_FP);

	setTargetDAGCombine(ISD::FP_TO_SINT);
	setTargetDAGCombine(ISD::FP_TO_UINT);
	setTargetDAGCombine(ISD::FDIV);

	setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

	setTargetDAGCombine(ISD::ANY_EXTEND);
	setTargetDAGCombine(ISD::ZERO_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
	setTargetDAGCombine(ISD::CONCAT_VECTORS);
	setTargetDAGCombine(ISD::STORE);
	if (Subtarget->supportsAddressTopByteIgnored())
	setTargetDAGCombine(ISD::LOAD);

	setTargetDAGCombine(ISD::MUL);

	setTargetDAGCombine(ISD::SELECT);
	setTargetDAGCombine(ISD::VSELECT);

	setTargetDAGCombine(ISD::INTRINSIC_VOID);
	setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
	setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

	setTargetDAGCombine(ISD::GlobalAddress);

	// In case of strict alignment, avoid an excessive number of byte wide stores.
	MaxStoresPerMemsetOptSize = 8;
	MaxStoresPerMemset = Subtarget->requiresStrictAlign()
	? MaxStoresPerMemsetOptSize : 32;

	MaxGluedStoresPerMemcpy = 4;
	MaxStoresPerMemcpyOptSize = 4;
	MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
	? MaxStoresPerMemcpyOptSize : 16;

	MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;

	MaxLoadsPerMemcmpOptSize = 4;
	MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
	? MaxLoadsPerMemcmpOptSize : 8;

	setStackPointerRegisterToSaveRestore(AArch64::SP);

	setSchedulingPreference(Sched::Hybrid);

	EnableExtLdPromotion = true;

	// Set required alignment.
	setMinFunctionAlignment(Align(4));
	// Set preferred alignments.
	setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
	setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));

	// Only change the limit for entries in a jump table if specified by
	// the sub target, but not at the command line.
	unsigned MaxJT = STI.getMaximumJumpTableSize();
	if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
	setMaximumJumpTableSize(MaxJT);

	setHasExtractBitsInsn(true);

	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

	if (Subtarget->hasNEON()) {
	// FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
	// silliness like this:
	setOperationAction(ISD::FABS, MVT::v1f64, Expand);
	setOperationAction(ISD::FADD, MVT::v1f64, Expand);
	setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
	setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
	setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
	setOperationAction(ISD::FMA, MVT::v1f64, Expand);
	setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
	setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
	setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
	setOperationAction(ISD::FREM, MVT::v1f64, Expand);
	setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
	setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
	setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
	setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
	setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
	setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
	setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
	setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
	setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);

	setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
	setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
	setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
	setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
	setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);

	setOperationAction(ISD::MUL, MVT::v1i64, Expand);

	// AArch64 doesn't have a direct vector ->f32 conversion instructions for
	// elements smaller than i32, so promote the input to i32 first.
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
	// i8 vector elements also need promotion to i32 for v8i8
	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
	// Similarly, there is no direct i32 -> f64 vector conversion instruction.
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
	// Or, direct i32 -> f16 vector conversion. Set it to custom, so the
	// conversion happens in two steps: v4i32 -> v4f32 -> v4f16
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);

	if (Subtarget->hasFullFP16()) {
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
	} else {
	// when AArch64 doesn't have fullfp16 support, promote the input
	// to i32 first.
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
	}

	setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
	setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

	// AArch64 doesn't have MUL.2d:
	setOperationAction(ISD::MUL, MVT::v2i64, Expand);
	// Custom handling for some quad-vector types to detect MULL.
	setOperationAction(ISD::MUL, MVT::v8i16, Custom);
	setOperationAction(ISD::MUL, MVT::v4i32, Custom);
	setOperationAction(ISD::MUL, MVT::v2i64, Custom);

	for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
	MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	// Vector reductions
	setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
	setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
	setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
	setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
	setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);

	// Saturates
	setOperationAction(ISD::SADDSAT, VT, Legal);
	setOperationAction(ISD::UADDSAT, VT, Legal);
	setOperationAction(ISD::SSUBSAT, VT, Legal);
	setOperationAction(ISD::USUBSAT, VT, Legal);

	setOperationAction(ISD::TRUNCATE, VT, Custom);
	}
	for (MVT VT : { MVT::v4f16, MVT::v2f32,
	MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
	setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
	setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
	}

	setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
	setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
	// Likewise, narrowing and extending vector loads/stores aren't handled
	// directly.
	for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
	setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

	if (VT == MVT::v16i8 \|\| VT == MVT::v8i16 \|\| VT == MVT::v4i32) {
	setOperationAction(ISD::MULHS, VT, Legal);
	setOperationAction(ISD::MULHU, VT, Legal);
	} else {
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	}
	setOperationAction(ISD::SMUL_LOHI, VT, Expand);
	setOperationAction(ISD::UMUL_LOHI, VT, Expand);

	setOperationAction(ISD::BSWAP, VT, Expand);
	setOperationAction(ISD::CTTZ, VT, Expand);

	for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
	setTruncStoreAction(VT, InnerVT, Expand);
	setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
	}
	}

	// AArch64 has implementations of a lot of rounding-like FP operations.
	for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
	setOperationAction(ISD::FFLOOR, Ty, Legal);
	setOperationAction(ISD::FNEARBYINT, Ty, Legal);
	setOperationAction(ISD::FCEIL, Ty, Legal);
	setOperationAction(ISD::FRINT, Ty, Legal);
	setOperationAction(ISD::FTRUNC, Ty, Legal);
	setOperationAction(ISD::FROUND, Ty, Legal);
	}

	if (Subtarget->hasFullFP16()) {
	for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
	setOperationAction(ISD::FFLOOR, Ty, Legal);
	setOperationAction(ISD::FNEARBYINT, Ty, Legal);
	setOperationAction(ISD::FCEIL, Ty, Legal);
	setOperationAction(ISD::FRINT, Ty, Legal);
	setOperationAction(ISD::FTRUNC, Ty, Legal);
	setOperationAction(ISD::FROUND, Ty, Legal);
	}
	}

	if (Subtarget->hasSVE())
	setOperationAction(ISD::VSCALE, MVT::i32, Custom);

	setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
	}

	if (Subtarget->hasSVE()) {
	// FIXME: Add custom lowering of MLOAD to handle different passthrus (not a
	// splat of 0 or undef) once vector selects supported in SVE codegen. See
	// D68877 for more details.
	for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
	if (isTypeLegal(VT)) {
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
	setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::SDIV, VT, Custom);
	setOperationAction(ISD::UDIV, VT, Custom);
	setOperationAction(ISD::SMIN, VT, Custom);
	setOperationAction(ISD::UMIN, VT, Custom);
	setOperationAction(ISD::SMAX, VT, Custom);
	setOperationAction(ISD::UMAX, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	if (VT.getScalarType() == MVT::i1) {
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::TRUNCATE, VT, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
	}
	}
	}

	for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32})
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);

	for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
	if (isTypeLegal(VT)) {
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
	setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::FMA, VT, Custom);
	}
	}

	// NOTE: Currently this has to happen after computeRegisterProperties rather
	// than the preferred option of combining it with the addRegisterClass call.
	if (useSVEForFixedLengthVectors()) {
	for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
	if (useSVEForFixedLengthVectorVT(VT))
	addTypeForFixedLengthSVE(VT);
	for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
	if (useSVEForFixedLengthVectorVT(VT))
	addTypeForFixedLengthSVE(VT);

	// 64bit results can mean a bigger than NEON input.
	for (auto VT : {MVT::v8i8, MVT::v4i16})
	setOperationAction(ISD::TRUNCATE, VT, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);

	// 128bit results imply a bigger than NEON input.
	for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
	setOperationAction(ISD::TRUNCATE, VT, Custom);
	for (auto VT : {MVT::v8f16, MVT::v4f32})
	setOperationAction(ISD::FP_ROUND, VT, Expand);
	}
	}

	PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
	}

	/// Register the operation actions (Legal / Custom / Expand / Promote) that
	/// apply to the NEON vector type \p VT.
	///
	/// \param VT                 The vector type being configured; must be a vector.
	/// \param PromotedBitwiseVT  Not referenced anywhere in this body.
	void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
	assert(VT.isVector() && "VT should be a vector type");

	// Floating-point vectors are loaded and stored through the integer vector
	// type with the same element count and element width.
	if (VT.isFloatingPoint()) {
	MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
	setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
	setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
	}

	// Mark vector float intrinsics as expand.
	if (VT == MVT::v2f32 \|\| VT == MVT::v4f32 \|\| VT == MVT::v2f64) {
	setOperationAction(ISD::FSIN, VT, Expand);
	setOperationAction(ISD::FCOS, VT, Expand);
	setOperationAction(ISD::FPOW, VT, Expand);
	setOperationAction(ISD::FLOG, VT, Expand);
	setOperationAction(ISD::FLOG2, VT, Expand);
	setOperationAction(ISD::FLOG10, VT, Expand);
	setOperationAction(ISD::FEXP, VT, Expand);
	setOperationAction(ISD::FEXP2, VT, Expand);

	// But we do support custom-lowering for FCOPYSIGN.
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	}

	// Element access, vector construction, shuffles, shifts, OR and compares
	// are all custom-lowered; CONCAT_VECTORS is marked legal.
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::OR, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);

	// All select flavours are expanded for this type.
	setOperationAction(ISD::SELECT, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	setOperationAction(ISD::VSELECT, VT, Expand);
	// Any-extending loads that use VT as the memory type are expanded for
	// every possible result type.
	for (MVT InnerVT : MVT::all_valuetypes())
	setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

	// CNT supports only B element sizes, then use UADDLP to widen.
	if (VT != MVT::v8i8 && VT != MVT::v16i8)
	setOperationAction(ISD::CTPOP, VT, Custom);

	// Division and remainder (integer and FP remainder) are expanded.
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::FREM, VT, Expand);

	setOperationAction(ISD::FP_TO_SINT, VT, Custom);
	setOperationAction(ISD::FP_TO_UINT, VT, Custom);

	// ABS is marked legal for integer vector types only.
	if (!VT.isFloatingPoint())
	setOperationAction(ISD::ABS, VT, Legal);

	// [SU][MIN\|MAX] are available for all NEON types apart from i64.
	if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
	for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
	setOperationAction(Opcode, VT, Legal);

	// F[MIN\|MAX][NUM\|NAN] are available for all FP NEON types.
	// f16 element vectors additionally require the subtarget's full FP16
	// support.
	if (VT.isFloatingPoint() &&
	(VT.getVectorElementType() != MVT::f16 \|\| Subtarget->hasFullFP16()))
	for (unsigned Opcode :
	{ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
	setOperationAction(Opcode, VT, Legal);

	// On little-endian subtargets every indexed addressing mode from PRE_INC up
	// to (but excluding) LAST_INDEXED_MODE is legal for loads and stores of VT.
	if (Subtarget->isLittleEndian()) {
	for (unsigned im = (unsigned)ISD::PRE_INC;
	im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
	setIndexedLoadAction(im, VT, Legal);
	setIndexedStoreAction(im, VT, Legal);
	}
	}
	}

	/// Configure operation actions for a fixed length vector type \p VT that is
	/// handled through SVE: every builtin opcode is first set to Expand, then a
	/// small set of opcodes is switched to Custom lowering.
	void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
	  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

	  // Start from a blanket "Expand" for the whole builtin opcode range.
	  for (unsigned Opc = 0; Opc != ISD::BUILTIN_OP_END; ++Opc)
	    setOperationAction(Opc, VT, Expand);

	  // Opcodes that get custom lowering. EXTRACT_SUBVECTOR is what "casts" a
	  // scalable vector to a fixed length one; the remaining entries are fixed
	  // length operations that are lowered to their scalable equivalents.
	  for (unsigned Opc : {ISD::EXTRACT_SUBVECTOR, ISD::ADD, ISD::FADD, ISD::LOAD,
	                       ISD::STORE, ISD::TRUNCATE})
	    setOperationAction(Opc, VT, Custom);
	}

	/// Register \p VT in the FPR64 register class and apply the common NEON
	/// operation actions to it, passing v2i32 as the promoted bitwise type.
	void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
	addRegisterClass(VT, &AArch64::FPR64RegClass);
	addTypeForNEON(VT, MVT::v2i32);
	}

	/// Register \p VT in the FPR128 register class and apply the common NEON
	/// operation actions to it, passing v4i32 as the promoted bitwise type.
	void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
	addRegisterClass(VT, &AArch64::FPR128RegClass);
	addTypeForNEON(VT, MVT::v4i32);
	}

	/// Return the value type produced by a SETCC whose operands have type \p VT.
	///
	/// Non-vector operands yield i32. Scalable vectors yield a vector of i1 with
	/// the same element count, and all other vectors yield the integer vector
	/// type obtained by changing the element type of \p VT to integer.
	EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
	                                              LLVMContext &C, EVT VT) const {
	  if (VT.isVector()) {
	    // Scalable vectors compare into a predicate-like vector of i1.
	    if (VT.isScalableVector())
	      return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
	    // Other vectors produce a same-shaped integer mask.
	    return VT.changeVectorElementTypeToInteger();
	  }

	  // Everything that is not a vector compares into an i32.
	  return MVT::i32;
	}

	/// Try to replace the immediate of a logical operation with one that can be
	/// encoded as a logical immediate, by choosing convenient values for the bits
	/// that are not demanded.
	///
	/// \param Op        Node being optimized; its operand 0 is reused unchanged.
	/// \param Size      Width of the operation in bits (the caller in this file
	///                  asserts 32 or 64).
	/// \param Imm       Current immediate value.
	/// \param Demanded  Bits of the result that are actually used.
	/// \param TLO       Combiner state used to create nodes and commit the change.
	/// \param NewOpc    Machine opcode to use when a machine node is created.
	/// \returns false if the immediate is left alone (already all-zeros, all-ones
	///          or a logical immediate, or no suitable replacement was found);
	///          otherwise the result of TLO.CombineTo() for the new node.
	static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
	const APInt &Demanded,
	TargetLowering::TargetLoweringOpt &TLO,
	unsigned NewOpc) {
	uint64_t OldImm = Imm, NewImm, Enc;
	// Mask has the low Size bits set; OrigMask keeps that value because Mask is
	// shrunk as the element size is halved below.
	uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;

	// Return if the immediate is already all zeros, all ones, a bimm32 or a
	// bimm64.
	if (Imm == 0 \|\| Imm == Mask \|\|
	AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
	return false;

	unsigned EltSize = Size;
	uint64_t DemandedBits = Demanded.getZExtValue();

	// Clear bits that are not demanded.
	Imm &= DemandedBits;

	while (true) {
	// The goal here is to set the non-demanded bits in a way that minimizes
	// the number of switching between 0 and 1. In order to achieve this goal,
	// we set the non-demanded bits to the value of the preceding demanded bits.
	// For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
	// non-demanded bit), we copy bit0 (1) to the least significant 'x',
	// bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
	// The final result is 0b11000011.
	uint64_t NonDemandedBits = ~DemandedBits;
	uint64_t InvertedImm = ~Imm & DemandedBits;
	uint64_t RotatedImm =
	((InvertedImm << 1) \| (InvertedImm >> (EltSize - 1) & 1)) &
	NonDemandedBits;
	uint64_t Sum = RotatedImm + NonDemandedBits;
	bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
	uint64_t Ones = (Sum + Carry) & NonDemandedBits;
	NewImm = (Imm \| Ones) & Mask;

	// If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
	// or all-ones or all-zeros, in which case we can stop searching. Otherwise,
	// we halve the element size and continue the search.
	if (isShiftedMask_64(NewImm) \|\| isShiftedMask_64(~(NewImm \| ~Mask)))
	break;

	// We cannot shrink the element size any further if it is 2-bits.
	if (EltSize == 2)
	return false;

	EltSize /= 2;
	Mask >>= EltSize;
	uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;

	// Return if there is mismatch in any of the demanded bits of Imm and Hi.
	if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
	return false;

	// Merge the upper and lower halves of Imm and DemandedBits.
	Imm \|= Hi;
	DemandedBits \|= DemandedBitsHi;
	}

	// Statistic counter: one more immediate was rewritten.
	++NumOptimizedImms;

	// Replicate the element across the register width.
	while (EltSize < Size) {
	NewImm \|= NewImm << EltSize;
	EltSize *= 2;
	}

	// OldImm is only read by the asserts below; the cast silences the unused
	// variable warning when asserts are compiled out.
	(void)OldImm;
	assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
	"demanded bits should never be altered");
	assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");

	// Create the new constant immediate node.
	EVT VT = Op.getValueType();
	SDLoc DL(Op);
	SDValue New;

	// If the new constant immediate is all-zeros or all-ones, let the target
	// independent DAG combine optimize this node.
	if (NewImm == 0 \|\| NewImm == OrigMask) {
	New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
	TLO.DAG.getConstant(NewImm, DL, VT));
	// Otherwise, create a machine node so that target independent DAG combine
	// doesn't undo this optimization.
	} else {
	Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
	SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
	New = SDValue(
	TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
	}

	return TLO.CombineTo(Op, New);
	}

	/// Target hook that tries to rewrite the constant operand of a scalar
	/// AND/OR/XOR so that it becomes a logical immediate, given the demanded bits.
	///
	/// \param Op            Node to inspect; only ISD::AND, ISD::OR and ISD::XOR
	///                      whose operand 1 is a ConstantSDNode are handled.
	/// \param DemandedBits  Bits of the result that are used.
	/// \param DemandedElts  Not referenced in this body.
	/// \param TLO           Combiner state forwarded to optimizeLogicalImm().
	/// \returns false when the node is not handled; otherwise whatever
	///          optimizeLogicalImm() returns.
	bool AArch64TargetLowering::targetShrinkDemandedConstant(
	SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
	TargetLoweringOpt &TLO) const {
	// Delay this optimization to as late as possible.
	if (!TLO.LegalOps)
	return false;

	// The transformation can be switched off through EnableOptimizeLogicalImm.
	if (!EnableOptimizeLogicalImm)
	return false;

	// Vector types are not handled.
	EVT VT = Op.getValueType();
	if (VT.isVector())
	return false;

	unsigned Size = VT.getSizeInBits();
	assert((Size == 32 \|\| Size == 64) &&
	"i32 or i64 is expected after legalization.");

	// Exit early if we demand all bits.
	if (DemandedBits.countPopulation() == Size)
	return false;

	// Pick the immediate-form machine opcode matching the operation and width.
	unsigned NewOpc;
	switch (Op.getOpcode()) {
	default:
	return false;
	case ISD::AND:
	NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
	break;
	case ISD::OR:
	NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
	break;
	case ISD::XOR:
	NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
	break;
	}
	// Only a constant second operand can be rewritten.
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	if (!C)
	return false;
	uint64_t Imm = C->getZExtValue();
	return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
	}

	/// computeKnownBitsForTargetNode - Determine which bits of the value produced
	/// by the target-specific node \p Op are known to be either zero or one and
	/// return them in \p Known.
	///
	/// Handles AArch64ISD::CSEL, AArch64ISD::LOADgot / AArch64ISD::ADDlow (ILP32
	/// only), the ldxr/ldaxr intrinsics and the NEON umaxv/uminv intrinsics; for
	/// every other opcode \p Known is left untouched. \p DemandedElts is not
	/// referenced in this body.
	void AArch64TargetLowering::computeKnownBitsForTargetNode(
	const SDValue Op, KnownBits &Known,
	const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
	switch (Op.getOpcode()) {
	default:
	break;
	case AArch64ISD::CSEL: {
	// The result is one of the first two operands, so only bits known
	// identically in both of them remain known.
	KnownBits Known2;
	Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
	Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
	Known.Zero &= Known2.Zero;
	Known.One &= Known2.One;
	break;
	}
	case AArch64ISD::LOADgot:
	case AArch64ISD::ADDlow: {
	if (!Subtarget->isTargetILP32())
	break;
	// In ILP32 mode all valid pointers are in the low 4GB of the address-space.
	Known.Zero = APInt::getHighBitsSet(64, 32);
	break;
	}
	case ISD::INTRINSIC_W_CHAIN: {
	// Operand 1 carries the intrinsic ID for chained intrinsics.
	ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
	Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
	switch (IntID) {
	default: return;
	case Intrinsic::aarch64_ldaxr:
	case Intrinsic::aarch64_ldxr: {
	// Every bit above the width of the memory type is known to be zero.
	unsigned BitWidth = Known.getBitWidth();
	EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
	unsigned MemBits = VT.getScalarSizeInBits();
	Known.Zero \|= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
	return;
	}
	}
	break;
	}
	case ISD::INTRINSIC_WO_CHAIN:
	case ISD::INTRINSIC_VOID: {
	// The intrinsic ID is read from operand 0 here.
	unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	switch (IntNo) {
	default:
	break;
	case Intrinsic::aarch64_neon_umaxv:
	case Intrinsic::aarch64_neon_uminv: {
	// Figure out the datatype of the vector operand. The UMINV instruction
	// will zero extend the result, so we can mark as known zero all the
	// bits larger than the element datatype. 32-bit or larger doesn't need
	// this as those are legal types and will be handled by isel directly.
	MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
	unsigned BitWidth = Known.getBitWidth();
	if (VT == MVT::v8i8 \|\| VT == MVT::v16i8) {
	assert(BitWidth >= 8 && "Unexpected width!");
	APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
	Known.Zero \|= Mask;
	} else if (VT == MVT::v4i16 \|\| VT == MVT::v8i16) {
	assert(BitWidth >= 16 && "Unexpected width!");
	APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
	Known.Zero \|= Mask;
	}
	break;
	} break;
	}
	}
	}
	}

	/// Return the type used for scalar shift amounts. This is always i64; both
	/// the data layout and the type of the shifted value are ignored.
	MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
	EVT) const {
	return MVT::i64;
	}

	/// Report whether misaligned memory accesses of type \p VT are allowed.
	///
	/// \param VT         Type of the access.
	/// \param AddrSpace  Not referenced in this body.
	/// \param Align      Alignment of the access; only compared against 2.
	/// \param Flags      Not referenced in this body.
	/// \param Fast       Optional out-parameter; when non-null it is set to whether
	///                   the misaligned access is considered fast.
	/// \returns false if the subtarget requires strict alignment (in which case
	///          \p Fast is not written), true otherwise.
	bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
	EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
	bool *Fast) const {
	if (Subtarget->requiresStrictAlign())
	return false;

	if (Fast) {
	// Some CPUs are fine with unaligned stores except for 128-bit ones.
	*Fast = !Subtarget->isMisaligned128StoreSlow() \|\| VT.getStoreSize() != 16 \|\|
	// See comments in performSTORECombine() for more details about
	// these conditions.

	// Code that uses clang vector extensions can mark that it
	// wants unaligned accesses to be treated as fast by
	// underspecifying alignment to be 1 or 2.
	Align <= 2 \|\|

	// Disregard v2i64. Memcpy lowering produces those and splitting
	// them regresses performance on micro-benchmarks and olden/bh.
	VT == MVT::v2i64;
	}
	return true;
	}

	// Same as above but handling LLTs instead.
	//
	// Returns false if the subtarget requires strict alignment (leaving *Fast
	// unwritten) and true otherwise. When Fast is non-null it is set to whether
	// the misaligned access of type Ty is considered fast. AddrSpace and Flags
	// are not referenced in this body.
	bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
	LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
	bool *Fast) const {
	if (Subtarget->requiresStrictAlign())
	return false;

	if (Fast) {
	// Some CPUs are fine with unaligned stores except for 128-bit ones.
	*Fast = !Subtarget->isMisaligned128StoreSlow() \|\|
	Ty.getSizeInBytes() != 16 \|\|
	// See comments in performSTORECombine() for more details about
	// these conditions.

	// Code that uses clang vector extensions can mark that it
	// wants unaligned accesses to be treated as fast by
	// underspecifying alignment to be 1 or 2.
	Alignment <= 2 \|\|

	// Disregard v2i64. Memcpy lowering produces those and splitting
	// them regresses performance on micro-benchmarks and olden/bh.
	Ty == LLT::vector(2, 64);
	}
	return true;
	}

	/// Create the FastISel instance for this target by forwarding both arguments
	/// to AArch64::createFastISel().
	FastISel *
	AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
	const TargetLibraryInfo *libInfo) const {
	return AArch64::createFastISel(funcInfo, libInfo);
	}

	/// Return the name of the AArch64ISD node \p Opcode as a string, or nullptr if
	/// the opcode has no entry in the switch below (this includes
	/// AArch64ISD::FIRST_NUMBER).
	///
	/// The string is produced by stringifying the enumerator as written, so it
	/// includes the "AArch64ISD::" qualifier.
	const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
	// MAKE_CASE(V) expands to a case label for V that returns the spelling of V.
	#define MAKE_CASE(V) \
	case V: \
	return #V;
	switch ((AArch64ISD::NodeType)Opcode) {
	case AArch64ISD::FIRST_NUMBER:
	break;
	MAKE_CASE(AArch64ISD::CALL)
	MAKE_CASE(AArch64ISD::ADRP)
	MAKE_CASE(AArch64ISD::ADR)
	MAKE_CASE(AArch64ISD::ADDlow)
	MAKE_CASE(AArch64ISD::LOADgot)
	MAKE_CASE(AArch64ISD::RET_FLAG)
	MAKE_CASE(AArch64ISD::BRCOND)
	MAKE_CASE(AArch64ISD::CSEL)
	MAKE_CASE(AArch64ISD::FCSEL)
	MAKE_CASE(AArch64ISD::CSINV)
	MAKE_CASE(AArch64ISD::CSNEG)
	MAKE_CASE(AArch64ISD::CSINC)
	MAKE_CASE(AArch64ISD::THREAD_POINTER)
	MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
	MAKE_CASE(AArch64ISD::ADD_PRED)
	MAKE_CASE(AArch64ISD::SDIV_PRED)
	MAKE_CASE(AArch64ISD::UDIV_PRED)
	MAKE_CASE(AArch64ISD::SMIN_MERGE_OP1)
	MAKE_CASE(AArch64ISD::UMIN_MERGE_OP1)
	MAKE_CASE(AArch64ISD::SMAX_MERGE_OP1)
	MAKE_CASE(AArch64ISD::UMAX_MERGE_OP1)
	MAKE_CASE(AArch64ISD::SHL_MERGE_OP1)
	MAKE_CASE(AArch64ISD::SRL_MERGE_OP1)
	MAKE_CASE(AArch64ISD::SRA_MERGE_OP1)
	MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::ADC)
	MAKE_CASE(AArch64ISD::SBC)
	MAKE_CASE(AArch64ISD::ADDS)
	MAKE_CASE(AArch64ISD::SUBS)
	MAKE_CASE(AArch64ISD::ADCS)
	MAKE_CASE(AArch64ISD::SBCS)
	MAKE_CASE(AArch64ISD::ANDS)
	MAKE_CASE(AArch64ISD::CCMP)
	MAKE_CASE(AArch64ISD::CCMN)
	MAKE_CASE(AArch64ISD::FCCMP)
	MAKE_CASE(AArch64ISD::FCMP)
	MAKE_CASE(AArch64ISD::STRICT_FCMP)
	MAKE_CASE(AArch64ISD::STRICT_FCMPE)
	MAKE_CASE(AArch64ISD::DUP)
	MAKE_CASE(AArch64ISD::DUPLANE8)
	MAKE_CASE(AArch64ISD::DUPLANE16)
	MAKE_CASE(AArch64ISD::DUPLANE32)
	MAKE_CASE(AArch64ISD::DUPLANE64)
	MAKE_CASE(AArch64ISD::MOVI)
	MAKE_CASE(AArch64ISD::MOVIshift)
	MAKE_CASE(AArch64ISD::MOVIedit)
	MAKE_CASE(AArch64ISD::MOVImsl)
	MAKE_CASE(AArch64ISD::FMOV)
	MAKE_CASE(AArch64ISD::MVNIshift)
	MAKE_CASE(AArch64ISD::MVNImsl)
	MAKE_CASE(AArch64ISD::BICi)
	MAKE_CASE(AArch64ISD::ORRi)
	MAKE_CASE(AArch64ISD::BSP)
	MAKE_CASE(AArch64ISD::NEG)
	MAKE_CASE(AArch64ISD::EXTR)
	MAKE_CASE(AArch64ISD::ZIP1)
	MAKE_CASE(AArch64ISD::ZIP2)
	MAKE_CASE(AArch64ISD::UZP1)
	MAKE_CASE(AArch64ISD::UZP2)
	MAKE_CASE(AArch64ISD::TRN1)
	MAKE_CASE(AArch64ISD::TRN2)
	MAKE_CASE(AArch64ISD::REV16)
	MAKE_CASE(AArch64ISD::REV32)
	MAKE_CASE(AArch64ISD::REV64)
	MAKE_CASE(AArch64ISD::EXT)
	MAKE_CASE(AArch64ISD::VSHL)
	MAKE_CASE(AArch64ISD::VLSHR)
	MAKE_CASE(AArch64ISD::VASHR)
	MAKE_CASE(AArch64ISD::VSLI)
	MAKE_CASE(AArch64ISD::VSRI)
	MAKE_CASE(AArch64ISD::CMEQ)
	MAKE_CASE(AArch64ISD::CMGE)
	MAKE_CASE(AArch64ISD::CMGT)
	MAKE_CASE(AArch64ISD::CMHI)
	MAKE_CASE(AArch64ISD::CMHS)
	MAKE_CASE(AArch64ISD::FCMEQ)
	MAKE_CASE(AArch64ISD::FCMGE)
	MAKE_CASE(AArch64ISD::FCMGT)
	MAKE_CASE(AArch64ISD::CMEQz)
	MAKE_CASE(AArch64ISD::CMGEz)
	MAKE_CASE(AArch64ISD::CMGTz)
	MAKE_CASE(AArch64ISD::CMLEz)
	MAKE_CASE(AArch64ISD::CMLTz)
	MAKE_CASE(AArch64ISD::FCMEQz)
	MAKE_CASE(AArch64ISD::FCMGEz)
	MAKE_CASE(AArch64ISD::FCMGTz)
	MAKE_CASE(AArch64ISD::FCMLEz)
	MAKE_CASE(AArch64ISD::FCMLTz)
	MAKE_CASE(AArch64ISD::SADDV)
	MAKE_CASE(AArch64ISD::UADDV)
	MAKE_CASE(AArch64ISD::SRHADD)
	MAKE_CASE(AArch64ISD::URHADD)
	MAKE_CASE(AArch64ISD::SMINV)
	MAKE_CASE(AArch64ISD::UMINV)
	MAKE_CASE(AArch64ISD::SMAXV)
	MAKE_CASE(AArch64ISD::UMAXV)
	MAKE_CASE(AArch64ISD::SMAXV_PRED)
	MAKE_CASE(AArch64ISD::UMAXV_PRED)
	MAKE_CASE(AArch64ISD::SMINV_PRED)
	MAKE_CASE(AArch64ISD::UMINV_PRED)
	MAKE_CASE(AArch64ISD::ORV_PRED)
	MAKE_CASE(AArch64ISD::EORV_PRED)
	MAKE_CASE(AArch64ISD::ANDV_PRED)
	MAKE_CASE(AArch64ISD::CLASTA_N)
	MAKE_CASE(AArch64ISD::CLASTB_N)
	MAKE_CASE(AArch64ISD::LASTA)
	MAKE_CASE(AArch64ISD::LASTB)
	MAKE_CASE(AArch64ISD::REV)
	MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
	MAKE_CASE(AArch64ISD::TBL)
	MAKE_CASE(AArch64ISD::FADD_PRED)
	MAKE_CASE(AArch64ISD::FADDA_PRED)
	MAKE_CASE(AArch64ISD::FADDV_PRED)
	MAKE_CASE(AArch64ISD::FMA_PRED)
	MAKE_CASE(AArch64ISD::FMAXV_PRED)
	MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
	MAKE_CASE(AArch64ISD::FMINV_PRED)
	MAKE_CASE(AArch64ISD::FMINNMV_PRED)
	MAKE_CASE(AArch64ISD::NOT)
	MAKE_CASE(AArch64ISD::BIT)
	MAKE_CASE(AArch64ISD::CBZ)
	MAKE_CASE(AArch64ISD::CBNZ)
	MAKE_CASE(AArch64ISD::TBZ)
	MAKE_CASE(AArch64ISD::TBNZ)
	MAKE_CASE(AArch64ISD::TC_RETURN)
	MAKE_CASE(AArch64ISD::PREFETCH)
	MAKE_CASE(AArch64ISD::SITOF)
	MAKE_CASE(AArch64ISD::UITOF)
	MAKE_CASE(AArch64ISD::NVCAST)
	MAKE_CASE(AArch64ISD::SQSHL_I)
	MAKE_CASE(AArch64ISD::UQSHL_I)
	MAKE_CASE(AArch64ISD::SRSHR_I)
	MAKE_CASE(AArch64ISD::URSHR_I)
	MAKE_CASE(AArch64ISD::SQSHLU_I)
	MAKE_CASE(AArch64ISD::WrapperLarge)
	MAKE_CASE(AArch64ISD::LD2post)
	MAKE_CASE(AArch64ISD::LD3post)
	MAKE_CASE(AArch64ISD::LD4post)
	MAKE_CASE(AArch64ISD::ST2post)
	MAKE_CASE(AArch64ISD::ST3post)
	MAKE_CASE(AArch64ISD::ST4post)
	MAKE_CASE(AArch64ISD::LD1x2post)
	MAKE_CASE(AArch64ISD::LD1x3post)
	MAKE_CASE(AArch64ISD::LD1x4post)
	MAKE_CASE(AArch64ISD::ST1x2post)
	MAKE_CASE(AArch64ISD::ST1x3post)
	MAKE_CASE(AArch64ISD::ST1x4post)
	MAKE_CASE(AArch64ISD::LD1DUPpost)
	MAKE_CASE(AArch64ISD::LD2DUPpost)
	MAKE_CASE(AArch64ISD::LD3DUPpost)
	MAKE_CASE(AArch64ISD::LD4DUPpost)
	MAKE_CASE(AArch64ISD::LD1LANEpost)
	MAKE_CASE(AArch64ISD::LD2LANEpost)
	MAKE_CASE(AArch64ISD::LD3LANEpost)
	MAKE_CASE(AArch64ISD::LD4LANEpost)
	MAKE_CASE(AArch64ISD::ST2LANEpost)
	MAKE_CASE(AArch64ISD::ST3LANEpost)
	MAKE_CASE(AArch64ISD::ST4LANEpost)
	MAKE_CASE(AArch64ISD::SMULL)
	MAKE_CASE(AArch64ISD::UMULL)
	MAKE_CASE(AArch64ISD::FRECPE)
	MAKE_CASE(AArch64ISD::FRECPS)
	MAKE_CASE(AArch64ISD::FRSQRTE)
	MAKE_CASE(AArch64ISD::FRSQRTS)
	MAKE_CASE(AArch64ISD::STG)
	MAKE_CASE(AArch64ISD::STZG)
	MAKE_CASE(AArch64ISD::ST2G)
	MAKE_CASE(AArch64ISD::STZ2G)
	MAKE_CASE(AArch64ISD::SUNPKHI)
	MAKE_CASE(AArch64ISD::SUNPKLO)
	MAKE_CASE(AArch64ISD::UUNPKHI)
	MAKE_CASE(AArch64ISD::UUNPKLO)
	MAKE_CASE(AArch64ISD::INSR)
	MAKE_CASE(AArch64ISD::PTEST)
	MAKE_CASE(AArch64ISD::PTRUE)
	MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
	MAKE_CASE(AArch64ISD::ST1_PRED)
	MAKE_CASE(AArch64ISD::SST1_PRED)
	MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
	MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
	MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
	MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
	MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
	MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
	MAKE_CASE(AArch64ISD::SSTNT1_PRED)
	MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
	MAKE_CASE(AArch64ISD::LDP)
	MAKE_CASE(AArch64ISD::STP)
	MAKE_CASE(AArch64ISD::STNP)
	MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
	MAKE_CASE(AArch64ISD::INDEX_VECTOR)
	}
	#undef MAKE_CASE
	// Reached for FIRST_NUMBER and for any value without a case above.
	return nullptr;
	}

	/// Expand the F128CSEL pseudo-instruction \p MI, located in \p MBB, into a
	/// conditional branch, a fallthrough block and a PHI.
	///
	/// Operands of \p MI as read here: 0 = destination register, 1 = value if the
	/// condition holds, 2 = value otherwise, 3 = condition code immediate,
	/// 4 = operand whose kill flag tells whether NZCV dies at \p MI.
	///
	/// \returns the newly created block that receives the instructions which
	///          followed \p MI; \p MI itself is erased.
	MachineBasicBlock *
	AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
	MachineBasicBlock *MBB) const {
	// We materialise the F128CSEL pseudo-instruction as some control flow and a
	// phi node:

	// OrigBB:
	// [... previous instrs leading to comparison ...]
	// b.ne TrueBB
	// b EndBB
	// TrueBB:
	// ; Fallthrough
	// EndBB:
	// Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]

	MachineFunction *MF = MBB->getParent();
	const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	DebugLoc DL = MI.getDebugLoc();
	// Insertion point for the new blocks: right after MBB in the function.
	MachineFunction::iterator It = ++MBB->getIterator();

	Register DestReg = MI.getOperand(0).getReg();
	Register IfTrueReg = MI.getOperand(1).getReg();
	Register IfFalseReg = MI.getOperand(2).getReg();
	unsigned CondCode = MI.getOperand(3).getImm();
	bool NZCVKilled = MI.getOperand(4).isKill();

	// Both new blocks are associated with the same IR basic block as MBB.
	MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MF->insert(It, TrueBB);
	MF->insert(It, EndBB);

	// Transfer rest of current basic-block to EndBB
	EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
	MBB->end());
	EndBB->transferSuccessorsAndUpdatePHIs(MBB);

	// Terminate MBB with a conditional branch to TrueBB followed by an
	// unconditional branch to EndBB.
	BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
	BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
	MBB->addSuccessor(TrueBB);
	MBB->addSuccessor(EndBB);

	// TrueBB falls through to the end.
	TrueBB->addSuccessor(EndBB);

	// If MI did not kill NZCV, the flags are marked live-in to both new blocks.
	if (!NZCVKilled) {
	TrueBB->addLiveIn(AArch64::NZCV);
	EndBB->addLiveIn(AArch64::NZCV);
	}

	// Select the result at the top of EndBB depending on the incoming edge.
	BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
	.addReg(IfTrueReg)
	.addMBB(TrueBB)
	.addReg(IfFalseReg)
	.addMBB(MBB);

	MI.eraseFromParent();
	return EndBB;
	}

	/// Custom-inserter hook for CATCHRET. No instructions are emitted: the
	/// function only checks (in assertion-enabled builds) that the enclosing
	/// function's personality is not an asynchronous EH personality, and then
	/// returns \p BB unchanged. \p MI is not used.
	MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
	    MachineInstr &MI, MachineBasicBlock *BB) const {
	#ifndef NDEBUG
	  const auto Personality = classifyEHPersonality(
	      BB->getParent()->getFunction().getPersonalityFn());
	  assert(!isAsynchronousEHPersonality(Personality) &&
	         "SEH does not use catchret!");
	#endif
	  return BB;
	}

	/// Dispatch \p MI to the matching custom expansion routine and return the
	/// block that routine reports back.
	///
	/// Handled opcodes: F128CSEL, STACKMAP/PATCHPOINT and CATCHRET. Any other
	/// opcode is a programming error; the instruction is dumped in
	/// assertion-enabled builds before hitting llvm_unreachable.
	MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
	    MachineInstr &MI, MachineBasicBlock *BB) const {
	  switch (MI.getOpcode()) {
	  case AArch64::F128CSEL:
	    return EmitF128CSEL(MI, BB);

	  case AArch64::CATCHRET:
	    return EmitLoweredCatchRet(MI, BB);

	  case TargetOpcode::STACKMAP:
	  case TargetOpcode::PATCHPOINT:
	    return emitPatchPoint(MI, BB);

	  default:
	    break;
	  }

	  // Nothing above claimed the opcode.
	#ifndef NDEBUG
	  MI.dump();
	#endif
	  llvm_unreachable("Unexpected instruction for custom inserter!");
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Lowering private implementation.
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// Lowering Code
	//===----------------------------------------------------------------------===//

	/// Map the integer comparison predicate \p CC from the DAG's ISD::CondCode
	/// space onto the AArch64 condition code that tests it.
	///
	/// Only the ten predicates listed in the switch are accepted; any other
	/// value is treated as unreachable.
	static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
	  switch (CC) {
	  // Equality.
	  case ISD::SETEQ:
	    return AArch64CC::EQ;
	  case ISD::SETNE:
	    return AArch64CC::NE;

	  // Signed orderings.
	  case ISD::SETLT:
	    return AArch64CC::LT;
	  case ISD::SETLE:
	    return AArch64CC::LE;
	  case ISD::SETGT:
	    return AArch64CC::GT;
	  case ISD::SETGE:
	    return AArch64CC::GE;

	  // Unsigned orderings.
	  case ISD::SETULT:
	    return AArch64CC::LO;
	  case ISD::SETULE:
	    return AArch64CC::LS;
	  case ISD::SETUGT:
	    return AArch64CC::HI;
	  case ISD::SETUGE:
	    return AArch64CC::HS;

	  default:
	    break;
	  }
	  llvm_unreachable("Unknown condition code!");
	}

	/// Translate the floating-point DAG predicate \p CC into AArch64 condition
	/// codes.
	///
	/// \param [out] CondCode  the primary condition to test.
	/// \param [out] CondCode2 a second condition, or AArch64CC::AL when one
	///                        condition is enough. When two are returned the
	///                        predicate holds if either of them holds.
	///
	/// Predicates not listed in the switch are treated as unreachable.
	static void changeFPCCToAArch64CC(ISD::CondCode CC,
	                                  AArch64CC::CondCode &CondCode,
	                                  AArch64CC::CondCode &CondCode2) {
	  // AL in the second slot means "no second condition".
	  CondCode2 = AArch64CC::AL;

	  switch (CC) {
	  // --- Predicates that need a single condition code. ---
	  case ISD::SETEQ:
	  case ISD::SETOEQ:
	    CondCode = AArch64CC::EQ;
	    break;
	  case ISD::SETNE:
	  case ISD::SETUNE:
	    CondCode = AArch64CC::NE;
	    break;
	  case ISD::SETGT:
	  case ISD::SETOGT:
	    CondCode = AArch64CC::GT;
	    break;
	  case ISD::SETGE:
	  case ISD::SETOGE:
	    CondCode = AArch64CC::GE;
	    break;
	  // NOTE(review): the ordered less-than forms use MI/LS and the unordered
	  // greater-than forms use HI/PL rather than LT/LE/GT/GE; presumably this
	  // follows from how FCMP sets NZCV for unordered operands - confirm against
	  // the Arm ARM.
	  case ISD::SETOLT:
	    CondCode = AArch64CC::MI;
	    break;
	  case ISD::SETOLE:
	    CondCode = AArch64CC::LS;
	    break;
	  case ISD::SETUGT:
	    CondCode = AArch64CC::HI;
	    break;
	  case ISD::SETUGE:
	    CondCode = AArch64CC::PL;
	    break;
	  case ISD::SETLT:
	  case ISD::SETULT:
	    CondCode = AArch64CC::LT;
	    break;
	  case ISD::SETLE:
	  case ISD::SETULE:
	    CondCode = AArch64CC::LE;
	    break;
	  case ISD::SETO:
	    CondCode = AArch64CC::VC;
	    break;
	  case ISD::SETUO:
	    CondCode = AArch64CC::VS;
	    break;

	  // --- Predicates that need two condition codes OR'ed together. ---
	  case ISD::SETONE:
	    CondCode = AArch64CC::MI;
	    CondCode2 = AArch64CC::GT;
	    break;
	  case ISD::SETUEQ:
	    CondCode = AArch64CC::EQ;
	    CondCode2 = AArch64CC::VS;
	    break;

	  default:
	    llvm_unreachable("Unknown FP condition!");
	  }
	}

	/// Translate the floating-point DAG predicate \p CC into AArch64 condition
	/// codes that must be AND'ed together.
	///
	/// changeFPCCToAArch64CC returns pairs that are OR'ed; this variant instead
	/// supplies AND-combinable pairs for the two predicates that need a second
	/// code (SETONE and SETUEQ). Every other predicate is forwarded to
	/// changeFPCCToAArch64CC and is expected to need a single code only.
	///
	/// \param [out] CondCode  the first condition to test.
	/// \param [out] CondCode2 the second condition, or AArch64CC::AL if none.
	static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
	                                     AArch64CC::CondCode &CondCode,
	                                     AArch64CC::CondCode &CondCode2) {
	  CondCode2 = AArch64CC::AL;

	  if (CC == ISD::SETONE) {
	    // (a one b)
	    //   == ((a olt b) || (a ogt b))
	    //   == ((a ord b) && (a une b))
	    CondCode = AArch64CC::VC;
	    CondCode2 = AArch64CC::NE;
	    return;
	  }

	  if (CC == ISD::SETUEQ) {
	    // (a ueq b)
	    //   == ((a uno b) || (a oeq b))
	    //   == ((a ule b) && (a uge b))
	    CondCode = AArch64CC::PL;
	    CondCode2 = AArch64CC::LE;
	    return;
	  }

	  // All remaining predicates are expressible with one condition code.
	  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
	  assert(CondCode2 == AArch64CC::AL);
	}

	/// Translate the floating-point DAG predicate \p CC into AArch64 condition
	/// codes usable with the vector compare instructions. Fewer operations are
	/// available without a real NZCV register, so some predicates are expressed
	/// through less efficient combinations.
	///
	/// \param [out] CondCode  the primary condition.
	/// \param [out] CondCode2 a second condition, or AArch64CC::AL if unused.
	/// \param [out] Invert    true when the codes returned describe the inverse
	///                        of \p CC, i.e. the caller has to invert the result.
	static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
	                                        AArch64CC::CondCode &CondCode,
	                                        AArch64CC::CondCode &CondCode2,
	                                        bool &Invert) {
	  switch (CC) {
	  case ISD::SETO:
	  case ISD::SETUO:
	    // Both are built from the same MI/GE pair; SETUO is the inverted form.
	    Invert = (CC == ISD::SETUO);
	    CondCode = AArch64CC::MI;
	    CondCode2 = AArch64CC::GE;
	    return;

	  case ISD::SETUEQ:
	  case ISD::SETULT:
	  case ISD::SETULE:
	  case ISD::SETUGT:
	  case ISD::SETUGE:
	    // All of the compare-mask comparisons are ordered, but we can switch
	    // between the two by a double inversion. E.g. ULE == !OGT.
	    Invert = true;
	    changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
	                          CondCode, CondCode2);
	    return;

	  default:
	    // Mostly the scalar mappings work fine.
	    Invert = false;
	    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
	    return;
	  }
	}

	/// Return true if \p C can be used as an arithmetic immediate: either it
	/// fits in the low 12 bits, or its low 12 bits are zero and it fits in
	/// 24 bits. The verdict is also written to the debug stream.
	static bool isLegalArithImmed(uint64_t C) {
	  // Matches AArch64DAGToDAGISel::SelectArithImmed().
	  const bool FitsUnshifted = (C >> 12) == 0;
	  const bool FitsShiftedBy12 = (C & 0xFFFULL) == 0 && (C >> 24) == 0;
	  const bool IsLegal = FitsUnshifted || FitsShiftedBy12;
	  LLVM_DEBUG(dbgs() << "Is imm " << C
	                    << " legal: " << (IsLegal ? "yes\n" : "no\n"));
	  return IsLegal;
	}

	// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on the
	// grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
	// can be set differently by this operation. It comes down to whether
	// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
	// everything is fine. If not then the optimization is wrong. Thus general
	// comparisons are only valid if op2 != 0.
	//
	// So, finally, the only LLVM-native comparisons that don't mention C and V
	// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
	// the absence of information about op2.
	//
	// Returns true when Op is (sub 0, x) and CC is SETEQ or SETNE.
	static bool isCMN(SDValue Op, ISD::CondCode CC) {
	  if (CC != ISD::SETEQ && CC != ISD::SETNE)
	    return false;
	  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
	}

	/// Build a chained floating-point compare node for \p LHS and \p RHS.
	///
	/// \param Chain       incoming chain; it becomes the node's first operand.
	/// \param IsSignaling selects AArch64ISD::STRICT_FCMPE instead of
	///                    AArch64ISD::STRICT_FCMP.
	/// \returns the new node, whose result types are the operand type and
	///          MVT::Other. f128 and f16 operands are rejected by assertion.
	static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
	                                      SelectionDAG &DAG, SDValue Chain,
	                                      bool IsSignaling) {
	  EVT OperandVT = LHS.getValueType();
	  assert(OperandVT != MVT::f128);
	  assert(OperandVT != MVT::f16 &&
	         "Lowering of strict fp16 not yet implemented");

	  unsigned CompareOpc = AArch64ISD::STRICT_FCMP;
	  if (IsSignaling)
	    CompareOpc = AArch64ISD::STRICT_FCMPE;

	  return DAG.getNode(CompareOpc, dl, {OperandVT, MVT::Other},
	                     {Chain, LHS, RHS});
	}

	/// Emit a flag-setting comparison of \p LHS against \p RHS for predicate
	/// \p CC and return the value that carries the flags.
	///
	/// Floating-point operands produce an AArch64ISD::FCMP (f16 operands are
	/// first extended to f32 when the subtarget lacks full FP16 support).
	/// Integer operands produce SUBS, ADDS (the CMN form) or reuse/create an
	/// ANDS, depending on the operand shapes handled below.
	static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
	                              const SDLoc &dl, SelectionDAG &DAG) {
	  EVT VT = LHS.getValueType();

	  if (VT.isFloatingPoint()) {
	    assert(VT != MVT::f128);
	    const bool HasFullFP16 =
	        static_cast<const AArch64Subtarget &>(DAG.getSubtarget())
	            .hasFullFP16();
	    if (VT == MVT::f16 && !HasFullFP16) {
	      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
	      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
	      VT = MVT::f32;
	    }
	    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
	  }

	  // Builds a (value, flags) node and hands back only the flags result.
	  auto FlagsOf = [&](unsigned NodeOpc, SDValue A, SDValue B) -> SDValue {
	    return DAG.getNode(NodeOpc, dl, DAG.getVTList(VT, MVT_CC), A, B)
	        .getValue(1);
	  };

	  // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
	  if (isCMN(RHS, CC))
	    return FlagsOf(AArch64ISD::ADDS, LHS, RHS.getOperand(1));

	  // As we are looking for EQ/NE compares, the operands can be commuted; can
	  // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
	  if (isCMN(LHS, CC))
	    return FlagsOf(AArch64ISD::ADDS, LHS.getOperand(1), RHS);

	  if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
	    switch (LHS.getOpcode()) {
	    case ISD::AND: {
	      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
	      // (a.k.a. ANDS) except that the flags are only guaranteed to work for
	      // one of the signed comparisons.
	      const SDValue FlagSettingAnd =
	          DAG.getNode(AArch64ISD::ANDS, dl, DAG.getVTList(VT, MVT_CC),
	                      LHS.getOperand(0), LHS.getOperand(1));
	      // Replace all users of (and X, Y) with newly generated (ands X, Y).
	      DAG.ReplaceAllUsesWith(LHS, FlagSettingAnd);
	      return FlagSettingAnd.getValue(1);
	    }
	    case AArch64ISD::ANDS:
	      // The flags are already being produced; use them directly.
	      return LHS.getValue(1);
	    default:
	      break;
	    }
	  }

	  // The CMP instruction is just an alias for SUBS, and representing it as
	  // SUBS means that it's possible to get CSE with subtract operations.
	  // A later phase can perform the optimization of setting the destination
	  // register to WZR/XZR if it ends up being unused.
	  return FlagsOf(AArch64ISD::SUBS, LHS, RHS);
	}

	/// \defgroup AArch64CCMP CMP;CCMP matching
	///
	/// These functions deal with the formation of CMP;CCMP;... sequences.
	/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
	/// a comparison. They set the NZCV flags to a predefined value if their
	/// predicate is false. This allows to express arbitrary conjunctions, for
	/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
	/// expressed as:
	/// cmp A
	/// ccmp B, inv(CB), CA
	/// check for CB flags
	///
	/// This naturally lets us implement chains of AND operations with SETCC
	/// operands. And we can even implement some other situations by transforming
	/// them:
	/// - We can implement (NEG SETCC) i.e. negating a single comparison by
	/// negating the flags used in a CCMP/FCCMP operations.
	/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
	/// by negating the flags we test for afterwards, i.e.
	/// NEG (CMP CCMP FCCMP ...) can be implemented.
	/// - Note that we can only ever negate all previously processed results.
	/// What we can not implement by flipping the flags to test is a negation
	/// of two sub-trees (because the negation affects all sub-trees emitted so
	/// far, so the 2nd sub-tree we emit would also affect the first).
	/// With those tools we can implement some OR operations:
	/// - (OR (SETCC A) (SETCC B)) can be implemented via:
	/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
	/// - After transforming OR to NEG/AND combinations we may be able to use NEG
	/// elimination rules from earlier to implement the whole thing as a
	/// CCMP/FCCMP chain.
	///
	/// As a complete example:
	/// or (or (setCA (cmp A)) (setCB (cmp B)))
	/// (and (setCC (cmp C)) (setCD (cmp D)))
	/// can be reassociated to:
	/// or (and (setCC (cmp C)) (setCD (cmp D)))
	/// (or (setCA (cmp A)) (setCB (cmp B)))
	/// can be transformed to:
	/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
	/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
	/// which can be implemented as:
	/// cmp C
	/// ccmp D, inv(CD), CC
	/// ccmp A, CA, inv(CD)
	/// ccmp B, CB, inv(CA)
	/// check for CB flags
	///
	/// A counterexample is "or (and A B) (and C D)" which translates to
	/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
	/// can only implement 1 of the inner (not) operations, but not both!
	/// @{

	/// Create a conditional comparison; use CCMP, CCMN or FCCMP as appropriate.
	///
	/// \param LHS,RHS    operands to compare.
	/// \param CC         DAG predicate of the comparison; only consulted to
	///                   decide whether the CCMN form may be used.
	/// \param CCOp       flags-producing node this comparison is conditional on.
	/// \param Predicate  condition on \p CCOp, passed to the node as its
	///                   condition operand.
	/// \param OutCC      condition the caller will test on the result. The NZCV
	///                   immediate given to the node is chosen to satisfy the
	///                   inverse of \p OutCC.
	/// \returns the new flags-producing node (type MVT_CC).
	static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
	                                         ISD::CondCode CC, SDValue CCOp,
	                                         AArch64CC::CondCode Predicate,
	                                         AArch64CC::CondCode OutCC,
	                                         const SDLoc &DL, SelectionDAG &DAG) {
	  // Plain integer CCMP unless one of the special forms below applies.
	  unsigned Opcode = AArch64ISD::CCMP;

	  if (LHS.getValueType().isFloatingPoint()) {
	    assert(LHS.getValueType() != MVT::f128);
	    const bool FullFP16 =
	        static_cast<const AArch64Subtarget &>(DAG.getSubtarget())
	            .hasFullFP16();
	    if (LHS.getValueType() == MVT::f16 && !FullFP16) {
	      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
	      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
	    }
	    Opcode = AArch64ISD::FCCMP;
	  } else if (isCMN(RHS, CC)) {
	    // See isCMN() on why we can only do this for SETEQ and SETNE.
	    Opcode = AArch64ISD::CCMN;
	    RHS = RHS.getOperand(1);
	  }

	  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
	  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
	  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
	  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
	  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
	}

	/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
	/// expressed as a conjunction. See \ref AArch64CCMP.
	/// \param CanNegate Set to true if we can negate the whole sub-tree just by
	/// changing the conditions on the SETCC tests.
	/// (this means we can call emitConjunctionRec() with
	/// Negate==true on this sub-tree)
	/// \param MustBeFirst Set to true if this subtree needs to be negated and we
	/// cannot do the negation naturally. We are required to
	/// emit the subtree first in this case.
	/// \param WillNegate Is true if we are called when the result of this
	/// subexpression must be negated. This happens when the
	/// outer expression is an OR. We can use this fact to know
	/// that we have a double negation (or (or ...) ...) that
	/// can be implemented for free.
	/// \param Depth Current recursion depth; AND/OR nodes deeper than 6 levels
	/// are rejected.
	///
	/// \p CanNegate and \p MustBeFirst are only written when true is returned.
	static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
	                               bool &MustBeFirst, bool WillNegate,
	                               unsigned Depth = 0) {
	  // Only values with exactly one use are accepted.
	  if (!Val.hasOneUse())
	    return false;

	  const unsigned Opcode = Val->getOpcode();

	  // Leaf: a SETCC can always be negated by inverting its condition. f128
	  // comparisons are not accepted.
	  if (Opcode == ISD::SETCC) {
	    if (Val->getOperand(0).getValueType() == MVT::f128)
	      return false;
	    CanNegate = true;
	    MustBeFirst = false;
	    return true;
	  }

	  // Protect against exponential runtime and stack overflow.
	  if (Depth > 6)
	    return false;

	  if (Opcode != ISD::AND && Opcode != ISD::OR)
	    return false;
	  const bool IsOR = Opcode == ISD::OR;

	  // Both operands must themselves be representable (left is checked first).
	  bool CanNegateL = false;
	  bool MustBeFirstL = false;
	  if (!canEmitConjunction(Val->getOperand(0), CanNegateL, MustBeFirstL, IsOR,
	                          Depth + 1))
	    return false;

	  bool CanNegateR = false;
	  bool MustBeFirstR = false;
	  if (!canEmitConjunction(Val->getOperand(1), CanNegateR, MustBeFirstR, IsOR,
	                          Depth + 1))
	    return false;

	  // Only one sub-tree can be emitted first.
	  if (MustBeFirstL && MustBeFirstR)
	    return false;

	  if (!IsOR) {
	    // We cannot naturally negate an AND operation.
	    CanNegate = false;
	    MustBeFirst = MustBeFirstL || MustBeFirstR;
	    return true;
	  }

	  // For an OR expression we need to be able to naturally negate at least
	  // one side or we cannot do the transformation at all.
	  if (!CanNegateL && !CanNegateR)
	    return false;

	  // If the result of the OR will be negated and we can naturally negate
	  // the leaves, then this sub-tree as a whole negates naturally.
	  CanNegate = WillNegate && CanNegateL && CanNegateR;
	  // If we cannot naturally negate the whole sub-tree, then this must be
	  // emitted first.
	  MustBeFirst = !CanNegate;
	  return true;
	}

	/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
	/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
	/// Transforms the given i1 producing node @p Val into a series of compare
	/// and conditional compare operations. @returns an NZCV flags producing node
	/// and sets @p OutCC to the flags that should be tested.
	/// The tree must already have been accepted by canEmitConjunction(); this is
	/// only checked by assertions here.
	/// \p Negate is true if we want this sub-tree being negated just by changing
	/// SETCC conditions.
	/// \p CCOp is the flags-producing node emitted so far (a null SDValue when
	/// this is the first comparison in the chain) and \p Predicate is the
	/// condition on \p CCOp under which the next comparison is performed.
	static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
	AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
	AArch64CC::CondCode Predicate) {
	// We're at a tree leaf, produce a conditional comparison operation.
	unsigned Opcode = Val->getOpcode();
	if (Opcode == ISD::SETCC) {
	SDValue LHS = Val->getOperand(0);
	SDValue RHS = Val->getOperand(1);
	ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
	bool isInteger = LHS.getValueType().isInteger();
	// A negated leaf is implemented by inverting its predicate.
	if (Negate)
	CC = getSetCCInverse(CC, LHS.getValueType());
	SDLoc DL(Val);
	// Determine OutCC and handle FP special case.
	if (isInteger) {
	OutCC = changeIntCCToAArch64CC(CC);
	} else {
	assert(LHS.getValueType().isFloatingPoint());
	AArch64CC::CondCode ExtraCC;
	changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
	// Some floating point conditions can't be tested with a single condition
	// code. Construct an additional comparison in this case.
	if (ExtraCC != AArch64CC::AL) {
	SDValue ExtraCmp;
	if (!CCOp.getNode())
	ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
	else
	ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
	ExtraCC, DL, DAG);
	// The extra comparison becomes the link the final comparison below is
	// chained onto, conditional on ExtraCC.
	CCOp = ExtraCmp;
	Predicate = ExtraCC;
	}
	}

	// Produce a normal comparison if we are first in the chain
	if (!CCOp)
	return emitComparison(LHS, RHS, CC, DL, DAG);
	// Otherwise produce a ccmp.
	return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
	DAG);
	}
	assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");

	bool IsOR = Opcode == ISD::OR;

	// Re-query both operands for their negation / ordering properties.
	SDValue LHS = Val->getOperand(0);
	bool CanNegateL;
	bool MustBeFirstL;
	bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
	assert(ValidL && "Valid conjunction/disjunction tree");
	(void)ValidL;

	SDValue RHS = Val->getOperand(1);
	bool CanNegateR;
	bool MustBeFirstR;
	bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
	assert(ValidR && "Valid conjunction/disjunction tree");
	(void)ValidR;

	// Swap sub-tree that must come first to the right side.
	// (The right side is the one emitted first below.)
	if (MustBeFirstL) {
	assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
	std::swap(LHS, RHS);
	std::swap(CanNegateL, CanNegateR);
	std::swap(MustBeFirstL, MustBeFirstR);
	}

	// NegateL / NegateR: negate that sub-tree via its SETCC conditions.
	// NegateAfterR: invert the condition code obtained from the right sub-tree.
	// NegateAfterAll: invert the final OutCC.
	bool NegateR;
	bool NegateAfterR;
	bool NegateL;
	bool NegateAfterAll;
	if (Opcode == ISD::OR) {
	// Swap the sub-tree that we can negate naturally to the left.
	if (!CanNegateL) {
	assert(CanNegateR && "at least one side must be negatable");
	assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
	assert(!Negate);
	std::swap(LHS, RHS);
	NegateR = false;
	NegateAfterR = true;
	} else {
	// Negate the right sub-tree if possible, otherwise negate its result.
	NegateR = CanNegateR;
	NegateAfterR = !CanNegateR;
	}
	NegateL = true;
	NegateAfterAll = !Negate;
	} else {
	assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
	assert(!Negate && "Valid conjunction/disjunction tree");

	NegateL = false;
	NegateR = false;
	NegateAfterR = false;
	NegateAfterAll = false;
	}

	// Emit sub-trees.
	// The right sub-tree is emitted first; the left one is then chained onto
	// it, conditional on the (possibly inverted) right-hand condition code.
	AArch64CC::CondCode RHSCC;
	SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
	if (NegateAfterR)
	RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
	SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
	if (NegateAfterAll)
	OutCC = AArch64CC::getInvertedCondCode(OutCC);
	return CmpL;
	}

	/// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
	/// In some cases this is even possible with OR operations in the expression.
	/// See \ref AArch64CCMP.
	/// \see emitConjunctionRec().
	///
	/// \param [out] OutCC condition code to test on the returned flags.
	/// \returns the flags-producing node, or a null SDValue when \p Val is not
	///          accepted by canEmitConjunction().
	static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
	                               AArch64CC::CondCode &OutCC) {
	  // Only the yes/no answer matters here; the two outputs are discarded.
	  bool IgnoredCanNegate;
	  bool IgnoredMustBeFirst;
	  const bool Representable = canEmitConjunction(
	      Val, IgnoredCanNegate, IgnoredMustBeFirst, /*WillNegate=*/false);
	  if (Representable)
	    return emitConjunctionRec(DAG, Val, OutCC, /*Negate=*/false, SDValue(),
	                              AArch64CC::AL);
	  return SDValue();
	}

	/// @}

	/// Returns how profitable it is to fold a comparison's operand's shift and/or
	/// extension operations.
	///
	/// The score is 0 when \p Op does not have exactly one use or matches none
	/// of the patterns, 1 for a supported extend or a constant shift, and 2 for
	/// a constant shift of at most 4 applied to a supported extend.
	static unsigned getCmpOperandFoldingProfit(SDValue Op) {
	  // A "supported extend" is a SIGN_EXTEND_INREG, or an AND whose second
	  // operand is the constant mask 0xFF, 0xFFFF or 0xFFFFFFFF.
	  auto isSupportedExtend = [](SDValue V) {
	    if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
	      return true;
	    if (V.getOpcode() != ISD::AND)
	      return false;
	    const auto *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1));
	    if (!MaskCst)
	      return false;
	    const uint64_t Mask = MaskCst->getZExtValue();
	    return Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF;
	  };

	  if (!Op.hasOneUse())
	    return 0;

	  if (isSupportedExtend(Op))
	    return 1;

	  // Otherwise only shifts by a constant amount are of interest.
	  const unsigned Opc = Op.getOpcode();
	  if (Opc != ISD::SHL && Opc != ISD::SRL && Opc != ISD::SRA)
	    return 0;
	  const auto *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!ShiftCst)
	    return 0;
	  const uint64_t Shift = ShiftCst->getZExtValue();

	  // Shift of an extend: small shift amounts score higher.
	  if (isSupportedExtend(Op.getOperand(0)))
	    return (Shift <= 4) ? 2 : 1;

	  // Plain shift: the amount must be in range for the operand width.
	  const EVT VT = Op.getValueType();
	  const bool ShiftInRange = (VT == MVT::i32 && Shift <= 31) ||
	                            (VT == MVT::i64 && Shift <= 63);
	  return ShiftInRange ? 1 : 0;
	}

	/// Emit the comparison of \p LHS with \p RHS for integer predicate \p CC.
	///
	/// Before emitting, the function may:
	///  - nudge a constant RHS by one (switching between strict and non-strict
	///    predicates) so that it becomes a legal arithmetic immediate;
	///  - swap the operands when the LHS has foldable shift/extend operations;
	///  - for EQ/NE against a constant, compare a zero-extending i16 load
	///    through a sign-extension, or fold an AND/OR/SETCC tree into a
	///    conditional-compare chain.
	///
	/// \param [out] AArch64cc constant node (type MVT_CC) holding the AArch64
	///                        condition code to test on the returned flags.
	/// \returns the flags-producing value.
	static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
	                             SDValue &AArch64cc, SelectionDAG &DAG,
	                             const SDLoc &dl) {
	  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
	    EVT VT = RHS.getValueType();
	    uint64_t C = RHSC->getZExtValue();
	    if (!isLegalArithImmed(C)) {
	      // Constant does not fit, try adjusting it by one?
	      switch (CC) {
	      default:
	        break;
	      case ISD::SETLT:
	      case ISD::SETGE:
	        // x < C  ==>  x <= C-1 (and x >= C  ==>  x > C-1). Not valid when C
	        // is the minimum signed value of its type, since C-1 would wrap.
	        if ((VT == MVT::i32 && C != 0x80000000 &&
	             isLegalArithImmed(static_cast<uint32_t>(C - 1))) ||
	            (VT == MVT::i64 && C != 0x8000000000000000ULL &&
	             isLegalArithImmed(C - 1ULL))) {
	          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
	          C = (VT == MVT::i32) ? static_cast<uint32_t>(C - 1) : C - 1;
	          RHS = DAG.getConstant(C, dl, VT);
	        }
	        break;
	      case ISD::SETULT:
	      case ISD::SETUGE:
	        // Unsigned variant: C-1 wraps only when C is zero.
	        if ((VT == MVT::i32 && C != 0 &&
	             isLegalArithImmed(static_cast<uint32_t>(C - 1))) ||
	            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
	          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
	          C = (VT == MVT::i32) ? static_cast<uint32_t>(C - 1) : C - 1;
	          RHS = DAG.getConstant(C, dl, VT);
	        }
	        break;
	      case ISD::SETLE:
	      case ISD::SETGT:
	        // x <= C  ==>  x < C+1 (and x > C  ==>  x >= C+1). Not valid when C
	        // is the maximum signed value of its type.
	        if ((VT == MVT::i32 && C != INT32_MAX &&
	             isLegalArithImmed(static_cast<uint32_t>(C + 1))) ||
	            (VT == MVT::i64 && C != INT64_MAX &&
	             isLegalArithImmed(C + 1ULL))) {
	          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
	          C = (VT == MVT::i32) ? static_cast<uint32_t>(C + 1) : C + 1;
	          RHS = DAG.getConstant(C, dl, VT);
	        }
	        break;
	      case ISD::SETULE:
	      case ISD::SETUGT:
	        // Unsigned variant: C+1 wraps only when C is the maximum value.
	        if ((VT == MVT::i32 && C != UINT32_MAX &&
	             isLegalArithImmed(static_cast<uint32_t>(C + 1))) ||
	            (VT == MVT::i64 && C != UINT64_MAX &&
	             isLegalArithImmed(C + 1ULL))) {
	          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
	          C = (VT == MVT::i32) ? static_cast<uint32_t>(C + 1) : C + 1;
	          RHS = DAG.getConstant(C, dl, VT);
	        }
	        break;
	      }
	    }
	  }

	  // Comparisons are canonicalized so that the RHS operand is simpler than the
	  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
	  // can fold some shift+extend operations on the RHS operand, so swap the
	  // operands if that can be done.
	  //
	  // For example:
	  //    lsl     w13, w11, #1
	  //    cmp     w13, w12
	  // can be turned into:
	  //    cmp     w12, w11, lsl #1
	  if (!isa<ConstantSDNode>(RHS) ||
	      !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
	    SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;

	    if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
	      std::swap(LHS, RHS);
	      CC = ISD::getSetCCSwappedOperands(CC);
	    }
	  }

	  SDValue Cmp;
	  AArch64CC::CondCode AArch64CC;
	  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
	    const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);

	    // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
	    // For the i8 operand, the largest immediate is 255, so this can be easily
	    // encoded in the compare instruction. For the i16 operand, however, the
	    // largest immediate cannot be encoded in the compare.
	    // Therefore, use a sign extending load and cmn to avoid materializing the
	    // -1 constant. For example,
	    //    movz w1, #65535
	    //    ldrh w0, [x0, #0]
	    //    cmp w0, w1
	    // >
	    //    ldrsh w0, [x0, #0]
	    //    cmn w0, #1
	    // Fundamentally, we're relying on the property that (zext LHS) == (zext
	    // RHS) if and only if (sext LHS) == (sext RHS). The checks are in place to
	    // ensure both the LHS and RHS are truly zero extended and to make sure the
	    // transformation is profitable.
	    if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
	        cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
	        cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
	        LHS.getNode()->hasNUsesOfValue(1, 0)) {
	      // Reinterpret the low 16 bits of the constant as a signed value.
	      int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
	      if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
	        SDValue SExt =
	            DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
	                        DAG.getValueType(MVT::i16));
	        Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
	                                                   RHS.getValueType()),
	                             CC, dl, DAG);
	        AArch64CC = changeIntCCToAArch64CC(CC);
	      }
	    }

	    // (x == 0/1) or (x != 0/1): try to turn x into a conditional-compare
	    // chain. The resulting condition is inverted unless the comparison is
	    // "x != 0" or "x == 1".
	    if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
	      if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
	        if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
	          AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
	      }
	    }
	  }

	  // Nothing special matched: emit the plain comparison.
	  if (!Cmp) {
	    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
	    AArch64CC = changeIntCCToAArch64CC(CC);
	  }
	  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
	  return Cmp;
	}

	/// Lower an overflow-reporting arithmetic node (SADDO/UADDO/SSUBO/USUBO/
	/// SMULO/UMULO) \p Op of type i32 or i64.
	///
	/// \param [out] CC the AArch64 condition code that is true on the returned
	///                 flags when the operation overflowed.
	/// \returns a pair (arithmetic result, flags-producing value).
	static std::pair<SDValue, SDValue>
	getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
	  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
	         "Unsupported value type");
	  SDValue Value, Overflow;
	  SDLoc DL(Op);
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  // Stays 0 for the multiply cases, which build Value/Overflow themselves.
	  unsigned Opc = 0;
	  switch (Op.getOpcode()) {
	  default:
	    llvm_unreachable("Unknown overflow instruction!");
	  case ISD::SADDO:
	    Opc = AArch64ISD::ADDS;
	    CC = AArch64CC::VS;
	    break;
	  case ISD::UADDO:
	    Opc = AArch64ISD::ADDS;
	    CC = AArch64CC::HS;
	    break;
	  case ISD::SSUBO:
	    Opc = AArch64ISD::SUBS;
	    CC = AArch64CC::VS;
	    break;
	  case ISD::USUBO:
	    Opc = AArch64ISD::SUBS;
	    CC = AArch64CC::LO;
	    break;
	  // Multiply needs a little bit extra work.
	  case ISD::SMULO:
	  case ISD::UMULO: {
	    CC = AArch64CC::NE;
	    bool IsSigned = Op.getOpcode() == ISD::SMULO;
	    if (Op.getValueType() == MVT::i32) {
	      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	      // For a 32 bit multiply with overflow check we want the instruction
	      // selector to generate a widening multiply (SMADDL/UMADDL). For that we
	      // need to generate the following pattern:
	      // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)))
	      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
	      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
	      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
	      SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
	                                DAG.getConstant(0, DL, MVT::i64));
	      // On AArch64 the upper 32 bits are always zero extended for a 32 bit
	      // operation. We need to clear out the upper 32 bits, because we used a
	      // widening multiply that wrote all 64 bits. In the end this should be a
	      // noop.
	      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
	      if (IsSigned) {
	        // The signed overflow check requires more than just a simple check for
	        // any bit set in the upper 32 bits of the result. These bits could be
	        // just the sign bits of a negative number. To perform the overflow
	        // check we have to arithmetic shift right the 32nd bit of the result by
	        // 31 bits. Then we compare the result to the upper 32 bits.
	        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
	                                        DAG.getConstant(32, DL, MVT::i64));
	        UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
	        SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
	                                        DAG.getConstant(31, DL, MVT::i64));
	        // It is important that LowerBits is last, otherwise the arithmetic
	        // shift will not be folded into the compare (SUBS).
	        SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
	        Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
	                       .getValue(1);
	      } else {
	        // The overflow check for unsigned multiply is easy. We only need to
	        // check if any of the upper 32 bits are set. This can be done with a
	        // CMP (shifted register). For that we need to generate the following
	        // pattern:
	        // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32))
	        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
	                                        DAG.getConstant(32, DL, MVT::i64));
	        SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
	        Overflow =
	            DAG.getNode(AArch64ISD::SUBS, DL, VTs,
	                        DAG.getConstant(0, DL, MVT::i64),
	                        UpperBits).getValue(1);
	      }
	      break;
	    }
	    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
	    // For the 64 bit multiply
	    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
	    if (IsSigned) {
	      // Overflow iff the high half (MULHS) differs from the sign-replication
	      // of the low half.
	      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
	      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
	                                      DAG.getConstant(63, DL, MVT::i64));
	      // It is important that LowerBits is last, otherwise the arithmetic
	      // shift will not be folded into the compare (SUBS).
	      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
	      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
	                     .getValue(1);
	    } else {
	      // Overflow iff the high half (MULHU) is non-zero.
	      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
	      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
	      Overflow =
	          DAG.getNode(AArch64ISD::SUBS, DL, VTs,
	                      DAG.getConstant(0, DL, MVT::i64),
	                      UpperBits).getValue(1);
	    }
	    break;
	  }
	  } // switch (...)

	  if (Opc) {
	    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);

	    // Emit the AArch64 operation with overflow check.
	    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
	    Overflow = Value.getValue(1);
	  }
	  return std::make_pair(Value, Overflow);
	}

	/// Lower \p Op to a call of the runtime library routine \p Call.
	///
	/// For strict floating-point opcodes the first operand is the incoming
	/// chain: it is excluded from the call arguments, threaded through the call,
	/// and the function returns the merged values {result, outgoing chain}.
	/// Otherwise all operands are arguments and only the call result is
	/// returned.
	SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
	                                             RTLIB::Libcall Call) const {
	  const bool IsStrict = Op->isStrictFPOpcode();

	  SDValue Chain;
	  unsigned FirstArgIdx = 0;
	  if (IsStrict) {
	    Chain = Op.getOperand(0);
	    FirstArgIdx = 1;
	  }

	  SmallVector<SDValue, 2> CallArgs(Op->op_begin() + FirstArgIdx,
	                                   Op->op_end());
	  MakeLibCallOptions CallOptions;
	  SDLoc dl(Op);

	  SDValue Result;
	  std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), CallArgs,
	                                        CallOptions, dl, Chain);
	  if (!IsStrict)
	    return Result;
	  return DAG.getMergeValues({Result, Chain}, dl);
	}

	/// Custom lowering for XOR. Two patterns are recognised:
	///
	///  1. (xor overflow_bit, 1), where operand 0 is the overflow result of an
	///     overflow-checking arithmetic node: lowered to a CSEL on the inverted
	///     overflow condition.
	///  2. (xor x, (select_cc a, b, 0, -1, cc)), or the same with 0/-1 swapped:
	///     lowered to (csel x, (xor x, -1), cc'), which matches CSINV.
	///
	/// \returns the replacement node; \p Op itself when nothing matched; or a
	///          null SDValue when pattern 1 matched but the arithmetic node's
	///          value type is not legal.
	static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
	  SDValue Sel = Op.getOperand(0);
	  SDValue Other = Op.getOperand(1);
	  SDLoc dl(Sel);

	  // If the operand is an overflow checking operation, invert the condition
	  // code and kill the Not operation. I.e., transform:
	  // (xor (overflow_op_bool, 1))
	  //   -->
	  // (csel 1, 0, invert(cc), overflow_op_bool)
	  // ... which later gets transformed to just a cset instruction with an
	  // inverted condition code, rather than a cset + eor sequence.
	  if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
	    // Only lower legal XALUO ops.
	    if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
	      return SDValue();

	    SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
	    SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
	    AArch64CC::CondCode CC;
	    // Only the flags half of the pair is needed here.
	    SDValue Overflow = getAArch64XALUOOp(CC, Sel.getValue(0), DAG).second;
	    SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
	    return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
	                       CCVal, Overflow);
	  }

	  // Put the SELECT_CC (if any) into Sel. If neither operand is one, give up.
	  if (Sel.getOpcode() != ISD::SELECT_CC)
	    std::swap(Sel, Other);
	  if (Sel.getOpcode() != ISD::SELECT_CC)
	    return Op;

	  // The folding we want to perform is:
	  // (xor x, (select_cc a, b, cc, 0, -1) )
	  //   -->
	  // (csel x, (xor x, -1), cc ...)
	  //
	  // The latter will get matched to a CSINV instruction.

	  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
	  SDValue LHS = Sel.getOperand(0);
	  SDValue RHS = Sel.getOperand(1);
	  SDValue TVal = Sel.getOperand(2);
	  SDValue FVal = Sel.getOperand(3);

	  // FIXME: This could be generalized to non-integer comparisons.
	  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
	    return Op;

	  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
	  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

	  // The values aren't constants, this isn't the pattern we're looking for.
	  if (!CFVal || !CTVal)
	    return Op;

	  // We can commute the SELECT_CC by inverting the condition. This
	  // might be needed to make this fit into a CSINV pattern.
	  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
	    std::swap(TVal, FVal);
	    std::swap(CTVal, CFVal);
	    CC = ISD::getSetCCInverse(CC, LHS.getValueType());
	  }

	  // If the constants do not line up, leave the node alone.
	  if (!CTVal->isNullValue() || !CFVal->isAllOnesValue())
	    return Op;

	  SDValue CCVal;
	  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);

	  // Select between x (condition true) and ~x (condition false).
	  FVal = Other;
	  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
	                     DAG.getConstant(-1ULL, dl, Other.getValueType()));

	  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
	                     CCVal, Cmp);
	}

	/// Lower ISD::ADDC / ADDE / SUBC / SUBE to the matching AArch64 node
	/// (ADDS / ADCS / SUBS / SBCS). The replacement node has two results: the
	/// arithmetic value in the operation's own type and an i32 value.
	///
	/// Returns an empty SDValue when the result type is not legal yet, so the
	/// legalizer can expand the node first.
	static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
	  const EVT ResultVT = Op.getValueType();
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(ResultVT))
	    return SDValue();

	  SDVTList ResultTys = DAG.getVTList(ResultVT, MVT::i32);

	  // Choose the target opcode. The "E" forms additionally consume a third
	  // operand, which is forwarded unchanged below.
	  unsigned TargetOpc;
	  bool ForwardThirdOperand;
	  switch (Op.getOpcode()) {
	  case ISD::ADDC:
	    TargetOpc = AArch64ISD::ADDS;
	    ForwardThirdOperand = false;
	    break;
	  case ISD::SUBC:
	    TargetOpc = AArch64ISD::SUBS;
	    ForwardThirdOperand = false;
	    break;
	  case ISD::ADDE:
	    TargetOpc = AArch64ISD::ADCS;
	    ForwardThirdOperand = true;
	    break;
	  case ISD::SUBE:
	    TargetOpc = AArch64ISD::SBCS;
	    ForwardThirdOperand = true;
	    break;
	  default:
	    llvm_unreachable("Invalid code");
	  }

	  SDLoc DL(Op);
	  if (ForwardThirdOperand)
	    return DAG.getNode(TargetOpc, DL, ResultTys, Op.getOperand(0),
	                       Op.getOperand(1), Op.getOperand(2));
	  return DAG.getNode(TargetOpc, DL, ResultTys, Op.getOperand(0),
	                     Op.getOperand(1));
	}

	/// Custom-lower an arithmetic-with-overflow node into a MERGE_VALUES of the
	/// arithmetic result and an i32 overflow indicator (0 or 1).
	///
	/// Returns an empty SDValue when the result type is not legal yet, so the
	/// legalizer can expand the node first.
	static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
	  const EVT ResultVT = Op.getValueType();
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(ResultVT))
	    return SDValue();

	  SDLoc DL(Op);

	  // getAArch64XALUOOp yields the arithmetic value together with the value
	  // that carries the overflow/carry flag, and reports the condition code to
	  // test through Cond.
	  AArch64CC::CondCode Cond;
	  SDValue Arith, Flags;
	  std::tie(Arith, Flags) = getAArch64XALUOOp(Cond, Op, DAG);

	  // 1 and 0 stand for "overflowed" and "did not overflow".
	  SDValue One = DAG.getConstant(1, DL, MVT::i32);
	  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);

	  // Both the condition and the select operands are inverted, which lets the
	  // select be matched to a single instruction:
	  // CSINC Wd, WZR, WZR, invert(cond).
	  SDValue InvCond = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
	  SDValue OverflowBit =
	      DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, Zero, One, InvCond, Flags);

	  SDVTList ResultTys = DAG.getVTList(ResultVT, MVT::i32);
	  return DAG.getNode(ISD::MERGE_VALUES, DL, ResultTys, Arith, OverflowBit);
	}

	// Prefetch operands are:
	// 1: Address to prefetch
	// 2: bool isWrite
	// 3: int locality (0 = no locality ... 3 = extreme locality)
	// 4: bool isDataCache
	//
	// Lowers ISD::PREFETCH to AArch64ISD::PREFETCH. The three constant operands
	// are folded into a single i32 "prfop" immediate; operand 0 and the address
	// are forwarded unchanged.
	static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
	  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
	  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();

	  // Locality 0 selects the streaming (non-temporal) form.
	  bool IsStream = !Locality;
	  // When the locality number is set
	  if (Locality) {
	    // The front-end should have filtered out the out-of-range values
	    assert(Locality <= 3 && "Prefetch locality out-of-range");
	    // The locality degree is the opposite of the cache speed.
	    // Put the number the other way around.
	    // The encoding starts at 0 for level 1
	    Locality = 3 - Locality;
	  }

	  // Build the mask value encoding the expected behavior.
	  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
	                   (!IsData << 3) |     // IsDataCache bit
	                   (Locality << 1) |    // Cache level bits
	                   (unsigned)IsStream;  // Stream bit
	  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
	                     DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
	}

	/// Lower an FP_EXTEND whose result type is f128 (asserted) through
	/// LowerF128Call, using the libcall RTLIB::getFPEXT selects for the
	/// source/destination type pair.
	SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");

	  const EVT FromVT = Op.getOperand(0).getValueType();
	  const EVT ToVT = Op.getValueType();
	  const RTLIB::Libcall ExtCall = RTLIB::getFPEXT(FromVT, ToVT);

	  return LowerF128Call(Op, DAG, ExtCall);
	}

	/// Lower (STRICT_)FP_ROUND.
	///
	/// - f128 source: emitted as a libcall (RTLIB::getFPROUND).
	/// - any other source: returns an empty SDValue when the source type is a
	///   fixed-length vector handled via SVE, otherwise returns Op unchanged.
	SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  const bool Strict = Op->isStrictFPOpcode();
	  // For strict nodes operand 0 is the chain and the source is operand 1.
	  SDValue Src = Op.getOperand(Strict ? 1 : 0);
	  const EVT FromVT = Src.getValueType();

	  if (FromVT == MVT::f128) {
	    const RTLIB::Libcall RoundCall =
	        RTLIB::getFPROUND(FromVT, Op.getValueType());

	    // FP_ROUND node has a second operand indicating whether it is known to be
	    // precise. That doesn't take part in the LibCall so we can't directly use
	    // LowerF128Call.
	    MakeLibCallOptions Options;
	    SDValue Chain = Strict ? Op.getOperand(0) : SDValue();
	    SDValue Rounded;
	    SDLoc DL(Op);
	    std::tie(Rounded, Chain) = makeLibCall(DAG, RoundCall, Op.getValueType(),
	                                           Src, Options, DL, Chain);
	    if (Strict)
	      return DAG.getMergeValues({Rounded, Chain}, DL);
	    return Rounded;
	  }

	  // Expand cases where the input is a vector bigger than NEON.
	  if (useSVEForFixedLengthVectorVT(FromVT))
	    return SDValue();

	  // It's legal except when f128 is involved
	  return Op;
	}

	/// Lower a vector FP_TO_SINT / FP_TO_UINT.
	///
	/// - f16 elements without full FP16 support: extend the input to f32 first.
	/// - result narrower than input: convert at the input width, then TRUNCATE.
	/// - result wider than input: FP_EXTEND the input to the result's element
	///   width, then convert.
	/// - equal sizes: Op is returned unchanged.
	SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
	  // Any additional optimization in this function should be recorded
	  // in the cost tables.
	  SDLoc DL(Op);
	  SDValue Src = Op.getOperand(0);
	  const EVT SrcVT = Src.getValueType();
	  const EVT DstVT = Op.getValueType();
	  const unsigned EltCount = SrcVT.getVectorNumElements();
	  const unsigned ConvOpc = Op.getOpcode();

	  // f16 conversions are promoted to f32 when full fp16 is not supported.
	  const bool PromoteHalf =
	      SrcVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16();
	  if (PromoteHalf) {
	    MVT PromotedVT = MVT::getVectorVT(MVT::f32, EltCount);
	    SDValue Promoted = DAG.getNode(ISD::FP_EXTEND, DL, PromotedVT, Src);
	    return DAG.getNode(ConvOpc, DL, DstVT, Promoted);
	  }

	  if (DstVT.getSizeInBits() < SrcVT.getSizeInBits()) {
	    EVT SameWidthIntVT = SrcVT.changeVectorElementTypeToInteger();
	    SDValue Converted = DAG.getNode(ConvOpc, DL, SameWidthIntVT, Src);
	    return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Converted);
	  }

	  if (DstVT.getSizeInBits() > SrcVT.getSizeInBits()) {
	    MVT WideFPVT =
	        MVT::getVectorVT(MVT::getFloatingPointVT(DstVT.getScalarSizeInBits()),
	                         DstVT.getVectorNumElements());
	    SDValue Widened = DAG.getNode(ISD::FP_EXTEND, DL, WideFPVT, Src);
	    return DAG.getNode(ConvOpc, DL, DstVT, Widened);
	  }

	  // Type changing conversions are illegal.
	  return Op;
	}

	/// Lower (STRICT_)FP_TO_SINT / (STRICT_)FP_TO_UINT.
	///
	/// - vector source: delegated to LowerVectorFP_TO_INT.
	/// - f16 source without full FP16 support: extend to f32 and convert
	///   (strict variants are not implemented and assert).
	/// - f128 source: emitted as a libcall via LowerF128Call.
	/// - anything else: Op is returned unchanged.
	SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  bool IsStrict = Op->isStrictFPOpcode();
	  // For strict nodes operand 0 is the chain and the source is operand 1.
	  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

	  if (SrcVal.getValueType().isVector())
	    return LowerVectorFP_TO_INT(Op, DAG);

	  // f16 conversions are promoted to f32 when full fp16 is not supported.
	  if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
	    assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
	    SDLoc dl(Op);
	    return DAG.getNode(
	        Op.getOpcode(), dl, Op.getValueType(),
	        DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
	  }

	  if (SrcVal.getValueType() != MVT::f128) {
	    // It's legal except when f128 is involved
	    return Op;
	  }

	  // Pick the signed or unsigned conversion libcall.
	  RTLIB::Libcall LC;
	  if (Op.getOpcode() == ISD::FP_TO_SINT ||
	      Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
	    LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType());
	  else
	    LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType());

	  return LowerF128Call(Op, DAG, LC);
	}

	/// Lower a vector SINT_TO_FP / UINT_TO_FP.
	///
	/// - result narrower than input: convert to a floating-point vector of the
	///   input's element width, then FP_ROUND to the result type.
	/// - result wider than input: sign/zero-extend the integer input to the
	///   result's element width, then convert.
	/// - equal sizes: Op is returned unchanged.
	static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
	  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
	  // Any additional optimization in this function should be recorded
	  // in the cost tables.
	  const EVT DstVT = Op.getValueType();
	  SDLoc DL(Op);
	  SDValue Src = Op.getOperand(0);
	  const EVT SrcVT = Src.getValueType();
	  const unsigned ConvOpc = Op.getOpcode();

	  if (DstVT.getSizeInBits() < SrcVT.getSizeInBits()) {
	    MVT SameWidthFPVT =
	        MVT::getVectorVT(MVT::getFloatingPointVT(SrcVT.getScalarSizeInBits()),
	                         SrcVT.getVectorNumElements());
	    SDValue Converted = DAG.getNode(ConvOpc, DL, SameWidthFPVT, Src);
	    return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Converted,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  if (DstVT.getSizeInBits() > SrcVT.getSizeInBits()) {
	    // Signed conversions need a sign extension; unsigned ones a zero
	    // extension.
	    const bool IsSigned = ConvOpc == ISD::SINT_TO_FP;
	    unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	    EVT WideIntVT = DstVT.changeVectorElementTypeToInteger();
	    SDValue Extended = DAG.getNode(ExtendOpc, DL, WideIntVT, Src);
	    return DAG.getNode(ConvOpc, DL, DstVT, Extended);
	  }

	  return Op;
	}

	/// Lower (STRICT_)SINT_TO_FP / (STRICT_)UINT_TO_FP.
	///
	/// - vector result: delegated to LowerVectorINT_TO_FP.
	/// - f16 result without full FP16 support: convert to f32, then FP_ROUND to
	///   f16 (strict variants are not implemented and assert).
	/// - i128 source: returns an empty SDValue.
	/// - f128 result: emitted as a libcall via LowerF128Call.
	/// - anything else: Op is returned unchanged.
	SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  if (Op.getValueType().isVector())
	    return LowerVectorINT_TO_FP(Op, DAG);

	  bool IsStrict = Op->isStrictFPOpcode();
	  // For strict nodes operand 0 is the chain and the source is operand 1.
	  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

	  // f16 conversions are promoted to f32 when full fp16 is not supported.
	  if (Op.getValueType() == MVT::f16 &&
	      !Subtarget->hasFullFP16()) {
	    assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
	    SDLoc dl(Op);
	    return DAG.getNode(
	        ISD::FP_ROUND, dl, MVT::f16,
	        DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
	        DAG.getIntPtrConstant(0, dl));
	  }

	  // i128 conversions are libcalls.
	  if (SrcVal.getValueType() == MVT::i128)
	    return SDValue();

	  // Other conversions are legal, unless it's to the completely software-based
	  // fp128.
	  if (Op.getValueType() != MVT::f128)
	    return Op;

	  // Pick the signed or unsigned conversion libcall.
	  RTLIB::Libcall LC;
	  if (Op.getOpcode() == ISD::SINT_TO_FP ||
	      Op.getOpcode() == ISD::STRICT_SINT_TO_FP)
	    LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType());
	  else
	    LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType());

	  return LowerF128Call(Op, DAG, LC);
	}

	/// Lower FSINCOS to a call of the SINCOS_STRET libcall (f64 or f32 variant,
	/// chosen by the argument type). The call takes the single argument and
	/// returns a struct of two values of the argument's type; the call's result
	/// value is returned.
	SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  // For iOS, we want to call an alternative entry point: __sincos_stret,
	  // which returns the values in two S / D registers.
	  SDLoc dl(Op);
	  SDValue Arg = Op.getOperand(0);
	  EVT ArgVT = Arg.getValueType();
	  // The IR type is held by pointer: it is stored in ArgListEntry::Ty and
	  // passed to StructType::get below.
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

	  ArgListTy Args;
	  ArgListEntry Entry;

	  Entry.Node = Arg;
	  Entry.Ty = ArgTy;
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
	                                        : RTLIB::SINCOS_STRET_F32;
	  const char *LibcallName = getLibcallName(LC);
	  SDValue Callee =
	      DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));

	  // The callee returns {sin, cos}-style pair as a two-element struct of the
	  // argument type.
	  StructType *RetTy = StructType::get(ArgTy, ArgTy);
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(DAG.getEntryNode())
	      .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));

	  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
	  return CallResult.first;
	}

	/// Lower a BITCAST producing f16 or bf16 from an i16 operand (asserted):
	/// any-extend the integer to i32, bitcast it to f32, then take the hsub
	/// sub-register with EXTRACT_SUBREG.
	///
	/// Returns an empty SDValue for every other result type.
	static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
	  const EVT DstVT = Op.getValueType();
	  const bool IsHalfType = DstVT == MVT::f16 || DstVT == MVT::bf16;
	  if (!IsHalfType)
	    return SDValue();

	  assert(Op.getOperand(0).getValueType() == MVT::i16);
	  SDLoc Loc(Op);

	  SDValue Widened =
	      DAG.getNode(ISD::ANY_EXTEND, Loc, MVT::i32, Op.getOperand(0));
	  SDValue AsF32 = DAG.getNode(ISD::BITCAST, Loc, MVT::f32, Widened);
	  SDValue SubRegIdx = DAG.getTargetConstant(AArch64::hsub, Loc, MVT::i32);
	  SDNode *Extract = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, Loc,
	                                       DstVT, AsF32, SubRegIdx);
	  return SDValue(Extract, 0);
	}

	/// Return the vector type OrigVT should be extended to so that it occupies
	/// 64 bits. Types that are already 64 bits or larger are returned as-is.
	/// Only v2i8, v2i16 and v4i8 are expected among the smaller types; anything
	/// else is unreachable.
	static EVT getExtensionTo64Bits(const EVT &OrigVT) {
	  if (OrigVT.getSizeInBits() >= 64)
	    return OrigVT;

	  assert(OrigVT.isSimple() && "Expecting a simple value type");

	  const MVT NarrowVT = OrigVT.getSimpleVT();
	  // Two-element vectors widen to 2 x i32.
	  if (NarrowVT == MVT::v2i8 || NarrowVT == MVT::v2i16)
	    return MVT::v2i32;
	  // Four byte elements widen to 4 x i16.
	  if (NarrowVT == MVT::v4i8)
	    return MVT::v4i16;

	  llvm_unreachable("Unexpected Vector Type");
	}

	/// N is a value of type OrigTy that had been extended to the 128-bit type
	/// ExtTy (asserted). If OrigTy is narrower than 64 bits, re-extend N with
	/// ExtOpcode to the 64-bit type chosen by getExtensionTo64Bits; otherwise
	/// return N unchanged.
	static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
	                                                 const EVT &OrigTy,
	                                                 const EVT &ExtTy,
	                                                 unsigned ExtOpcode) {
	  assert(ExtTy.is128BitVector() && "Unexpected extension size");

	  // Already wide enough to be used directly.
	  if (OrigTy.getSizeInBits() >= 64)
	    return N;

	  // Must extend size to at least 64 bits to be used as an operand for VMULL.
	  const EVT WidenedVT = getExtensionTo64Bits(OrigTy);
	  SDLoc Loc(N);
	  return DAG.getNode(ExtOpcode, Loc, WidenedVT, N);
	}

	/// Return true if N is a BUILD_VECTOR whose operands are all integer
	/// constants that fit in half of the element width, interpreted as signed
	/// values when isSigned is set and as unsigned values otherwise.
	static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
	                                   bool isSigned) {
	  const EVT VecVT = N->getValueType(0);

	  if (N->getOpcode() != ISD::BUILD_VECTOR)
	    return false;

	  const unsigned HalfEltBits = VecVT.getScalarSizeInBits() / 2;

	  for (const SDValue &Operand : N->op_values()) {
	    const auto *Const = dyn_cast<ConstantSDNode>(Operand);
	    // Any non-constant element disqualifies the vector.
	    if (!Const)
	      return false;

	    const bool FitsInHalf = isSigned
	                                ? isIntN(HalfEltBits, Const->getSExtValue())
	                                : isUIntN(HalfEltBits, Const->getZExtValue());
	    if (!FitsInHalf)
	      return false;
	  }

	  return true;
	}

	/// Produce the narrow operand to feed to a vector MULL for node N.
	///
	/// - SIGN_EXTEND / ZERO_EXTEND: return the un-extended operand, re-extended
	///   to 64 bits by addRequiredExtensionForVectorMULL when it is narrower.
	/// - BUILD_VECTOR of constants (asserted): rebuild the vector with elements
	///   of half the original width.
	static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
	  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
	    return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
	                                             N->getOperand(0)->getValueType(0),
	                                             N->getValueType(0),
	                                             N->getOpcode());

	  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
	  EVT VT = N->getValueType(0);
	  SDLoc dl(N);
	  unsigned EltSize = VT.getScalarSizeInBits() / 2;
	  unsigned NumElts = VT.getVectorNumElements();
	  MVT TruncVT = MVT::getIntegerVT(EltSize);
	  SmallVector<SDValue, 8> Ops;
	  for (unsigned i = 0; i != NumElts; ++i) {
	    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
	    const APInt &CInt = C->getAPIntValue();
	    // Element types smaller than 32 bits are not legal, so use i32 elements.
	    // The values are implicitly truncated so sext vs. zext doesn't matter.
	    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
	  }
	  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
	}

	/// Return true if N is a SIGN_EXTEND node, or a BUILD_VECTOR accepted by
	/// isExtendedBUILD_VECTOR in signed mode.
	static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
	  return N->getOpcode() == ISD::SIGN_EXTEND ||
	         isExtendedBUILD_VECTOR(N, DAG, true);
	}

	/// Return true if N is a ZERO_EXTEND node, or a BUILD_VECTOR accepted by
	/// isExtendedBUILD_VECTOR in unsigned mode.
	static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
	  return N->getOpcode() == ISD::ZERO_EXTEND ||
	         isExtendedBUILD_VECTOR(N, DAG, false);
	}

	/// Return true if N is an ADD or SUB whose two operands each have a single
	/// use and are both sign-extended (see isSignExtended).
	static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
	  unsigned Opcode = N->getOpcode();
	  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
	    SDNode *N0 = N->getOperand(0).getNode();
	    SDNode *N1 = N->getOperand(1).getNode();
	    return N0->hasOneUse() && N1->hasOneUse() &&
	           isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
	  }
	  return false;
	}

	/// Return true if N is an ADD or SUB whose two operands each have a single
	/// use and are both zero-extended (see isZeroExtended).
	static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
	  unsigned Opcode = N->getOpcode();
	  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
	    SDNode *N0 = N->getOperand(0).getNode();
	    SDNode *N1 = N->getOperand(1).getNode();
	    return N0->hasOneUse() && N1->hasOneUse() &&
	           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
	  }
	  return false;
	}

	/// Lower FLT_ROUNDS_ by reading the FPCR through the aarch64_get_fpcr
	/// intrinsic and remapping its 2-bit rounding-mode field. Returns the merged
	/// pair {i32 result, output chain}.
	SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  // The rounding mode is taken from bits 23:22 of the FPCR.
	  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3,
	  // 3->0, i.e. (mode + 1) mod 4. It is computed as
	  //   ((FPCR + (1 << 22)) >> 22) & 3
	  // so that the shift + and get folded into a bitfield extract.
	  SDLoc DL(Op);

	  SDValue InChain = Op.getOperand(0);
	  SDValue GetFPCR =
	      DAG.getConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64);
	  SDValue FPCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
	                             {MVT::i64, MVT::Other}, {InChain, GetFPCR});
	  SDValue OutChain = FPCR.getValue(1);

	  SDValue FPCRLow = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);

	  SDValue ModeIncrement = DAG.getConstant(1U << 22, DL, MVT::i32);
	  SDValue Bumped =
	      DAG.getNode(ISD::ADD, DL, MVT::i32, FPCRLow, ModeIncrement);

	  SDValue ModeShift = DAG.getConstant(22, DL, MVT::i32);
	  SDValue Shifted = DAG.getNode(ISD::SRL, DL, MVT::i32, Bumped, ModeShift);

	  SDValue ModeMask = DAG.getConstant(3, DL, MVT::i32);
	  SDValue Rounds = DAG.getNode(ISD::AND, DL, MVT::i32, Shifted, ModeMask);

	  return DAG.getMergeValues({Rounds, OutChain}, DL);
	}

	/// Custom-lower a 128-bit integer vector MUL (asserted).
	///
	/// - both operands sign-extended: emit SMULL on the narrow operands.
	/// - both operands zero-extended: emit UMULL on the narrow operands.
	/// - one operand extended and the other an ADD/SUB of matching extensions:
	///   distribute the multiply, emitting (MULL A, C) +/- (MULL B, C).
	/// - otherwise: return an empty SDValue for v2i64 (so it gets expanded) and
	///   Op unchanged for every other type.
	static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
	  // Multiplications are only custom-lowered for 128-bit vectors so that
	  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
	  EVT VT = Op.getValueType();
	  assert(VT.is128BitVector() && VT.isInteger() &&
	         "unexpected type for custom-lowering ISD::MUL");
	  SDNode *N0 = Op.getOperand(0).getNode();
	  SDNode *N1 = Op.getOperand(1).getNode();
	  unsigned NewOpc = 0;
	  bool isMLA = false;
	  bool isN0SExt = isSignExtended(N0, DAG);
	  bool isN1SExt = isSignExtended(N1, DAG);
	  if (isN0SExt && isN1SExt)
	    NewOpc = AArch64ISD::SMULL;
	  else {
	    bool isN0ZExt = isZeroExtended(N0, DAG);
	    bool isN1ZExt = isZeroExtended(N1, DAG);
	    if (isN0ZExt && isN1ZExt)
	      NewOpc = AArch64ISD::UMULL;
	    else if (isN1SExt || isN1ZExt) {
	      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
	      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
	      if (isN1SExt && isAddSubSExt(N0, DAG)) {
	        NewOpc = AArch64ISD::SMULL;
	        isMLA = true;
	      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
	        NewOpc = AArch64ISD::UMULL;
	        isMLA = true;
	      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
	        // The ADD/SUB is on the right-hand side; swap so that N0 is always
	        // the ADD/SUB node in the MLA path below.
	        std::swap(N0, N1);
	        NewOpc = AArch64ISD::UMULL;
	        isMLA = true;
	      }
	    }

	    if (!NewOpc) {
	      if (VT == MVT::v2i64)
	        // Fall through to expand this. It is not legal.
	        return SDValue();
	      else
	        // Other vector multiplications are legal.
	        return Op;
	    }
	  }

	  // Legalize to a S/UMULL instruction
	  SDLoc DL(Op);
	  SDValue Op0;
	  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
	  if (!isMLA) {
	    Op0 = skipExtensionForVectorMULL(N0, DAG);
	    assert(Op0.getValueType().is64BitVector() &&
	           Op1.getValueType().is64BitVector() &&
	           "unexpected types for extended operands to VMULL");
	    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
	  }
	  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
	  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
	  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
	  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
	  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
	  EVT Op1VT = Op1.getValueType();
	  // N0's own opcode (ADD or SUB) combines the two partial products.
	  return DAG.getNode(N0->getOpcode(), DL, VT,
	                     DAG.getNode(NewOpc, DL, VT,
	                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
	                     DAG.getNode(NewOpc, DL, VT,
	                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
	}

	/// Build an AArch64ISD::PTRUE node of type VT whose single operand is the
	/// given pattern, encoded as an i32 target constant.
	static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
	                               int Pattern) {
	  SDValue PatternOperand = DAG.getTargetConstant(Pattern, DL, MVT::i32);
	  return DAG.getNode(AArch64ISD::PTRUE, DL, VT, PatternOperand);
	}

	/// Custom-lower selected INTRINSIC_WO_CHAIN nodes. Operand 0 holds the
	/// intrinsic ID; the intrinsic's own arguments start at operand 1.
	///
	/// Most handled intrinsics map one-to-one onto a generic ISD or AArch64ISD
	/// node with the same operands. Intrinsics not listed here yield an empty
	/// SDValue, i.e. they are not custom lowered.
	SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
	                                                       SelectionDAG &DAG) const {
	  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  SDLoc dl(Op);
	  switch (IntNo) {
	  default: return SDValue();    // Don't custom lower most intrinsics.
	  case Intrinsic::thread_pointer: {
	    EVT PtrVT = getPointerTy(DAG.getDataLayout());
	    return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
	  }
	  case Intrinsic::aarch64_neon_abs: {
	    EVT Ty = Op.getValueType();
	    if (Ty == MVT::i64) {
	      // Scalar i64: round-trip through v1i64 so the vector ABS can be used.
	      SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
	                                   Op.getOperand(1));
	      Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
	      return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
	    } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
	      return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
	    } else {
	      report_fatal_error("Unexpected type for AArch64 NEON intrinic");
	    }
	  }
	  case Intrinsic::aarch64_neon_smax:
	    return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2));
	  case Intrinsic::aarch64_neon_umax:
	    return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2));
	  case Intrinsic::aarch64_neon_smin:
	    return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2));
	  case Intrinsic::aarch64_neon_umin:
	    return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2));

	  case Intrinsic::aarch64_sve_sunpkhi:
	    return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
	                       Op.getOperand(1));
	  case Intrinsic::aarch64_sve_sunpklo:
	    return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
	                       Op.getOperand(1));
	  case Intrinsic::aarch64_sve_uunpkhi:
	    return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
	                       Op.getOperand(1));
	  case Intrinsic::aarch64_sve_uunpklo:
	    return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
	                       Op.getOperand(1));
	  case Intrinsic::aarch64_sve_clasta_n:
	    return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
	  case Intrinsic::aarch64_sve_clastb_n:
	    return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
	  case Intrinsic::aarch64_sve_lasta:
	    return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2));
	  case Intrinsic::aarch64_sve_lastb:
	    return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2));
	  case Intrinsic::aarch64_sve_rev:
	    return DAG.getNode(AArch64ISD::REV, dl, Op.getValueType(),
	                       Op.getOperand(1));
	  case Intrinsic::aarch64_sve_tbl:
	    return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2));
	  case Intrinsic::aarch64_sve_trn1:
	    return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2));
	  case Intrinsic::aarch64_sve_trn2:
	    return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2));
	  case Intrinsic::aarch64_sve_uzp1:
	    return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2));
	  case Intrinsic::aarch64_sve_uzp2:
	    return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2));
	  case Intrinsic::aarch64_sve_zip1:
	    return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2));
	  case Intrinsic::aarch64_sve_zip2:
	    return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2));
	  case Intrinsic::aarch64_sve_ptrue:
	    return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
	                       Op.getOperand(1));
	  case Intrinsic::aarch64_sve_dupq_lane:
	    return LowerDUPQLane(Op, DAG);
	  case Intrinsic::aarch64_sve_convert_from_svbool:
	    return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
	                       Op.getOperand(1));
	  case Intrinsic::aarch64_sve_convert_to_svbool: {
	    EVT OutVT = Op.getValueType();
	    EVT InVT = Op.getOperand(1).getValueType();
	    // Return the operand if the cast isn't changing type,
	    // i.e. <n x 16 x i1> -> <n x 16 x i1>
	    if (InVT == OutVT)
	      return Op.getOperand(1);
	    // Otherwise, zero the newly introduced lanes.
	    SDValue Reinterpret =
	        DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Op.getOperand(1));
	    SDValue Mask = getPTrue(DAG, dl, InVT, AArch64SVEPredPattern::all);
	    SDValue MaskReinterpret =
	        DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Mask);
	    return DAG.getNode(ISD::AND, dl, OutVT, Reinterpret, MaskReinterpret);
	  }

	  case Intrinsic::aarch64_sve_insr: {
	    SDValue Scalar = Op.getOperand(2);
	    EVT ScalarTy = Scalar.getValueType();
	    // i8 / i16 scalars are any-extended to i32 before being inserted.
	    if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
	      Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);

	    return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
	                       Op.getOperand(1), Scalar);
	  }

	  case Intrinsic::localaddress: {
	    const auto &MF = DAG.getMachineFunction();
	    const auto *RegInfo = Subtarget->getRegisterInfo();
	    unsigned Reg = RegInfo->getLocalAddressRegister(MF);
	    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
	                              Op.getSimpleValueType());
	  }

	  case Intrinsic::eh_recoverfp: {
	    // FIXME: This needs to be implemented to correctly handle highly aligned
	    // stack objects. For now we simply return the incoming FP. Refer D53541
	    // for more details.
	    SDValue FnOp = Op.getOperand(1);
	    SDValue IncomingFPOp = Op.getOperand(2);
	    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
	    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
	    if (!Fn)
	      report_fatal_error(
	          "llvm.eh.recoverfp must take a function as the first argument");
	    return IncomingFPOp;
	  }

	  case Intrinsic::aarch64_neon_vsri:
	  case Intrinsic::aarch64_neon_vsli: {
	    EVT Ty = Op.getValueType();

	    if (!Ty.isVector())
	      report_fatal_error("Unexpected type for aarch64_neon_vsli");

	    assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());

	    bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
	    unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
	    return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
	                       Op.getOperand(3));
	  }

	  case Intrinsic::aarch64_neon_srhadd:
	  case Intrinsic::aarch64_neon_urhadd: {
	    bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd;
	    unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
	    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
	                       Op.getOperand(2));
	  }
	  }
	}

	/// Report whether folding an extension into a vector load is desirable:
	/// true exactly when the extended value's type is a scalable vector.
	bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
	  const EVT ExtendedVT = ExtVal.getValueType();
	  return ExtendedVT.isScalableVector();
	}

	// Custom lowering of a truncating store of v4i8, whose value has been
	// promoted to v4i16 (both types are asserted).
	//
	// The promoted v4i16 value is widened to v8i16 by concatenating an undef
	// half, truncated to v8i8, reinterpreted as v2i32, and its word lane 0 (the
	// four stored bytes) is written with an ordinary store. This optimizes the
	// store to:
	//
	// xtn v0.8b, v0.8h
	// str s0, [x0]
	static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
	                                        EVT VT, EVT MemVT,
	                                        SelectionDAG &DAG) {
	  assert(VT.isVector() && "VT should be a vector type");
	  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);

	  SDValue Stored = ST->getValue();

	  // Undefined upper half used to pad the value out to 128 bits.
	  SDValue UndefElt = DAG.getUNDEF(MVT::i16);
	  SDValue UndefHalf = DAG.getBuildVector(
	      MVT::v4i16, DL, {UndefElt, UndefElt, UndefElt, UndefElt});

	  SDValue Widened =
	      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Stored, UndefHalf);
	  SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Widened);

	  SDValue AsWords = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Narrowed);
	  SDValue LaneZero = DAG.getConstant(0, DL, MVT::i64);
	  SDValue LowWord = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
	                                AsWords, LaneZero);

	  return DAG.getStore(ST->getChain(), DL, LowWord, ST->getBasePtr(),
	                      ST->getMemOperand());
	}

	// Custom lowering for any store, vector or scalar and/or default or with
	// a truncate operations.
	//
	// Vector stores:
	//  - fixed-length vectors handled via SVE go to
	//    LowerFixedLengthVectorStoreToSVE;
	//  - under-aligned stores that the target does not allow misaligned are
	//    scalarized;
	//  - truncating stores go to LowerTruncateVectorStore (v4i16 -> v4i8);
	//  - 256-bit non-temporal stores are split in two halves and emitted as STNP.
	// Scalar stores:
	//  - volatile i128 stores are split in two i64 halves and emitted as STP.
	// Everything else returns an empty SDValue.
	SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
	                                          SelectionDAG &DAG) const {
	  SDLoc Dl(Op);
	  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
	  assert (StoreNode && "Can only custom lower store nodes");

	  SDValue Value = StoreNode->getValue();

	  EVT VT = Value.getValueType();
	  EVT MemVT = StoreNode->getMemoryVT();

	  if (VT.isVector()) {
	    if (useSVEForFixedLengthVectorVT(VT))
	      return LowerFixedLengthVectorStoreToSVE(Op, DAG);

	    unsigned AS = StoreNode->getAddressSpace();
	    Align Alignment = StoreNode->getAlign();
	    if (Alignment < MemVT.getStoreSize() &&
	        !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
	                                        StoreNode->getMemOperand()->getFlags(),
	                                        nullptr)) {
	      return scalarizeVectorStore(StoreNode, DAG);
	    }

	    if (StoreNode->isTruncatingStore()) {
	      return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
	    }
	    // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
	    // the custom lowering, as there are no un-paired non-temporal stores and
	    // legalization will break up 256 bit inputs.
	    if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
	        MemVT.getVectorElementCount().Min % 2u == 0 &&
	        ((MemVT.getScalarSizeInBits() == 8u ||
	          MemVT.getScalarSizeInBits() == 16u ||
	          MemVT.getScalarSizeInBits() == 32u ||
	          MemVT.getScalarSizeInBits() == 64u))) {
	      // Low half: elements [0, N/2).
	      SDValue Lo =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
	                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
	                      StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
	      // High half: elements [N/2, N).
	      SDValue Hi = DAG.getNode(
	          ISD::EXTRACT_SUBVECTOR, Dl,
	          MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
	          StoreNode->getValue(),
	          DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64));
	      SDValue Result = DAG.getMemIntrinsicNode(
	          AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
	          {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
	          StoreNode->getMemoryVT(), StoreNode->getMemOperand());
	      return Result;
	    }
	  } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
	    assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
	    // Split the i128 into its two i64 elements (indices 0 and 1).
	    SDValue Lo =
	        DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
	                    DAG.getConstant(0, Dl, MVT::i64));
	    SDValue Hi =
	        DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
	                    DAG.getConstant(1, Dl, MVT::i64));
	    SDValue Result = DAG.getMemIntrinsicNode(
	        AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
	        {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
	        StoreNode->getMemoryVT(), StoreNode->getMemOperand());
	    return Result;
	  }

	  return SDValue();
	}

	/// Entry point for custom lowering: dispatches on Op's opcode to the
	/// matching Lower* routine and returns whatever that routine returns.
	/// Opcodes without a case below hit llvm_unreachable.
	SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
	SelectionDAG &DAG) const {
	LLVM_DEBUG(dbgs() << "Custom lowering: ");
	LLVM_DEBUG(Op.dump());

	switch (Op.getOpcode()) {
	default:
	llvm_unreachable("unimplemented operand");
	return SDValue();
	case ISD::BITCAST:
	return LowerBITCAST(Op, DAG);
	case ISD::GlobalAddress:
	return LowerGlobalAddress(Op, DAG);
	case ISD::GlobalTLSAddress:
	return LowerGlobalTLSAddress(Op, DAG);
	case ISD::SETCC:
	case ISD::STRICT_FSETCC:
	case ISD::STRICT_FSETCCS:
	return LowerSETCC(Op, DAG);
	case ISD::BR_CC:
	return LowerBR_CC(Op, DAG);
	case ISD::SELECT:
	return LowerSELECT(Op, DAG);
	case ISD::SELECT_CC:
	return LowerSELECT_CC(Op, DAG);
	case ISD::JumpTable:
	return LowerJumpTable(Op, DAG);
	case ISD::BR_JT:
	return LowerBR_JT(Op, DAG);
	case ISD::ConstantPool:
	return LowerConstantPool(Op, DAG);
	case ISD::BlockAddress:
	return LowerBlockAddress(Op, DAG);
	case ISD::VASTART:
	return LowerVASTART(Op, DAG);
	case ISD::VACOPY:
	return LowerVACOPY(Op, DAG);
	case ISD::VAARG:
	return LowerVAARG(Op, DAG);
	case ISD::ADDC:
	case ISD::ADDE:
	case ISD::SUBC:
	case ISD::SUBE:
	return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
	case ISD::SADDO:
	case ISD::UADDO:
	case ISD::SSUBO:
	case ISD::USUBO:
	case ISD::SMULO:
	case ISD::UMULO:
	return LowerXALUO(Op, DAG);
	// FADD: fixed-length vectors handled via SVE become a predicated op;
	// everything else goes through the f128 libcall path.
	case ISD::FADD:
	if (useSVEForFixedLengthVectorVT(Op.getValueType()))
	return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
	return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
	case ISD::FSUB:
	return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
	case ISD::FMUL:
	return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
	case ISD::FMA:
	return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
	case ISD::FDIV:
	return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
	case ISD::FP_ROUND:
	case ISD::STRICT_FP_ROUND:
	return LowerFP_ROUND(Op, DAG);
	case ISD::FP_EXTEND:
	return LowerFP_EXTEND(Op, DAG);
	case ISD::FRAMEADDR:
	return LowerFRAMEADDR(Op, DAG);
	case ISD::SPONENTRY:
	return LowerSPONENTRY(Op, DAG);
	case ISD::RETURNADDR:
	return LowerRETURNADDR(Op, DAG);
	case ISD::ADDROFRETURNADDR:
	return LowerADDROFRETURNADDR(Op, DAG);
	case ISD::INSERT_VECTOR_ELT:
	return LowerINSERT_VECTOR_ELT(Op, DAG);
	case ISD::EXTRACT_VECTOR_ELT:
	return LowerEXTRACT_VECTOR_ELT(Op, DAG);
	case ISD::BUILD_VECTOR:
	return LowerBUILD_VECTOR(Op, DAG);
	case ISD::VECTOR_SHUFFLE:
	return LowerVECTOR_SHUFFLE(Op, DAG);
	case ISD::SPLAT_VECTOR:
	return LowerSPLAT_VECTOR(Op, DAG);
	case ISD::EXTRACT_SUBVECTOR:
	return LowerEXTRACT_SUBVECTOR(Op, DAG);
	case ISD::INSERT_SUBVECTOR:
	return LowerINSERT_SUBVECTOR(Op, DAG);
	// Integer divide and min/max are always lowered to predicated nodes here.
	case ISD::SDIV:
	return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED);
	case ISD::UDIV:
	return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED);
	case ISD::SMIN:
	return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_MERGE_OP1);
	case ISD::UMIN:
	return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_MERGE_OP1);
	case ISD::SMAX:
	return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_MERGE_OP1);
	case ISD::UMAX:
	return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_MERGE_OP1);
	case ISD::SRA:
	case ISD::SRL:
	case ISD::SHL:
	return LowerVectorSRA_SRL_SHL(Op, DAG);
	case ISD::SHL_PARTS:
	return LowerShiftLeftParts(Op, DAG);
	case ISD::SRL_PARTS:
	case ISD::SRA_PARTS:
	return LowerShiftRightParts(Op, DAG);
	case ISD::CTPOP:
	return LowerCTPOP(Op, DAG);
	case ISD::FCOPYSIGN:
	return LowerFCOPYSIGN(Op, DAG);
	case ISD::OR:
	return LowerVectorOR(Op, DAG);
	case ISD::XOR:
	return LowerXOR(Op, DAG);
	case ISD::PREFETCH:
	return LowerPREFETCH(Op, DAG);
	case ISD::SINT_TO_FP:
	case ISD::UINT_TO_FP:
	case ISD::STRICT_SINT_TO_FP:
	case ISD::STRICT_UINT_TO_FP:
	return LowerINT_TO_FP(Op, DAG);
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT:
	case ISD::STRICT_FP_TO_SINT:
	case ISD::STRICT_FP_TO_UINT:
	return LowerFP_TO_INT(Op, DAG);
	case ISD::FSINCOS:
	return LowerFSINCOS(Op, DAG);
	case ISD::FLT_ROUNDS_:
	return LowerFLT_ROUNDS_(Op, DAG);
	case ISD::MUL:
	return LowerMUL(Op, DAG);
	case ISD::INTRINSIC_WO_CHAIN:
	return LowerINTRINSIC_WO_CHAIN(Op, DAG);
	case ISD::STORE:
	return LowerSTORE(Op, DAG);
	case ISD::VECREDUCE_ADD:
	case ISD::VECREDUCE_SMAX:
	case ISD::VECREDUCE_SMIN:
	case ISD::VECREDUCE_UMAX:
	case ISD::VECREDUCE_UMIN:
	case ISD::VECREDUCE_FMAX:
	case ISD::VECREDUCE_FMIN:
	return LowerVECREDUCE(Op, DAG);
	case ISD::ATOMIC_LOAD_SUB:
	return LowerATOMIC_LOAD_SUB(Op, DAG);
	case ISD::ATOMIC_LOAD_AND:
	return LowerATOMIC_LOAD_AND(Op, DAG);
	case ISD::DYNAMIC_STACKALLOC:
	return LowerDYNAMIC_STACKALLOC(Op, DAG);
	case ISD::VSCALE:
	return LowerVSCALE(Op, DAG);
	case ISD::TRUNCATE:
	return LowerTRUNCATE(Op, DAG);
	// LOAD and ADD are only expected here for fixed-length vector types that
	// are handled via SVE; any other request is treated as unreachable.
	case ISD::LOAD:
	if (useSVEForFixedLengthVectorVT(Op.getValueType()))
	return LowerFixedLengthVectorLoadToSVE(Op, DAG);
	llvm_unreachable("Unexpected request to lower ISD::LOAD");
	case ISD::ADD:
	if (useSVEForFixedLengthVectorVT(Op.getValueType()))
	return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
	llvm_unreachable("Unexpected request to lower ISD::ADD");
	}
	}

	/// Reports whether SVE may be used at all to lower fixed length vectors.
	bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {
	  // Without SVE there is nothing to decide.
	  if (!Subtarget->hasSVE())
	    return false;

	  // NEON is preferred unless the SVE registers are known to be larger, i.e.
	  // the guaranteed minimum SVE vector size is at least 256 bits.
	  return Subtarget->getMinSVEVectorSizeInBits() >= 256;
	}

	/// Reports whether the fixed length vector type \p VT should be lowered using
	/// SVE. Returns false for anything that is not a fixed length vector, and
	/// whenever useSVEForFixedLengthVectors() is false.
	bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const {
	  if (!useSVEForFixedLengthVectors() || !VT.isFixedLengthVector())
	    return false;

	  const EVT EltVT = VT.getVectorElementType();

	  // Fixed length predicates should be promoted to i8.
	  // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
	  if (EltVT == MVT::i1)
	    return false;

	  // Only accept element types we can scalarize if required.
	  switch (EltVT.getSimpleVT().SimpleTy) {
	  case MVT::i8:
	  case MVT::i16:
	  case MVT::i32:
	  case MVT::i64:
	  case MVT::f16:
	  case MVT::f32:
	  case MVT::f64:
	    break;
	  default:
	    return false;
	  }

	  // Vectors of at most 128 bits are left to NEON so that NEON MVTs only
	  // belong to a single register class; vectors wider than the minimum SVE
	  // vector size do not fit.
	  if (VT.getSizeInBits() <= 128 ||
	      VT.getSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
	    return false;

	  // TODO: Perhaps an artificial restriction, but worth having whilst getting
	  // the base fixed length SVE support in place.
	  return VT.isPow2VectorType();
	}

	//===----------------------------------------------------------------------===//
	// Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	/// Selects the correct CCAssignFn for a given CallingConvention value.
	///
	/// \p IsVarArg selects the variadic flavour of the convention where one
	/// exists. Conventions not listed here are reported through
	/// report_fatal_error.
	CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
	                                                     bool IsVarArg) const {
	  switch (CC) {
	  case CallingConv::WebKit_JS:
	    return CC_AArch64_WebKit_JS;

	  case CallingConv::GHC:
	    return CC_AArch64_GHC;

	  case CallingConv::CFGuard_Check:
	    return CC_AArch64_Win64_CFGuard_Check;

	  case CallingConv::AArch64_VectorCall:
	  case CallingConv::AArch64_SVE_VectorCall:
	    return CC_AArch64_AAPCS;

	  case CallingConv::Win64:
	    if (IsVarArg)
	      return CC_AArch64_Win64_VarArg;
	    return CC_AArch64_AAPCS;

	  case CallingConv::C:
	  case CallingConv::Fast:
	  case CallingConv::PreserveMost:
	  case CallingConv::CXX_FAST_TLS:
	  case CallingConv::Swift:
	    // Variadic calls on Windows targets get the Win64 vararg convention.
	    if (Subtarget->isTargetWindows() && IsVarArg)
	      return CC_AArch64_Win64_VarArg;
	    if (Subtarget->isTargetDarwin()) {
	      if (!IsVarArg)
	        return CC_AArch64_DarwinPCS;
	      if (Subtarget->isTargetILP32())
	        return CC_AArch64_DarwinPCS_ILP32_VarArg;
	      return CC_AArch64_DarwinPCS_VarArg;
	    }
	    return CC_AArch64_AAPCS;

	  default:
	    report_fatal_error("Unsupported calling convention.");
	  }
	}

	/// Selects the CCAssignFn used for return values: the WebKit_JS variant for
	/// that calling convention, the AAPCS one for every other convention.
	CCAssignFn *
	AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
	  if (CC == CallingConv::WebKit_JS)
	    return RetCC_AArch64_WebKit_JS;
	  return RetCC_AArch64_AAPCS;
	}

	/// Lowers the incoming (formal) arguments of the current function.
	///
	/// Assigns a location to every entry of \p Ins, materialises each argument
	/// from its register or fixed stack slot and appends the resulting value to
	/// \p InVals (one entry per element of \p Ins). For variadic functions the
	/// vararg bookkeeping in AArch64FunctionInfo is filled in as well, and the
	/// size of the incoming stack argument area is recorded for later tail call
	/// decisions.
	///
	/// \returns the chain, updated when extra nodes had to be chained in.
	SDValue AArch64TargetLowering::LowerFormalArguments(
	    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());

	  // Assign locations to all of the incoming arguments.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
	                 *DAG.getContext());

	  // At this point, Ins[].VT may already be promoted to i32. To correctly
	  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
	  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
	  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
	  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
	  // LocVT.
	  unsigned NumArgs = Ins.size();
	  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
	  unsigned CurArgIdx = 0;
	  for (unsigned i = 0; i != NumArgs; ++i) {
	    MVT ValVT = Ins[i].VT;
	    if (Ins[i].isOrigArg()) {
	      // Step the IR argument iterator forward to the argument this input
	      // value originates from.
	      std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
	      CurArgIdx = Ins[i].getOrigArgIndex();

	      // Get type of the original argument.
	      EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
	                                  /*AllowUnknown*/ true);
	      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
	      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
	      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
	        ValVT = MVT::i8;
	      else if (ActualMVT == MVT::i16)
	        ValVT = MVT::i16;
	    }
	    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
	    bool Res =
	        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
	    assert(!Res && "Call operand has unhandled type");
	    (void)Res;
	  }
	  assert(ArgLocs.size() == Ins.size());
	  SmallVector<SDValue, 16> ArgValues;
	  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	    CCValAssign &VA = ArgLocs[i];

	    if (Ins[i].Flags.isByVal()) {
	      // Byval is used for HFAs in the PCS, but the system should work in a
	      // non-compliant manner for larger structs.
	      EVT PtrVT = getPointerTy(DAG.getDataLayout());
	      int Size = Ins[i].Flags.getByValSize();
	      unsigned NumRegs = (Size + 7) / 8;

	      // FIXME: This works on big-endian for composite byvals, which are the common
	      // case. It should also work for fundamental types too.
	      unsigned FrameIdx =
	          MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
	      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
	      InVals.push_back(FrameIdxN);

	      continue;
	    }

	    SDValue ArgValue;
	    if (VA.isRegLoc()) {
	      // Arguments stored in registers.
	      EVT RegVT = VA.getLocVT();
	      const TargetRegisterClass *RC;

	      if (RegVT == MVT::i32)
	        RC = &AArch64::GPR32RegClass;
	      else if (RegVT == MVT::i64)
	        RC = &AArch64::GPR64RegClass;
	      else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
	        RC = &AArch64::FPR16RegClass;
	      else if (RegVT == MVT::f32)
	        RC = &AArch64::FPR32RegClass;
	      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
	        RC = &AArch64::FPR64RegClass;
	      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
	        RC = &AArch64::FPR128RegClass;
	      else if (RegVT.isScalableVector() &&
	               RegVT.getVectorElementType() == MVT::i1)
	        RC = &AArch64::PPRRegClass;
	      else if (RegVT.isScalableVector())
	        RC = &AArch64::ZPRRegClass;
	      else
	        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

	      // Transform the arguments in physical registers into virtual ones.
	      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
	      ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);

	      // If this is an 8, 16 or 32-bit value, it is really passed promoted
	      // to 64 bits. Insert an assert[sz]ext to capture this, then
	      // truncate to the right size.
	      switch (VA.getLocInfo()) {
	      default:
	        llvm_unreachable("Unknown loc info!");
	      case CCValAssign::Full:
	        break;
	      case CCValAssign::Indirect:
	        assert(VA.getValVT().isScalableVector() &&
	               "Only scalable vectors can be passed indirectly");
	        break;
	      case CCValAssign::BCvt:
	        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
	        break;
	      case CCValAssign::AExt:
	      case CCValAssign::SExt:
	      case CCValAssign::ZExt:
	        break;
	      case CCValAssign::AExtUpper:
	        // The value sits in the upper 32 bits of the register: shift it down
	        // before converting to the value type.
	        ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
	                               DAG.getConstant(32, DL, RegVT));
	        ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
	        break;
	      }
	    } else { // !VA.isRegLoc(): the argument is passed in memory.
	      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
	      unsigned ArgOffset = VA.getLocMemOffset();
	      unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
	                              ? VA.getLocVT().getSizeInBits()
	                              : VA.getValVT().getSizeInBits()) / 8;

	      // On big-endian targets an argument smaller than 8 bytes (and not part
	      // of a consecutive-register block) is offset within its 8-byte slot.
	      uint32_t BEAlign = 0;
	      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
	          !Ins[i].Flags.isInConsecutiveRegs())
	        BEAlign = 8 - ArgSize;

	      int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);

	      // Create load nodes to retrieve arguments from the stack.
	      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));

	      // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
	      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
	      MVT MemVT = VA.getValVT();

	      switch (VA.getLocInfo()) {
	      default:
	        break;
	      case CCValAssign::Trunc:
	      case CCValAssign::BCvt:
	        MemVT = VA.getLocVT();
	        break;
	      case CCValAssign::Indirect:
	        assert(VA.getValVT().isScalableVector() &&
	               "Only scalable vectors can be passed indirectly");
	        MemVT = VA.getLocVT();
	        break;
	      case CCValAssign::SExt:
	        ExtType = ISD::SEXTLOAD;
	        break;
	      case CCValAssign::ZExt:
	        ExtType = ISD::ZEXTLOAD;
	        break;
	      case CCValAssign::AExt:
	        ExtType = ISD::EXTLOAD;
	        break;
	      }

	      ArgValue = DAG.getExtLoad(
	          ExtType, DL, VA.getLocVT(), Chain, FIN,
	          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
	          MemVT);
	    }

	    if (VA.getLocInfo() == CCValAssign::Indirect) {
	      assert(VA.getValVT().isScalableVector() &&
	             "Only scalable vectors can be passed indirectly");
	      // If value is passed via pointer - do a load.
	      ArgValue =
	          DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
	    }

	    if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
	      ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
	                             ArgValue, DAG.getValueType(MVT::i32));
	    InVals.push_back(ArgValue);
	  }

	  // varargs
	  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	  if (isVarArg) {
	    if (!Subtarget->isTargetDarwin() || IsWin64) {
	      // The AAPCS variadic function ABI is identical to the non-variadic
	      // one. As a result there may be more arguments in registers and we should
	      // save them for future reference.
	      // Win64 variadic functions also pass arguments in registers, but all float
	      // arguments are passed in integer registers.
	      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
	    }

	    // This will point to the next argument passed via stack.
	    unsigned StackOffset = CCInfo.getNextStackOffset();
	    // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
	    StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
	    FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));

	    if (MFI.hasMustTailInVarArgFunc()) {
	      SmallVector<MVT, 2> RegParmTypes;
	      RegParmTypes.push_back(MVT::i64);
	      RegParmTypes.push_back(MVT::f128);
	      // Compute the set of forwarded registers. The rest are scratch.
	      SmallVectorImpl<ForwardedRegister> &Forwards =
	          FuncInfo->getForwardedMustTailRegParms();
	      CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
	                                               CC_AArch64_AAPCS);

	      // Conservatively forward X8, since it might be used for aggregate return.
	      if (!CCInfo.isAllocated(AArch64::X8)) {
	        unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
	        Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
	      }
	    }
	  }

	  // On Windows, InReg pointers must be returned, so record the pointer in a
	  // virtual register at the start of the function so it can be returned in the
	  // epilogue.
	  if (IsWin64) {
	    for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
	      if (Ins[I].Flags.isInReg()) {
	        assert(!FuncInfo->getSRetReturnReg());

	        MVT PtrTy = getPointerTy(DAG.getDataLayout());
	        Register Reg =
	            MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
	        FuncInfo->setSRetReturnReg(Reg);

	        SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
	        Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
	        break;
	      }
	    }
	  }

	  unsigned StackArgSize = CCInfo.getNextStackOffset();
	  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
	  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
	    // This is a non-standard ABI so by fiat I say we're allowed to make full
	    // use of the stack area to be popped, which must be aligned to 16 bytes in
	    // any case:
	    StackArgSize = alignTo(StackArgSize, 16);

	    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
	    // a multiple of 16.
	    FuncInfo->setArgumentStackToRestore(StackArgSize);

	    // This realignment carries over to the available bytes below. Our own
	    // callers will guarantee the space is free by giving an aligned value to
	    // CALLSEQ_START.
	  }
	  // Even if we're not expected to free up the space, it's useful to know how
	  // much is there while considering tail calls (because we can reuse it).
	  FuncInfo->setBytesInStackArgArea(StackArgSize);

	  if (Subtarget->hasCustomCallingConv())
	    Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);

	  return Chain;
	}

	/// Stores the argument registers that \p CCInfo has not allocated to fixed
	/// arguments into a save area, records the area's frame index and size in
	/// AArch64FunctionInfo, and chains the stores into \p Chain.
	///
	/// X0-X7 are always handled; Q0-Q7 are handled only when the subtarget has
	/// FPARMv8 and the function does not use the Win64 calling convention.
	void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
	                                                SelectionDAG &DAG,
	                                                const SDLoc &DL,
	                                                SDValue &Chain) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());

	  // Every store emitted below; merged into a single TokenFactor at the end.
	  SmallVector<SDValue, 8> SpillStores;

	  // --- General purpose registers -----------------------------------------
	  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
	                                          AArch64::X3, AArch64::X4, AArch64::X5,
	                                          AArch64::X6, AArch64::X7 };
	  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
	  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);

	  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
	  int GPRIdx = 0;
	  if (GPRSaveSize) {
	    if (IsWin64) {
	      // Win64 uses a fixed object at a negative offset for the save area.
	      GPRIdx = MFI.CreateFixedObject(GPRSaveSize,
	                                     -static_cast<int>(GPRSaveSize), false);
	      if (GPRSaveSize % 16 != 0) {
	        // The extra size here, if triggered, will always be 8.
	        MFI.CreateFixedObject(16 - (GPRSaveSize & 15),
	                              -static_cast<int>(alignTo(GPRSaveSize, 16)),
	                              false);
	      }
	    } else {
	      GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
	    }

	    SDValue SlotAddr = DAG.getFrameIndex(GPRIdx, PtrVT);

	    for (unsigned RegNo = FirstVariadicGPR; RegNo < NumGPRArgRegs; ++RegNo) {
	      unsigned VReg = MF.addLiveIn(GPRArgRegs[RegNo], &AArch64::GPR64RegClass);
	      SDValue RegVal = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);

	      const MachinePointerInfo PtrInfo =
	          IsWin64 ? MachinePointerInfo::getFixedStack(
	                        MF, GPRIdx, (RegNo - FirstVariadicGPR) * 8)
	                  : MachinePointerInfo::getStack(MF, RegNo * 8);
	      SpillStores.push_back(
	          DAG.getStore(RegVal.getValue(1), DL, RegVal, SlotAddr, PtrInfo));

	      // Advance to the next 8-byte slot.
	      SlotAddr = DAG.getNode(ISD::ADD, DL, PtrVT, SlotAddr,
	                             DAG.getConstant(8, DL, PtrVT));
	    }
	  }
	  FuncInfo->setVarArgsGPRIndex(GPRIdx);
	  FuncInfo->setVarArgsGPRSize(GPRSaveSize);

	  // --- Floating point / SIMD registers ------------------------------------
	  if (Subtarget->hasFPARMv8() && !IsWin64) {
	    static const MCPhysReg FPRArgRegs[] = {
	        AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
	        AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
	    static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
	    unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);

	    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
	    int FPRIdx = 0;
	    if (FPRSaveSize) {
	      FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);

	      SDValue SlotAddr = DAG.getFrameIndex(FPRIdx, PtrVT);

	      for (unsigned RegNo = FirstVariadicFPR; RegNo < NumFPRArgRegs; ++RegNo) {
	        unsigned VReg =
	            MF.addLiveIn(FPRArgRegs[RegNo], &AArch64::FPR128RegClass);
	        SDValue RegVal = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);

	        SpillStores.push_back(
	            DAG.getStore(RegVal.getValue(1), DL, RegVal, SlotAddr,
	                         MachinePointerInfo::getStack(MF, RegNo * 16)));

	        // Advance to the next 16-byte slot.
	        SlotAddr = DAG.getNode(ISD::ADD, DL, PtrVT, SlotAddr,
	                               DAG.getConstant(16, DL, PtrVT));
	      }
	    }
	    FuncInfo->setVarArgsFPRIndex(FPRIdx);
	    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
	  }

	  if (!SpillStores.empty())
	    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SpillStores);
	}

	/// LowerCallResult - Copy the values returned by a call out of the physical
	/// registers assigned to them and append them to \p InVals.
	///
	/// When \p isThisReturn is set, the first result is taken from \p ThisVal
	/// instead of being copied from a register. Returns the updated chain.
	SDValue AArch64TargetLowering::LowerCallResult(
	    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
	    SDValue ThisVal) const {
	  CCAssignFn *RetCC = RetCC_AArch64_AAPCS;
	  if (CallConv == CallingConv::WebKit_JS)
	    RetCC = RetCC_AArch64_WebKit_JS;

	  // Assign locations to each value returned by this call.
	  SmallVector<CCValAssign, 16> RVLocs;
	  DenseMap<unsigned, SDValue> CopiedRegs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeCallResult(Ins, RetCC);

	  // Copy all of the result registers out of their specified physreg.
	  for (unsigned Idx = 0, NumLocs = RVLocs.size(); Idx != NumLocs; ++Idx) {
	    CCValAssign VA = RVLocs[Idx];

	    // Pass 'this' value directly from the argument to return value, to avoid
	    // reg unit interference
	    if (isThisReturn && Idx == 0) {
	      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
	             "unexpected return calling convention register assignment");
	      InVals.push_back(ThisVal);
	      continue;
	    }

	    // Avoid copying a physreg twice since RegAllocFast is incompetent and only
	    // allows one use of a physreg per block.
	    SDValue Result = CopiedRegs.lookup(VA.getLocReg());
	    if (!Result) {
	      Result =
	          DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
	      Chain = Result.getValue(1);
	      InFlag = Result.getValue(2);
	      CopiedRegs[VA.getLocReg()] = Result;
	    }

	    switch (VA.getLocInfo()) {
	    case CCValAssign::Full:
	      break;
	    case CCValAssign::BCvt:
	      Result = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Result);
	      break;
	    case CCValAssign::AExtUpper:
	      // The value sits in the upper 32 bits: shift it down, then convert.
	      Result = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Result,
	                           DAG.getConstant(32, DL, VA.getLocVT()));
	      Result = DAG.getZExtOrTrunc(Result, DL, VA.getValVT());
	      break;
	    case CCValAssign::AExt:
	    case CCValAssign::ZExt:
	      Result = DAG.getZExtOrTrunc(Result, DL, VA.getValVT());
	      break;
	    default:
	      llvm_unreachable("Unknown loc info!");
	    }

	    InVals.push_back(Result);
	  }

	  return Chain;
	}

	/// Return true if the calling convention is one that we can guarantee TCO for.
	/// Only CallingConv::Fast qualifies.
	static bool canGuaranteeTCO(CallingConv::ID CC) {
	  switch (CC) {
	  case CallingConv::Fast:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Return true if we might ever do TCO for calls with this calling convention.
	///
	/// The conventions listed explicitly below are always accepted; any other
	/// convention is accepted only if canGuaranteeTCO() accepts it.
	static bool mayTailCallThisCC(CallingConv::ID CC) {
	switch (CC) {
	// Conventions that are accepted unconditionally.
	case CallingConv::C:
	+ case CallingConv::AArch64_SVE_VectorCall:
	case CallingConv::PreserveMost:
	case CallingConv::Swift:
	return true;
	default:
	// Everything else defers to canGuaranteeTCO().
	return canGuaranteeTCO(CC);
	}
	}

	/// Decides whether the call described by \p Callee, \p CalleeCC, \p isVarArg,
	/// the outgoing arguments (\p Outs / \p OutVals) and the expected results
	/// (\p Ins) may be emitted as a tail call from the current function.
	///
	/// Returns false as soon as any of the checks below fails and true only when
	/// all of them pass.
	bool AArch64TargetLowering::isEligibleForTailCallOptimization(
	SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
	if (!mayTailCallThisCC(CalleeCC))
	return false;

	MachineFunction &MF = DAG.getMachineFunction();
	const Function &CallerF = MF.getFunction();
	CallingConv::ID CallerCC = CallerF.getCallingConv();
	+
	+ // If this function uses the C calling convention but has an SVE signature,
	+ // then it preserves more registers and should assume the SVE_VectorCall CC.
	+ // The check for matching callee-saved regs will determine whether it is
	+ // eligible for TCO.
	+ if (CallerCC == CallingConv::C &&
	+ AArch64RegisterInfo::hasSVEArgsOrReturn(&MF))
	+ CallerCC = CallingConv::AArch64_SVE_VectorCall;
	+
	bool CCMatch = CallerCC == CalleeCC;

	// When using the Windows calling convention on a non-windows OS, we want
	// to back up and restore X18 in such functions; we can't do a tail call
	// from those functions.
	if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
	CalleeCC != CallingConv::Win64)
	return false;

	// Byval parameters hand the function a pointer directly into the stack area
	// we want to reuse during a tail call. Working around this is possible (see
	// X86) but less efficient and uglier in LowerCall.
	for (Function::const_arg_iterator i = CallerF.arg_begin(),
	e = CallerF.arg_end();
	i != e; ++i) {
	if (i->hasByValAttr())
	return false;

	// On Windows, "inreg" attributes signify non-aggregate indirect returns.
	// In this case, it is necessary to save/restore X0 in the callee. Tail
	// call opt interferes with this. So we disable tail call opt when the
	// caller has an argument with "inreg" attribute.

	// FIXME: Check whether the callee also has an "inreg" argument.
	if (i->hasInRegAttr())
	return false;
	}

	// With guaranteed TCO enabled, only matching conventions that can guarantee
	// TCO are accepted; none of the sibcall checks below apply.
	if (getTargetMachine().Options.GuaranteedTailCallOpt)
	return canGuaranteeTCO(CalleeCC) && CCMatch;

	// Externally-defined functions with weak linkage should not be
	// tail-called on AArch64 when the OS does not support dynamic
	// pre-emption of symbols, as the AAELF spec requires normal calls
	// to undefined weak functions to be replaced with a NOP or jump to the
	// next instruction. The behaviour of branch instructions in this
	// situation (as used for tail calls) is implementation-defined, so we
	// cannot rely on the linker replacing the tail call with a return.
	if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	const GlobalValue *GV = G->getGlobal();
	const Triple &TT = getTargetMachine().getTargetTriple();
	if (GV->hasExternalWeakLinkage() &&
	(!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
	return false;
	}

	// Now we search for cases where we can use a tail call without changing the
	// ABI. Sibcall is used in some places (particularly gcc) to refer to this
	// concept.

	// I want anyone implementing a new calling convention to think long and hard
	// about this assert.
	assert((!isVarArg || CalleeCC == CallingConv::C) &&
	"Unexpected variadic calling convention");

	LLVMContext &C = *DAG.getContext();
	if (isVarArg && !Outs.empty()) {
	// At least two cases here: if caller is fastcc then we can't have any
	// memory arguments (we'd be expected to clean up the stack afterwards). If
	// caller is C then we could potentially use its argument area.

	// FIXME: for now we take the most conservative of these in both cases:
	// disallow all variadic memory operands.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
	for (const CCValAssign &ArgLoc : ArgLocs)
	if (!ArgLoc.isRegLoc())
	return false;
	}

	// Check that the call results are passed in the same way.
	if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
	CCAssignFnForCall(CalleeCC, isVarArg),
	CCAssignFnForCall(CallerCC, isVarArg)))
	return false;
	// The callee has to preserve all registers the caller needs to preserve.
	const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
	if (!CCMatch) {
	const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
	if (Subtarget->hasCustomCallingConv()) {
	TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
	TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
	}
	if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
	return false;
	}

	// Nothing more to check if the callee is taking no arguments
	if (Outs.empty())
	return true;

	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));

	const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

	// If any of the arguments is passed indirectly, it must be SVE, so
	// 'getBytesInStackArgArea' is not sufficient to determine whether we need to
	// allocate space on the stack. We therefore check for indirect arguments
	// explicitly here; if there are any, the call cannot be a tail call.
	if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
	assert((A.getLocInfo() != CCValAssign::Indirect ||
	A.getValVT().isScalableVector()) &&
	"Expected value to be scalable");
	return A.getLocInfo() == CCValAssign::Indirect;
	}))
	return false;

	// If the stack arguments for this call do not fit into our own save area then
	// the call cannot be made tail.
	if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
	return false;

	const MachineRegisterInfo &MRI = MF.getRegInfo();
	if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
	return false;

	return true;
	}

	/// Builds a TokenFactor that joins \p Chain with the output chains of every
	/// load of an incoming stack argument that overlaps the frame object
	/// \p ClobberedFI.
	///
	/// Only loads that use the DAG entry node and whose base pointer is a frame
	/// index with a negative index are considered.
	///
	/// \returns the TokenFactor node; \p Chain is always its first operand.
	SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
	                                                   SelectionDAG &DAG,
	                                                   MachineFrameInfo &MFI,
	                                                   int ClobberedFI) const {
	  SmallVector<SDValue, 8> ArgChains;
	  // Inclusive byte range covered by the object about to be clobbered.
	  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
	  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

	  // Include the original chain at the beginning of the list. When this is
	  // used by target LowerCall hooks, this helps legalize find the
	  // CALLSEQ_BEGIN node.
	  ArgChains.push_back(Chain);

	  // Add a chain value for each stack argument load that overlaps the
	  // clobbered object.
	  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
	                            UE = DAG.getEntryNode().getNode()->use_end();
	       U != UE; ++U)
	    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
	      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
	        if (FI->getIndex() < 0) {
	          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
	          int64_t InLastByte = InFirstByte;
	          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

	          // The two inclusive ranges overlap if either one starts inside
	          // the other.
	          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
	              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
	            ArgChains.push_back(SDValue(L, 1));
	        }

	  // Build a tokenfactor for all the chains.
	  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
	}

	/// Reports whether a callee using \p CallCC restores the stack itself: this is
	/// the case only for CallingConv::Fast when \p TailCallOpt is enabled.
	bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
	                                                   bool TailCallOpt) const {
	  if (!TailCallOpt)
	    return false;
	  return CallCC == CallingConv::Fast;
	}

	/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
	/// and add input and output parameter nodes.
	SDValue
	AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const {
	SelectionDAG &DAG = CLI.DAG;
	SDLoc &DL = CLI.DL;
	SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
	SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
	SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
	SDValue Chain = CLI.Chain;
	SDValue Callee = CLI.Callee;
	bool &IsTailCall = CLI.IsTailCall;
	CallingConv::ID CallConv = CLI.CallConv;
	bool IsVarArg = CLI.IsVarArg;

	MachineFunction &MF = DAG.getMachineFunction();
	MachineFunction::CallSiteInfo CSInfo;
	bool IsThisReturn = false;

	AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
	bool IsSibCall = false;

	+ // Check callee args/returns for SVE registers and set calling convention
	+ // accordingly.
	+ if (CallConv == CallingConv::C) {
	+ bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
	+ return Out.VT.isScalableVector();
	+ });
	+ bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
	+ return In.VT.isScalableVector();
	+ });
	+
	+ if (CalleeInSVE \|\| CalleeOutSVE)
	+ CallConv = CallingConv::AArch64_SVE_VectorCall;
	+ }
	+
	if (IsTailCall) {
	// Check if it's really possible to do a tail call.
	IsTailCall = isEligibleForTailCallOptimization(
	Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
	if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
	report_fatal_error("failed to perform tail call elimination on a call "
	"site marked musttail");

	// A sibling call is one where we're under the usual C ABI and not planning
	// to change that but can still do a tail call:
	if (!TailCallOpt && IsTailCall)
	IsSibCall = true;

	if (IsTailCall)
	++NumTailCalls;
	}

	// Analyze operands of the call, assigning locations to each operand.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
	*DAG.getContext());

	if (IsVarArg) {
	// Handle fixed and variable vector arguments differently.
	// Variable vector arguments always go into memory.
	unsigned NumArgs = Outs.size();

	for (unsigned i = 0; i != NumArgs; ++i) {
	MVT ArgVT = Outs[i].VT;
	ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
	CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
	/IsVarArg=/ !Outs[i].IsFixed);
	bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
	assert(!Res && "Call operand has unhandled type");
	(void)Res;
	}
	} else {
	// At this point, Outs[].VT may already be promoted to i32. To correctly
	// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
	// i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
	// Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
	// we use a special version of AnalyzeCallOperands to pass in ValVT and
	// LocVT.
	unsigned NumArgs = Outs.size();
	for (unsigned i = 0; i != NumArgs; ++i) {
	MVT ValVT = Outs[i].VT;
	// Get type of the original argument.
	EVT ActualVT = getValueType(DAG.getDataLayout(),
	CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
	/AllowUnknown/ true);
	MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
	ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
	// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
	if (ActualMVT == MVT::i1 \|\| ActualMVT == MVT::i8)
	ValVT = MVT::i8;
	else if (ActualMVT == MVT::i16)
	ValVT = MVT::i16;

	CCAssignFn AssignFn = CCAssignFnForCall(CallConv, /IsVarArg=*/false);
	bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
	assert(!Res && "Call operand has unhandled type");
	(void)Res;
	}
	}

	// Get a count of how many bytes are to be pushed on the stack.
	unsigned NumBytes = CCInfo.getNextStackOffset();

	if (IsSibCall) {
	// Since we're not changing the ABI to make this a tail call, the memory
	// operands are already available in the caller's incoming argument space.
	NumBytes = 0;
	}

	// FPDiff is the byte offset of the call's argument area from the callee's.
	// Stores to callee stack arguments will be placed in FixedStackSlots offset
	// by this amount for a tail call. In a sibling call it must be 0 because the
	// caller will deallocate the entire stack and the callee still expects its
	// arguments to begin at SP+0. Completely unused for non-tail calls.
	int FPDiff = 0;

	if (IsTailCall && !IsSibCall) {
	unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();

	// Since callee will pop argument stack as a tail call, we must keep the
	// popped size 16-byte aligned.
	NumBytes = alignTo(NumBytes, 16);

	// FPDiff will be negative if this tail call requires more space than we
	// would automatically have in our incoming argument space. Positive if we
	// can actually shrink the stack.
	FPDiff = NumReusableBytes - NumBytes;

	// The stack pointer must be 16-byte aligned at all times it's used for a
	// memory operation, which in practice means at all times and in
	// particular across call boundaries. Therefore our own arguments started at
	// a 16-byte aligned SP and the delta applied for the tail call should
	// satisfy the same constraint.
	assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
	}

	// Adjust the stack pointer for the new arguments...
	// These operations are automatically eliminated by the prolog/epilog pass
	if (!IsSibCall)
	Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);

	SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
	getPointerTy(DAG.getDataLayout()));

	SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
	SmallSet<unsigned, 8> RegsUsed;
	SmallVector<SDValue, 8> MemOpChains;
	auto PtrVT = getPointerTy(DAG.getDataLayout());

	if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
	const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
	for (const auto &F : Forwards) {
	SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
	RegsToPass.emplace_back(F.PReg, Val);
	}
	}

	// Walk the register/memloc assignments, inserting copies/loads.
	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	CCValAssign &VA = ArgLocs[i];
	SDValue Arg = OutVals[i];
	ISD::ArgFlagsTy Flags = Outs[i].Flags;

	// Promote the value if needed.
	switch (VA.getLocInfo()) {
	default:
	llvm_unreachable("Unknown loc info!");
	case CCValAssign::Full:
	break;
	case CCValAssign::SExt:
	Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::ZExt:
	Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::AExt:
	if (Outs[i].ArgVT == MVT::i1) {
	// AAPCS requires i1 to be zero-extended to 8-bits by the caller.
	Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
	Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
	}
	Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::AExtUpper:
	assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
	Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
	Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
	DAG.getConstant(32, DL, VA.getLocVT()));
	break;
	case CCValAssign::BCvt:
	Arg = DAG.getBitcast(VA.getLocVT(), Arg);
	break;
	case CCValAssign::Trunc:
	Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
	break;
	case CCValAssign::FPExt:
	Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::Indirect:
	assert(VA.getValVT().isScalableVector() &&
	"Only scalable vectors can be passed indirectly");
	MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	Type Ty = EVT(VA.getValVT()).getTypeForEVT(DAG.getContext());
	Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
	int FI = MFI.CreateStackObject(
	VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false);
	MFI.setStackID(FI, TargetStackID::SVEVector);

	SDValue SpillSlot = DAG.getFrameIndex(
	FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
	Chain = DAG.getStore(
	Chain, DL, Arg, SpillSlot,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	Arg = SpillSlot;
	break;
	}

	if (VA.isRegLoc()) {
	if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
	Outs[0].VT == MVT::i64) {
	assert(VA.getLocVT() == MVT::i64 &&
	"unexpected calling convention register assignment");
	assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
	"unexpected use of 'returned'");
	IsThisReturn = true;
	}
	if (RegsUsed.count(VA.getLocReg())) {
	// If this register has already been used then we're trying to pack
	// parts of an [N x i32] into an X-register. The extension type will
	// take care of putting the two halves in the right place but we have to
	// combine them.
	SDValue &Bits =
	std::find_if(RegsToPass.begin(), RegsToPass.end(),
	[=](const std::pair<unsigned, SDValue> &Elt) {
	return Elt.first == VA.getLocReg();
	})
	->second;
	Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
	// Call site info is used for function's parameter entry value
	// tracking. For now we track only simple cases when parameter
	// is transferred through whole register.
	CSInfo.erase(std::remove_if(CSInfo.begin(), CSInfo.end(),
	[&VA](MachineFunction::ArgRegPair ArgReg) {
	return ArgReg.Reg == VA.getLocReg();
	}),
	CSInfo.end());
	} else {
	RegsToPass.emplace_back(VA.getLocReg(), Arg);
	RegsUsed.insert(VA.getLocReg());
	const TargetOptions &Options = DAG.getTarget().Options;
	if (Options.EmitCallSiteInfo)
	CSInfo.emplace_back(VA.getLocReg(), i);
	}
	} else {
	assert(VA.isMemLoc());

	SDValue DstAddr;
	MachinePointerInfo DstInfo;

	// FIXME: This works on big-endian for composite byvals, which are the
	// common case. It should also work for fundamental types too.
	uint32_t BEAlign = 0;
	unsigned OpSize;
	if (VA.getLocInfo() == CCValAssign::Indirect)
	OpSize = VA.getLocVT().getSizeInBits();
	else
	OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
	: VA.getValVT().getSizeInBits();
	OpSize = (OpSize + 7) / 8;
	if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
	!Flags.isInConsecutiveRegs()) {
	if (OpSize < 8)
	BEAlign = 8 - OpSize;
	}
	unsigned LocMemOffset = VA.getLocMemOffset();
	int32_t Offset = LocMemOffset + BEAlign;
	SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
	PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);

	if (IsTailCall) {
	Offset = Offset + FPDiff;
	int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);

	DstAddr = DAG.getFrameIndex(FI, PtrVT);
	DstInfo =
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);

	// Make sure any stack arguments overlapping with where we're storing
	// are loaded before this eventual operation. Otherwise they'll be
	// clobbered.
	Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
	} else {
	SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);

	DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
	DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
	LocMemOffset);
	}

	if (Outs[i].Flags.isByVal()) {
	SDValue SizeNode =
	DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
	SDValue Cpy = DAG.getMemcpy(
	Chain, DL, DstAddr, Arg, SizeNode,
	Outs[i].Flags.getNonZeroByValAlign(),
	/isVol = / false, /AlwaysInline = / false,
	/isTailCall = / false, DstInfo, MachinePointerInfo());

	MemOpChains.push_back(Cpy);
	} else {
	// Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
	// promoted to a legal register type i32, we should truncate Arg back to
	// i1/i8/i16.
	if (VA.getValVT() == MVT::i1 \|\| VA.getValVT() == MVT::i8 \|\|
	VA.getValVT() == MVT::i16)
	Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);

	SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
	MemOpChains.push_back(Store);
	}
	}
	}

	if (!MemOpChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

	// Build a sequence of copy-to-reg nodes chained together with token chain
	// and flag operands which copy the outgoing args into the appropriate regs.
	SDValue InFlag;
	for (auto &RegToPass : RegsToPass) {
	Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
	RegToPass.second, InFlag);
	InFlag = Chain.getValue(1);
	}

	// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
	// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
	// node so that legalize doesn't hack it.
	if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	auto GV = G->getGlobal();
	unsigned OpFlags =
	Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
	if (OpFlags & AArch64II::MO_GOT) {
	Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
	Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
	} else {
	const GlobalValue *GV = G->getGlobal();
	Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
	}
	} else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	if (getTargetMachine().getCodeModel() == CodeModel::Large &&
	Subtarget->isTargetMachO()) {
	const char *Sym = S->getSymbol();
	Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
	Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
	} else {
	const char *Sym = S->getSymbol();
	Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
	}
	}

	// We don't usually want to end the call-sequence here because we would tidy
	// the frame up after the call, however in the ABI-changing tail-call case
	// we've carefully laid out the parameters so that when sp is reset they'll be
	// in the correct location.
	if (IsTailCall && !IsSibCall) {
	Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
	DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
	InFlag = Chain.getValue(1);
	}

	std::vector<SDValue> Ops;
	Ops.push_back(Chain);
	Ops.push_back(Callee);

	if (IsTailCall) {
	// Each tail call may have to adjust the stack by a different amount, so
	// this information must travel along with the operation for eventual
	// consumption by emitEpilogue.
	Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
	}

	// Add argument registers to the end of the list so that they are known live
	// into the call.
	for (auto &RegToPass : RegsToPass)
	Ops.push_back(DAG.getRegister(RegToPass.first,
	RegToPass.second.getValueType()));

	- // Check callee args/returns for SVE registers and set calling convention
	- // accordingly.
	- if (CallConv == CallingConv::C) {
	- bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
	- return Out.VT.isScalableVector();
	- });
	- bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
	- return In.VT.isScalableVector();
	- });
	-
	- if (CalleeInSVE \|\| CalleeOutSVE)
	- CallConv = CallingConv::AArch64_SVE_VectorCall;
	- }
	-
	// Add a register mask operand representing the call-preserved registers.
	const uint32_t *Mask;
	const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	if (IsThisReturn) {
	// For 'this' returns, use the X0-preserving mask if applicable
	Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
	if (!Mask) {
	IsThisReturn = false;
	Mask = TRI->getCallPreservedMask(MF, CallConv);
	}
	} else
	Mask = TRI->getCallPreservedMask(MF, CallConv);

	if (Subtarget->hasCustomCallingConv())
	TRI->UpdateCustomCallPreservedMask(MF, &Mask);

	if (TRI->isAnyArgRegReserved(MF))
	TRI->emitReservedArgRegCallError(MF);

	assert(Mask && "Missing call preserved mask for calling convention");
	Ops.push_back(DAG.getRegisterMask(Mask));

	if (InFlag.getNode())
	Ops.push_back(InFlag);

	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	// If we're doing a tail call, use a TC_RETURN here rather than an
	// actual call instruction.
	if (IsTailCall) {
	MF.getFrameInfo().setHasTailCall();
	SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
	DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
	return Ret;
	}

	// Returns a chain and a flag for retval copy to use.
	Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
	DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
	InFlag = Chain.getValue(1);
	DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));

	uint64_t CalleePopBytes =
	DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;

	Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
	DAG.getIntPtrConstant(CalleePopBytes, DL, true),
	InFlag, DL);
	if (!Ins.empty())
	InFlag = Chain.getValue(1);

	// Handle result values, copying them out of physregs into vregs that we
	// return.
	return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
	InVals, IsThisReturn,
	IsThisReturn ? OutVals[0] : SDValue());
	}

	/// Runs CCState::CheckReturn over \p Outs with the return-value assignment
	/// function selected by \p CallConv and returns its verdict.
	bool AArch64TargetLowering::CanLowerReturn(
	    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
	  // WebKit_JS has a dedicated return convention; every other calling
	  // convention is checked against the AAPCS one.
	  CCAssignFn *AssignFn = RetCC_AArch64_AAPCS;
	  if (CallConv == CallingConv::WebKit_JS)
	    AssignFn = RetCC_AArch64_WebKit_JS;

	  SmallVector<CCValAssign, 16> Locations;
	  CCState State(CallConv, isVarArg, MF, Locations, Context);
	  return State.CheckReturn(Outs, AssignFn);
	}

	/// Lowers a function return.
	///
	/// Each value in \p OutVals is assigned a register by the return calling
	/// convention, converted to the register's location type and copied into
	/// that register. The resulting AArch64ISD::RET_FLAG node takes, in order:
	/// the chain, the registers written, X0 when an sret return register was
	/// recorded, any callee-saved-via-copy registers, and the glue (if any).
	SDValue
	AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	                                   bool isVarArg,
	                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
	                                   const SmallVectorImpl<SDValue> &OutVals,
	                                   const SDLoc &DL, SelectionDAG &DAG) const {
	  auto &MF = DAG.getMachineFunction();
	  auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

	  // WebKit_JS has its own return convention; everything else uses AAPCS.
	  CCAssignFn *AssignFn = RetCC_AArch64_AAPCS;
	  if (CallConv == CallingConv::WebKit_JS)
	    AssignFn = RetCC_AArch64_WebKit_JS;

	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
	  CCInfo.AnalyzeReturn(Outs, AssignFn);

	  // Gather one (register, value) entry per physical return register. A value
	  // assigned to a register that already has an entry is OR'ed into it.
	  SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
	  SmallSet<unsigned, 4> RegsUsed;
	  for (unsigned Idx = 0; Idx != RVLocs.size(); ++Idx) {
	    CCValAssign &VA = RVLocs[Idx];
	    assert(VA.isRegLoc() && "Can only return in registers!");
	    SDValue Val = OutVals[Idx];

	    switch (VA.getLocInfo()) {
	    default:
	      llvm_unreachable("Unknown loc info!");
	    case CCValAssign::Full:
	      if (Outs[Idx].ArgVT == MVT::i1) {
	        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
	        // value. This is strictly redundant on Darwin (which uses "zeroext
	        // i1"), but will be optimised out before ISel.
	        Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Val);
	        Val = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Val);
	      }
	      break;
	    case CCValAssign::BCvt:
	      Val = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Val);
	      break;
	    case CCValAssign::AExt:
	    case CCValAssign::ZExt:
	      Val = DAG.getZExtOrTrunc(Val, DL, VA.getLocVT());
	      break;
	    case CCValAssign::AExtUpper:
	      // The value occupies the upper half of the location register.
	      assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
	      Val = DAG.getZExtOrTrunc(Val, DL, VA.getLocVT());
	      Val = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Val,
	                        DAG.getConstant(32, DL, VA.getLocVT()));
	      break;
	    }

	    const auto LocReg = VA.getLocReg();
	    if (!RegsUsed.count(LocReg)) {
	      // First value destined for this register.
	      RetVals.emplace_back(LocReg, Val);
	      RegsUsed.insert(LocReg);
	      continue;
	    }

	    // The register already holds part of the result: merge the new bits.
	    for (auto &Entry : RetVals) {
	      if (Entry.first == LocReg) {
	        Entry.second = DAG.getNode(ISD::OR, DL, Entry.second.getValueType(),
	                                   Entry.second, Val);
	        break;
	      }
	    }
	  }

	  // Copy the collected values into their physical registers, threading the
	  // chain and glue through every copy. Operand 0 is a placeholder for the
	  // final chain and still holds the incoming chain until it is patched.
	  SDValue Glue;
	  SmallVector<SDValue, 4> RetOps(1, Chain);
	  for (auto &Entry : RetVals) {
	    Chain = DAG.getCopyToReg(Chain, DL, Entry.first, Entry.second, Glue);
	    Glue = Chain.getValue(1);
	    RetOps.push_back(
	        DAG.getRegister(Entry.first, Entry.second.getValueType()));
	  }

	  // Windows AArch64 ABIs require that for returning structs by value we copy
	  // the sret argument into X0 for the return.
	  // We saved the argument into a virtual register in the entry block,
	  // so now we copy the value out and into X0.
	  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
	    SDValue SRetVal = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
	                                         getPointerTy(MF.getDataLayout()));

	    unsigned RetValReg = AArch64::X0;
	    Chain = DAG.getCopyToReg(Chain, DL, RetValReg, SRetVal, Glue);
	    Glue = Chain.getValue(1);

	    RetOps.push_back(
	        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
	  }

	  // Append every register from the null-terminated callee-saved-via-copy
	  // list, if the target reports one.
	  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	  if (const MCPhysReg *CSR = TRI->getCalleeSavedRegsViaCopy(&MF)) {
	    for (; *CSR; ++CSR) {
	      if (AArch64::GPR64RegClass.contains(*CSR))
	        RetOps.push_back(DAG.getRegister(*CSR, MVT::i64));
	      else if (AArch64::FPR64RegClass.contains(*CSR))
	        RetOps.push_back(
	            DAG.getRegister(*CSR, MVT::getFloatingPointVT(64)));
	      else
	        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
	    }
	  }

	  RetOps[0] = Chain; // Patch in the final chain.

	  // Glue is only present when at least one copy was emitted.
	  if (Glue.getNode())
	    RetOps.push_back(Glue);

	  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
	}

	//===----------------------------------------------------------------------===//
	// Other Lowering Code
	//===----------------------------------------------------------------------===//

	/// Builds the target global-address node for \p N, carrying over its global
	/// and offset and applying the target flags \p Flag.
	SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
	                                             SelectionDAG &DAG,
	                                             unsigned Flag) const {
	  const GlobalValue *GV = N->getGlobal();
	  const auto Offset = N->getOffset();
	  return DAG.getTargetGlobalAddress(GV, SDLoc(N), Ty, Offset, Flag);
	}

	/// Builds the target jump-table node for the table index held by \p N,
	/// applying the target flags \p Flag.
	SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
	                                             SelectionDAG &DAG,
	                                             unsigned Flag) const {
	  const auto TableIndex = N->getIndex();
	  return DAG.getTargetJumpTable(TableIndex, Ty, Flag);
	}

	/// Builds the target constant-pool node for \p N, carrying over its constant,
	/// alignment and offset and applying the target flags \p Flag.
	SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
	                                             SelectionDAG &DAG,
	                                             unsigned Flag) const {
	  const auto *PoolConstant = N->getConstVal();
	  return DAG.getTargetConstantPool(PoolConstant, Ty, N->getAlign(),
	                                   N->getOffset(), Flag);
	}

	/// Builds the target block-address node for the block address held by \p N,
	/// with a zero offset and the target flags \p Flag.
	SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
	                                             SelectionDAG &DAG,
	                                             unsigned Flag) const {
	  const auto *BA = N->getBlockAddress();
	  return DAG.getTargetBlockAddress(BA, Ty, 0, Flag);
	}

	// (loadGOT sym)
	/// Wraps the target node for \p N, flagged with MO_GOT in addition to
	/// \p Flags, in an AArch64ISD::LOADgot node of pointer type.
	template <class NodeTy>
	SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
	                                      unsigned Flags) const {
	  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
	  SDLoc DL(N);
	  EVT PtrTy = getPointerTy(DAG.getDataLayout());
	  const unsigned GotFlags = AArch64II::MO_GOT | Flags;
	  SDValue GotSym = getTargetNode(N, PtrTy, DAG, GotFlags);
	  // FIXME: Once remat is capable of dealing with instructions with register
	  // operands, expand this into two nodes instead of using a wrapper node.
	  return DAG.getNode(AArch64ISD::LOADgot, DL, PtrTy, GotSym);
	}

	// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
	/// Builds an AArch64ISD::WrapperLarge node from four target nodes for \p N,
	/// flagged MO_G3, MO_G2, MO_G1 and MO_G0 respectively. All but the MO_G3
	/// piece additionally carry MO_NC; every piece also carries \p Flags.
	template <class NodeTy>
	SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
	                                            unsigned Flags) const {
	  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
	  SDLoc DL(N);
	  EVT PtrTy = getPointerTy(DAG.getDataLayout());

	  // Flags shared by the three lower pieces.
	  const unsigned NoCheckFlags = AArch64II::MO_NC | Flags;

	  SDValue G3 = getTargetNode(N, PtrTy, DAG, AArch64II::MO_G3 | Flags);
	  SDValue G2 = getTargetNode(N, PtrTy, DAG, AArch64II::MO_G2 | NoCheckFlags);
	  SDValue G1 = getTargetNode(N, PtrTy, DAG, AArch64II::MO_G1 | NoCheckFlags);
	  SDValue G0 = getTargetNode(N, PtrTy, DAG, AArch64II::MO_G0 | NoCheckFlags);
	  return DAG.getNode(AArch64ISD::WrapperLarge, DL, PtrTy, G3, G2, G1, G0);
	}

	// (addlow (adrp %hi(sym)) %lo(sym))
	/// Forms the address of \p N as an AArch64ISD::ADDlow whose operands are an
	/// AArch64ISD::ADRP of the MO_PAGE-flagged target node and the
	/// MO_PAGEOFF|MO_NC-flagged target node. \p Flags is OR'ed into both.
	template <class NodeTy>
	SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
	                                       unsigned Flags) const {
	  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
	  SDLoc DL(N);
	  EVT PtrTy = getPointerTy(DAG.getDataLayout());

	  const unsigned PageFlags = AArch64II::MO_PAGE | Flags;
	  const unsigned PageOffFlags =
	      AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags;

	  SDValue PageSym = getTargetNode(N, PtrTy, DAG, PageFlags);
	  SDValue PageOffSym = getTargetNode(N, PtrTy, DAG, PageOffFlags);
	  SDValue PageAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrTy, PageSym);
	  return DAG.getNode(AArch64ISD::ADDlow, DL, PtrTy, PageAddr, PageOffSym);
	}

	// (adr sym)
	/// Wraps the target node for \p N (flagged with \p Flags only) in an
	/// AArch64ISD::ADR node of pointer type.
	template <class NodeTy>
	SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
	                                           unsigned Flags) const {
	  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
	  SDLoc DL(N);
	  EVT PtrTy = getPointerTy(DAG.getDataLayout());
	  return DAG.getNode(AArch64ISD::ADR, DL, PtrTy,
	                     getTargetNode(N, PtrTy, DAG, Flags));
	}

	/// Lowers a GlobalAddress node.
	///
	/// The reference is classified by the subtarget. References flagged MO_GOT
	/// are loaded through the GOT; otherwise the address is materialised with
	/// the sequence matching the code model (large, tiny, or the default
	/// ADRP+ADDlow form). References flagged MO_DLLIMPORT or MO_COFFSTUB are
	/// additionally loaded through the computed address.
	SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
	  const GlobalValue *GV = GN->getGlobal();
	  unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());

	  // A flagged reference must not carry an offset. Written as one assert so
	  // that release builds are not left with an empty `if` body.
	  assert((OpFlags == AArch64II::MO_NO_FLAG || GN->getOffset() == 0) &&
	         "unexpected offset in global node");

	  // This also catches the large code model case for Darwin, and tiny code
	  // model with got relocations.
	  if ((OpFlags & AArch64II::MO_GOT) != 0)
	    return getGOT(GN, DAG, OpFlags);

	  SDValue Result;
	  switch (getTargetMachine().getCodeModel()) {
	  case CodeModel::Large:
	    Result = getAddrLarge(GN, DAG, OpFlags);
	    break;
	  case CodeModel::Tiny:
	    Result = getAddrTiny(GN, DAG, OpFlags);
	    break;
	  default:
	    Result = getAddr(GN, DAG, OpFlags);
	    break;
	  }

	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  SDLoc DL(GN);
	  // For dllimport / COFF-stub references the computed address is that of a
	  // pointer slot, so load the real address from it.
	  const unsigned IndirectMask =
	      AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB;
	  if (OpFlags & IndirectMask)
	    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
	                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	  return Result;
	}

	/// Convert a TLS address reference into the correct sequence of loads
	/// and calls to compute the variable's address (for Darwin, currently) and
	/// return an SDValue containing the final node.
	///
	/// Darwin only has one TLS scheme which must be capable of dealing with the
	/// fully general situation, in the worst case. This means:
	///     + "extern __thread" declaration.
	///     + Defined in a possibly unknown dynamic library.
	///
	/// The general system is that each __thread variable has a [3 x i64] descriptor
	/// which contains information used by the runtime to calculate the address. The
	/// only part of this the compiler needs to know about is the first xword, which
	/// contains a function pointer that must be called with the address of the
	/// entire descriptor in "x0".
	///
	/// Since this descriptor may be in a different unit, in general even the
	/// descriptor must be accessed via an indirect load. The "ideal" code sequence
	/// is:
	///     adrp x0, _var@TLVPPAGE
	///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
	///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
	///                                      ; the function pointer
	///     blr x1                           ; Uses descriptor address in x0
	///                                      ; Address of _var is now in x0.
	///
	/// If the address of _var's descriptor is known to the linker, then it can
	/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
	/// a slight efficiency gain.
	SDValue
	AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
	                                                   SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetDarwin() &&
	         "This function expects a Darwin target");

	  SDLoc DL(Op);
	  MachineFunction &MF = DAG.getMachineFunction();
	  MVT PtrVT = getPointerTy(DAG.getDataLayout());
	  MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
	  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

	  // Address of the variable's descriptor, obtained through a LOADgot of the
	  // MO_TLS-flagged symbol.
	  SDValue TLVPAddr =
	      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
	  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);

	  // The first entry in the descriptor is a function pointer that we must call
	  // to obtain the address of the variable.
	  SDValue Chain = DAG.getEntryNode();
	  SDValue FuncTLVGet = DAG.getLoad(
	      PtrMemVT, DL, Chain, DescAddr, MachinePointerInfo::getGOT(MF),
	      /* Alignment = */ PtrMemVT.getSizeInBits() / 8,
	      MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
	  Chain = FuncTLVGet.getValue(1);

	  // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
	  FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);

	  // Record in the frame info that this function adjusts the stack.
	  MF.getFrameInfo().setAdjustsStack(true);

	  // TLS calls preserve all registers except those that absolutely must be
	  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
	  // silly).
	  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	  const uint32_t *Mask = TRI->getTLSCallPreservedMask();
	  if (Subtarget->hasCustomCallingConv())
	    TRI->UpdateCustomCallPreservedMask(MF, &Mask);

	  // Finally, we can make the call. This is just a degenerate version of a
	  // normal AArch64 call node: x0 takes the address of the descriptor, and
	  // returns the address of the variable in this thread.
	  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
	  SDValue CopyGlue = Chain.getValue(1);
	  Chain =
	      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
	                  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
	                  DAG.getRegisterMask(Mask), CopyGlue);
	  SDValue CallGlue = Chain.getValue(1);
	  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, CallGlue);
	}

	/// Convert a thread-local variable reference into a sequence of instructions to
	/// compute the variable's address for the local exec TLS model of ELF targets.
	/// The sequence depends on the maximum TLS area size.
	///
	/// \param GV         the thread-local global being addressed.
	/// \param ThreadBase value the TP-relative offset of \p GV is added to.
	/// \returns the node computing the address of \p GV.
	///
	/// Only the TLS sizes 12, 24, 32 and 48 are handled; any other value of the
	/// target's TLSSize option is treated as unreachable.
	SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
	                                                    SDValue ThreadBase,
	                                                    const SDLoc &DL,
	                                                    SelectionDAG &DAG) const {
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // Target global address of GV flagged MO_TLS plus the given fragment flags.
	  auto TLSSym = [&](unsigned FragmentFlags) {
	    return DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
	                                      AArch64II::MO_TLS | FragmentFlags);
	  };
	  // i32 target constant used as the trailing operand of the machine nodes.
	  auto Imm = [&](unsigned Value) {
	    return DAG.getTargetConstant(Value, DL, MVT::i32);
	  };
	  // Machine node `Opc Base, Sym, #Trailing`, returned as its first result.
	  auto Emit3 = [&](unsigned Opc, SDValue Base, SDValue Sym,
	                   unsigned Trailing) {
	    return SDValue(
	        DAG.getMachineNode(Opc, DL, PtrVT, Base, Sym, Imm(Trailing)), 0);
	  };
	  // Machine node `Opc Sym, #Trailing`, returned as its first result.
	  auto Emit2 = [&](unsigned Opc, SDValue Sym, unsigned Trailing) {
	    return SDValue(DAG.getMachineNode(Opc, DL, PtrVT, Sym, Imm(Trailing)), 0);
	  };

	  switch (DAG.getTarget().Options.TLSSize) {
	  default:
	    llvm_unreachable("Unexpected TLS size");

	  case 12: {
	    // mrs   x0, TPIDR_EL0
	    // add   x0, x0, :tprel_lo12:a
	    SDValue Var = TLSSym(AArch64II::MO_PAGEOFF);
	    return Emit3(AArch64::ADDXri, ThreadBase, Var, 0);
	  }

	  case 24: {
	    // mrs   x0, TPIDR_EL0
	    // add   x0, x0, :tprel_hi12:a
	    // add   x0, x0, :tprel_lo12_nc:a
	    SDValue HiVar = TLSSym(AArch64II::MO_HI12);
	    SDValue LoVar = TLSSym(AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
	    SDValue Addr = Emit3(AArch64::ADDXri, ThreadBase, HiVar, 0);
	    return Emit3(AArch64::ADDXri, Addr, LoVar, 0);
	  }

	  case 32: {
	    // mrs   x1, TPIDR_EL0
	    // movz  x0, #:tprel_g1:a
	    // movk  x0, #:tprel_g0_nc:a
	    // add   x0, x1, x0
	    SDValue HiVar = TLSSym(AArch64II::MO_G1);
	    SDValue LoVar = TLSSym(AArch64II::MO_G0 | AArch64II::MO_NC);
	    SDValue TPOff = Emit2(AArch64::MOVZXi, HiVar, 16);
	    TPOff = Emit3(AArch64::MOVKXi, TPOff, LoVar, 0);
	    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
	  }

	  case 48: {
	    // mrs   x1, TPIDR_EL0
	    // movz  x0, #:tprel_g2:a
	    // movk  x0, #:tprel_g1_nc:a
	    // movk  x0, #:tprel_g0_nc:a
	    // add   x0, x1, x0
	    SDValue HiVar = TLSSym(AArch64II::MO_G2);
	    SDValue MiVar = TLSSym(AArch64II::MO_G1 | AArch64II::MO_NC);
	    SDValue LoVar = TLSSym(AArch64II::MO_G0 | AArch64II::MO_NC);
	    SDValue TPOff = Emit2(AArch64::MOVZXi, HiVar, 32);
	    TPOff = Emit3(AArch64::MOVKXi, TPOff, MiVar, 16);
	    TPOff = Emit3(AArch64::MOVKXi, TPOff, LoVar, 0);
	    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
	  }
	  }
	}

	/// Emit the "TLS-descriptor" call used by the general-dynamic and
	/// local-dynamic models. The variable has a descriptor, reachable through a
	/// PC-relative ADRP, whose first entry is a function pointer that performs
	/// the resolution.
	///
	/// The sequence is:
	///    adrp  x0, :tlsdesc:var
	///    ldr   x1, [x0, #:tlsdesc_lo12:var]
	///    add   x0, x0, #:tlsdesc_lo12:var
	///    .tlsdesccall var
	///    blr   x1
	///    (TPIDR_EL0 offset now in x0)
	///
	/// The linker can only optimize/relax this sequence if it is emitted
	/// unscheduled. It is therefore represented by one pseudo-instruction
	/// (TLSDESC_CALLSEQ) that is expanded very late in the compilation flow.
	///
	/// \returns a copy from X0, glued to the TLSDESC_CALLSEQ node.
	SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
	                                                      const SDLoc &DL,
	                                                      SelectionDAG &DAG) const {
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // The pseudo hangs off the entry node and yields a chain and a glue.
	  SDValue Ops[] = {DAG.getEntryNode(), SymAddr};
	  SDValue CallSeq =
	      DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL,
	                  DAG.getVTList(MVT::Other, MVT::Glue), Ops);

	  return DAG.getCopyFromReg(CallSeq, DL, AArch64::X0, PtrVT,
	                            CallSeq.getValue(1));
	}

	/// Lowers a thread-local global address on ELF targets.
	///
	/// The TLS model is taken from the target machine; local-dynamic is demoted
	/// to general-dynamic unless local-dynamic generation is enabled. Local-exec
	/// is delegated to LowerELFTLSLocalExec. Every other model computes an
	/// offset that is added to the thread pointer.
	///
	/// Reports a fatal error for the large code model with any model other than
	/// local-exec.
	SDValue
	AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetELF() && "This function expects an ELF target");

	  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
	  const GlobalValue *GV = GA->getGlobal();

	  TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
	  if (Model == TLSModel::LocalDynamic &&
	      !EnableAArch64ELFLocalDynamicTLSGeneration)
	    Model = TLSModel::GeneralDynamic;

	  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
	      Model != TLSModel::LocalExec)
	    report_fatal_error("ELF TLS only supported in small memory model or "
	                       "in local exec TLS model");
	  // Different choices can be made for the maximum size of the TLS area for a
	  // module. For the small address model, the default TLS size is 16MiB and the
	  // maximum TLS size is 4GiB.
	  // FIXME: add tiny and large code model support for TLS access models other
	  // than local exec. We currently generate the same code as small for tiny,
	  // which may be larger than needed.

	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  SDLoc DL(Op);

	  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);

	  if (Model == TLSModel::LocalExec)
	    return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);

	  SDValue TPOff;
	  if (Model == TLSModel::InitialExec) {
	    // The offset is loaded through a LOADgot of the MO_TLS-flagged symbol.
	    SDValue GotSym =
	        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
	    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotSym);
	  } else if (Model == TLSModel::LocalDynamic) {
	    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
	    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
	    // the beginning of the module's TLS region, followed by a DTPREL offset
	    // calculation.

	    // These accesses will need deduplicating if there's more than one.
	    AArch64FunctionInfo *MFI =
	        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
	    MFI->incNumLocalDynamicTLSAccesses();

	    // The call needs a relocation too for linker relaxation. It doesn't make
	    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
	    // the address.
	    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
	                                                  AArch64II::MO_TLS);

	    // Now we can calculate the offset from TPIDR_EL0 to this module's
	    // thread-local area.
	    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);

	    // Now use :dtprel_whatever: operations to calculate this variable's offset
	    // in its thread-storage area.
	    SDValue HiVar = DAG.getTargetGlobalAddress(
	        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
	    SDValue LoVar = DAG.getTargetGlobalAddress(
	        GV, DL, MVT::i64, 0,
	        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

	    // ADDXri of the running offset and one symbol fragment.
	    auto AddFragment = [&](SDValue Base, SDValue Fragment) {
	      return SDValue(
	          DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Base, Fragment,
	                             DAG.getTargetConstant(0, DL, MVT::i32)),
	          0);
	    };
	    TPOff = AddFragment(TPOff, HiVar);
	    TPOff = AddFragment(TPOff, LoVar);
	  } else if (Model == TLSModel::GeneralDynamic) {
	    // The call needs a relocation too for linker relaxation. It doesn't make
	    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
	    // the address.
	    SDValue SymAddr =
	        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);

	    // Finally we can make a call to calculate the offset from tpidr_el0.
	    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
	  } else {
	    llvm_unreachable("Unsupported ELF TLS access model");
	  }

	  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
	}

	/// Lowers a thread-local global address on Windows targets.
	///
	/// Three chained loads are emitted: the TLS array pointer from TEB + 0x58
	/// (the TEB is read from X18), the 32-bit `_tls_index`, and the slot
	/// TLSArray[_tls_index]. The variable's offset is then added to the loaded
	/// slot value in two steps (HI12 via ADDXri, then PAGEOFF via ADDlow).
	SDValue
	AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

	  SDValue Chain = DAG.getEntryNode();
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  SDLoc DL(Op);

	  SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);

	  // Load the ThreadLocalStoragePointer from the TEB
	  // A pointer to the TLS array is located at offset 0x58 from the TEB.
	  SDValue TLSArrayAddr =
	      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
	  SDValue TLSArray =
	      DAG.getLoad(PtrVT, DL, Chain, TLSArrayAddr, MachinePointerInfo());
	  Chain = TLSArray.getValue(1);

	  // Load the TLS index from the C runtime;
	  // This does the same as getAddr(), but without having a GlobalAddressSDNode.
	  // This also does the same as LOADgot, but using a generic i32 load,
	  // while LOADgot only loads i64.
	  SDValue TLSIndexHi =
	      DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
	  SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
	      "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
	  SDValue TLSIndexPage = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
	  SDValue TLSIndexAddr =
	      DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, TLSIndexPage, TLSIndexLo);
	  SDValue TLSIndex =
	      DAG.getLoad(MVT::i32, DL, Chain, TLSIndexAddr, MachinePointerInfo());
	  Chain = TLSIndex.getValue(1);

	  // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
	  // offset into the TLSArray.
	  TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
	  SDValue SlotOffset = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
	                                   DAG.getConstant(3, DL, PtrVT));
	  SDValue TLSBase =
	      DAG.getLoad(PtrVT, DL, Chain,
	                  DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, SlotOffset),
	                  MachinePointerInfo());

	  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
	  const GlobalValue *GV = GA->getGlobal();
	  SDValue TGAHi = DAG.getTargetGlobalAddress(
	      GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
	  SDValue TGALo = DAG.getTargetGlobalAddress(
	      GV, DL, PtrVT, 0,
	      AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

	  // Add the offset from the start of the .tls section (section base).
	  SDValue Addr =
	      SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLSBase, TGAHi,
	                                 DAG.getTargetConstant(0, DL, MVT::i32)),
	              0);
	  return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
	}

	/// Entry point for lowering a thread-local global address.
	///
	/// Emulated TLS, when enabled on the target machine, is handled first;
	/// otherwise the work is forwarded to the Darwin, ELF or Windows specific
	/// routine depending on the subtarget. Any other platform is unreachable.
	SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  const GlobalAddressSDNode *GlobalNode = cast<GlobalAddressSDNode>(Op);

	  // Emulated TLS overrides every platform specific scheme.
	  if (DAG.getTarget().useEmulatedTLS()) {
	    return LowerToTLSEmulatedModel(GlobalNode, DAG);
	  }

	  if (Subtarget->isTargetDarwin()) {
	    return LowerDarwinGlobalTLSAddress(Op, DAG);
	  }

	  if (Subtarget->isTargetELF()) {
	    return LowerELFGlobalTLSAddress(Op, DAG);
	  }

	  if (Subtarget->isTargetWindows()) {
	    return LowerWindowsGlobalTLSAddress(Op, DAG);
	  }

	  llvm_unreachable("Unexpected platform trying to use TLS");
	}

	/// Lower a BR_CC node (operands: chain, condition code, LHS, RHS, destination)
	/// into AArch64 branch nodes.
	///
	/// Handled in order:
	///  - f128 comparisons are softened first, leaving an integer comparison;
	///  - a {s|u}{add|sub|mul}.with.overflow result compared against one becomes
	///    a BRCOND on the overflow flag;
	///  - integer comparisons against 0 / -1 become CBZ/CBNZ/TBZ/TBNZ unless the
	///    function uses speculative load hardening, otherwise a compare + BRCOND;
	///  - remaining FP comparisons become one or two BRCONDs.
	///
	/// \returns The lowered branch, or an empty SDValue when the overflow
	///          intrinsic operates on a type that is not legal.
	SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Chain = Op.getOperand(0);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
	  SDValue LHS = Op.getOperand(2);
	  SDValue RHS = Op.getOperand(3);
	  SDValue Dest = Op.getOperand(4);
	  SDLoc dl(Op);

	  MachineFunction &MF = DAG.getMachineFunction();
	  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
	  // will not be produced, as they are conditional branch instructions that do
	  // not set flags.
	  bool ProduceNonFlagSettingCondBr =
	      !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);

	  // Handle f128 first, since lowering it will result in comparing the return
	  // value of a libcall against zero, which is just what the rest of LowerBR_CC
	  // is expecting to deal with.
	  if (LHS.getValueType() == MVT::f128) {
	    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);

	    // If softenSetCCOperands returned a scalar, we need to compare the result
	    // against zero to select between true and false values.
	    if (!RHS.getNode()) {
	      RHS = DAG.getConstant(0, dl, LHS.getValueType());
	      CC = ISD::SETNE;
	    }
	  }

	  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
	  // instruction.
	  if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
	      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
	    // Only lower legal XALUO ops.
	    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
	      return SDValue();

	    // The actual operation with overflow check.
	    AArch64CC::CondCode OFCC;
	    SDValue Value, Overflow;
	    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);

	    // "overflow != 1" branches on the opposite condition.
	    if (CC == ISD::SETNE)
	      OFCC = getInvertedCondCode(OFCC);
	    SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);

	    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	                       Overflow);
	  }

	  if (LHS.getValueType().isInteger()) {
	    assert((LHS.getValueType() == RHS.getValueType()) &&
	           (LHS.getValueType() == MVT::i32 ||
	            LHS.getValueType() == MVT::i64) &&
	           "Unexpected integer type for BR_CC lowering");

	    // If the RHS of the comparison is zero, we can potentially fold this
	    // to a specialized branch.
	    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
	    if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
	      if (CC == ISD::SETEQ || CC == ISD::SETNE) {
	        // SETEQ selects the "zero" flavours (TBZ/CBZ), SETNE the "non-zero"
	        // flavours (TBNZ/CBNZ); everything else is shared.
	        const bool BranchIfZero = CC == ISD::SETEQ;
	        const unsigned TestBitOpc =
	            BranchIfZero ? AArch64ISD::TBZ : AArch64ISD::TBNZ;
	        const unsigned CompareOpc =
	            BranchIfZero ? AArch64ISD::CBZ : AArch64ISD::CBNZ;

	        // See if we can use a TB(N)Z to fold in an AND as well.
	        // TBZ has a smaller branch displacement than CBZ. If the offset is
	        // out of bounds, a late MI-layer pass rewrites branches.
	        // 403.gcc is an example that hits this case.
	        if (LHS.getOpcode() == ISD::AND &&
	            isa<ConstantSDNode>(LHS.getOperand(1)) &&
	            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
	          SDValue Test = LHS.getOperand(0);
	          uint64_t Mask = LHS.getConstantOperandVal(1);
	          return DAG.getNode(TestBitOpc, dl, MVT::Other, Chain, Test,
	                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
	                             Dest);
	        }

	        return DAG.getNode(CompareOpc, dl, MVT::Other, Chain, LHS, Dest);
	      }

	      if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
	        // Don't combine AND since emitComparison converts the AND to an ANDS
	        // (a.k.a. TST) and the test in the test bit and branch instruction
	        // becomes redundant. This would also increase register pressure.
	        // "x < 0" is a test of the sign bit.
	        uint64_t Mask = LHS.getValueSizeInBits() - 1;
	        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
	                           DAG.getConstant(Mask, dl, MVT::i64), Dest);
	      }
	    }
	    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
	        LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
	      // Don't combine AND since emitComparison converts the AND to an ANDS
	      // (a.k.a. TST) and the test in the test bit and branch instruction
	      // becomes redundant. This would also increase register pressure.
	      // "x > -1" is a test that the sign bit is clear.
	      uint64_t Mask = LHS.getValueSizeInBits() - 1;
	      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
	                         DAG.getConstant(Mask, dl, MVT::i64), Dest);
	    }

	    SDValue CCVal;
	    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
	    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	                       Cmp);
	  }

	  assert((LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
	          LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
	         "Unexpected floating-point type for BR_CC lowering");

	  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
	  // clean. Some of them require two branches to implement.
	  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
	  AArch64CC::CondCode CC1, CC2;
	  changeFPCCToAArch64CC(CC, CC1, CC2);
	  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
	  SDValue BR1 =
	      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
	  if (CC2 != AArch64CC::AL) {
	    // The second branch is chained after the first and reuses the same flags.
	    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
	    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
	                       Cmp);
	  }

	  return BR1;
	}

	/// Lower FCOPYSIGN to an AArch64ISD::BIT node on an integer vector type.
	///
	/// The sign operand is first extended or rounded to the result type. Scalars
	/// are inserted into an undef vector through the matching sub-register,
	/// vectors are bitcast. A per-element sign-bit mask drives the BIT node, and
	/// the result is extracted or bitcast back to the original type.
	///
	/// Supported result types: f16, v4f16, v8f16, f32, v2f32, v4f32, f64, v2f64.
	SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  EVT VT = Op.getValueType();
	  SDLoc DL(Op);

	  SDValue In1 = Op.getOperand(0);
	  SDValue In2 = Op.getOperand(1);
	  EVT SrcVT = In2.getValueType();

	  // Bring the sign source to the same type as the result.
	  if (SrcVT.bitsLT(VT))
	    In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
	  else if (SrcVT.bitsGT(VT))
	    In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));

	  EVT VecVT;
	  uint64_t EltMask;
	  SDValue VecVal1, VecVal2;

	  // Materializes both operands as VecVT values. VecVT must be assigned before
	  // this is invoked; Idx is the sub-register index used for scalar inputs.
	  auto setVecVal = [&](int Idx) {
	    if (!VT.isVector()) {
	      VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
	                                          DAG.getUNDEF(VecVT), In1);
	      VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
	                                          DAG.getUNDEF(VecVT), In2);
	    } else {
	      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
	      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
	    }
	  };

	  if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
	    VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
	    EltMask = 0x80000000ULL;
	    setVecVal(AArch64::ssub);
	  } else if (VT == MVT::f64 || VT == MVT::v2f64) {
	    VecVT = MVT::v2i64;

	    // We want to materialize a mask with the high bit set, but the AdvSIMD
	    // immediate moves cannot materialize that in a single instruction for
	    // 64-bit elements. Instead, materialize zero and then negate it.
	    EltMask = 0;

	    setVecVal(AArch64::dsub);
	  } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
	    VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
	    EltMask = 0x8000ULL;
	    setVecVal(AArch64::hsub);
	  } else {
	    llvm_unreachable("Invalid type for copysign!");
	  }

	  SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);

	  // If we couldn't materialize the mask above, then the mask vector will be
	  // the zero vector, and we need to negate it here.
	  if (VT == MVT::f64 || VT == MVT::v2f64) {
	    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
	    BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
	    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
	  }

	  SDValue Sel =
	      DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);

	  // Scalars come back out through the sub-register they went in by; vectors
	  // only need a bitcast.
	  if (VT == MVT::f16)
	    return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
	  if (VT == MVT::f32)
	    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
	  if (VT == MVT::f64)
	    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
	  return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
	}

	/// Lower CTPOP through AdvSIMD byte-wise population counts.
	///
	/// Scalars (i32, i64, i128) are bitcast to a byte vector, counted per byte
	/// and summed with the uaddlv intrinsic. Vectors with wider elements are
	/// counted per byte and then widened with repeated uaddlp pairwise adds.
	///
	/// \returns The lowered value, or an empty SDValue when the function is
	///          marked noimplicitfloat or the subtarget has no NEON.
	SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
	  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
	          Attribute::NoImplicitFloat))
	    return SDValue();

	  if (!Subtarget->hasNEON())
	    return SDValue();

	  // While there is no integer popcount instruction, it can
	  // be more efficiently lowered to the following sequence that uses
	  // AdvSIMD registers/instructions as long as the copies to/from
	  // the AdvSIMD registers are cheap.
	  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
	  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
	  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
	  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
	  SDValue Val = Op.getOperand(0);
	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();

	  if (VT == MVT::i32 || VT == MVT::i64) {
	    // i32 inputs are widened so that both cases go through a v8i8 count.
	    if (VT == MVT::i32)
	      Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
	    Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);

	    SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
	    SDValue UaddLV = DAG.getNode(
	        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
	        DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);

	    if (VT == MVT::i64)
	      UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
	    return UaddLV;
	  } else if (VT == MVT::i128) {
	    Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);

	    SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
	    SDValue UaddLV = DAG.getNode(
	        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
	        DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);

	    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
	  }

	  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
	          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
	         "Unexpected type for custom ctpop lowering");

	  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
	  Val = DAG.getBitcast(VT8Bit, Val);
	  Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);

	  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
	  // Each step doubles the element width and halves the element count.
	  unsigned EltSize = 8;
	  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
	  while (EltSize != VT.getScalarSizeInBits()) {
	    EltSize *= 2;
	    NumElts /= 2;
	    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
	    Val = DAG.getNode(
	        ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
	        DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
	  }

	  return Val;
	}

	/// Lower SETCC, STRICT_FSETCC and STRICT_FSETCCS.
	///
	/// Vector results are forwarded to LowerVSETCC. For scalars the comparison is
	/// turned into a flag-setting compare followed by one or two CSELs choosing
	/// between the constants 1 and 0. Strict nodes carry a chain as operand 0 and
	/// return the result merged with the output chain.
	SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

	  if (Op.getValueType().isVector())
	    return LowerVSETCC(Op, DAG);

	  bool IsStrict = Op->isStrictFPOpcode();
	  bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
	  // Strict nodes have the chain in front of the usual operands.
	  unsigned OpNo = IsStrict ? 1 : 0;
	  SDValue Chain;
	  if (IsStrict)
	    Chain = Op.getOperand(0);
	  SDValue LHS = Op.getOperand(OpNo + 0);
	  SDValue RHS = Op.getOperand(OpNo + 1);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
	  SDLoc dl(Op);

	  // We chose ZeroOrOneBooleanContents, so use zero and one.
	  EVT VT = Op.getValueType();
	  SDValue TVal = DAG.getConstant(1, dl, VT);
	  SDValue FVal = DAG.getConstant(0, dl, VT);

	  // Handle f128 first, since one possible outcome is a normal integer
	  // comparison which gets picked up by the next if statement.
	  if (LHS.getValueType() == MVT::f128) {
	    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
	                        IsSignaling);

	    // If softenSetCCOperands returned a scalar, use it.
	    if (!RHS.getNode()) {
	      assert(LHS.getValueType() == Op.getValueType() &&
	             "Unexpected setcc expansion!");
	      return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
	    }
	  }

	  if (LHS.getValueType().isInteger()) {
	    SDValue CCVal;
	    SDValue Cmp = getAArch64Cmp(
	        LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);

	    // Note that we inverted the condition above, so we reverse the order of
	    // the true and false operands here.  This will allow the setcc to be
	    // matched to a single CSINC instruction.
	    SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
	    return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
	  }

	  // Now we know we're dealing with FP values.
	  assert((LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
	          LHS.getValueType() == MVT::f64) &&
	         "Unexpected floating-point type for SETCC lowering");

	  // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
	  // and do the comparison.
	  SDValue Cmp;
	  if (IsStrict)
	    Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
	  else
	    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

	  AArch64CC::CondCode CC1, CC2;
	  changeFPCCToAArch64CC(CC, CC1, CC2);
	  SDValue Res;
	  if (CC2 == AArch64CC::AL) {
	    // A single condition suffices: recompute it for the inverted predicate.
	    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
	                          CC2);
	    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);

	    // Note that we inverted the condition above, so we reverse the order of
	    // the true and false operands here.  This will allow the setcc to be
	    // matched to a single CSINC instruction.
	    Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
	  } else {
	    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
	    // totally clean.  Some of them require two CSELs to implement.  As is in
	    // this case, we emit the first CSEL and then emit a second using the output
	    // of the first as the RHS.  We're effectively OR'ing the two CC's together.

	    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
	    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
	    SDValue CS1 =
	        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

	    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
	    Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
	  }
	  // For strict nodes the output chain is the second result of the compare.
	  return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
	}

	/// Lower "(LHS CC RHS) ? TVal : FVal" to AArch64 conditional-select nodes.
	///
	/// f128 comparisons are softened first and f16 comparisons are promoted to
	/// f32 when full FP16 is unavailable. Integer comparisons try to pick
	/// CSINV/CSNEG/CSINC over a plain CSEL and to reuse the compared register
	/// instead of materializing a constant. FP comparisons use one or two CSELs.
	///
	/// \param CC   Condition code of the comparison.
	/// \param LHS  Left operand of the comparison.
	/// \param RHS  Right operand of the comparison.
	/// \param TVal Value selected when the comparison holds.
	/// \param FVal Value selected otherwise.
	/// \param dl   Source location for the created nodes.
	/// \param DAG  The selection DAG being built.
	SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
	                                              SDValue RHS, SDValue TVal,
	                                              SDValue FVal, const SDLoc &dl,
	                                              SelectionDAG &DAG) const {
	  // Handle f128 first, because it will result in a comparison of some RTLIB
	  // call result against zero.
	  if (LHS.getValueType() == MVT::f128) {
	    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);

	    // If softenSetCCOperands returned a scalar, we need to compare the result
	    // against zero to select between true and false values.
	    if (!RHS.getNode()) {
	      RHS = DAG.getConstant(0, dl, LHS.getValueType());
	      CC = ISD::SETNE;
	    }
	  }

	  // Also handle f16, for which we need to do a f32 comparison.
	  if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
	    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
	    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
	  }

	  // Next, handle integers.
	  if (LHS.getValueType().isInteger()) {
	    assert((LHS.getValueType() == RHS.getValueType()) &&
	           (LHS.getValueType() == MVT::i32 ||
	            LHS.getValueType() == MVT::i64) &&
	           "Unexpected integer type for SELECT_CC lowering");

	    unsigned Opcode = AArch64ISD::CSEL;

	    // If both the TVal and the FVal are constants, see if we can swap them in
	    // order to form a CSINV or CSINC out of them.
	    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
	    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

	    // Exchanges the two select arms (and their cached constant nodes) and
	    // inverts the condition so that the overall select is unchanged.
	    auto SwapArmsAndInvertCC = [&]() {
	      std::swap(TVal, FVal);
	      std::swap(CTVal, CFVal);
	      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
	    };

	    if (CTVal && CFVal && CFVal->isNullValue() &&
	        (CTVal->isAllOnesValue() || CTVal->isOne())) {
	      // "cc ? -1 : 0" and "cc ? 1 : 0": put the zero on the true side.
	      SwapArmsAndInvertCC();
	    } else if (TVal.getOpcode() == ISD::XOR) {
	      // If TVal is a NOT we want to swap TVal and FVal so that we can match
	      // with a CSINV rather than a CSEL.
	      if (isAllOnesConstant(TVal.getOperand(1)))
	        SwapArmsAndInvertCC();
	    } else if (TVal.getOpcode() == ISD::SUB) {
	      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
	      // that we can match with a CSNEG rather than a CSEL.
	      if (isNullConstant(TVal.getOperand(0)))
	        SwapArmsAndInvertCC();
	    } else if (CTVal && CFVal) {
	      const int64_t TrueVal = CTVal->getSExtValue();
	      const int64_t FalseVal = CFVal->getSExtValue();
	      bool Swap = false;

	      // If both TVal and FVal are constants, see if FVal is the
	      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
	      // instead of a CSEL in that case.
	      if (TrueVal == ~FalseVal) {
	        Opcode = AArch64ISD::CSINV;
	      } else if (TrueVal == -FalseVal) {
	        Opcode = AArch64ISD::CSNEG;
	      } else if (TVal.getValueType() == MVT::i32) {
	        // If our operands are only 32-bit wide, make sure we use 32-bit
	        // arithmetic for the check whether we can use CSINC. This ensures that
	        // the addition in the check will wrap around properly in case there is
	        // an overflow (which would not be the case if we do the check with
	        // 64-bit arithmetic).
	        const uint32_t TrueVal32 = CTVal->getZExtValue();
	        const uint32_t FalseVal32 = CFVal->getZExtValue();

	        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
	          Opcode = AArch64ISD::CSINC;

	          if (TrueVal32 > FalseVal32) {
	            Swap = true;
	          }
	        }
	        // 64-bit check whether we can use CSINC.
	      } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
	        Opcode = AArch64ISD::CSINC;

	        if (TrueVal > FalseVal) {
	          Swap = true;
	        }
	      }

	      // Swap TVal and FVal if necessary.
	      if (Swap)
	        SwapArmsAndInvertCC();

	      if (Opcode != AArch64ISD::CSEL) {
	        // Drop FVal since we can get its value by simply inverting/negating
	        // TVal.
	        FVal = TVal;
	      }
	    }

	    // Avoid materializing a constant when possible by reusing a known value in
	    // a register.  However, don't perform this optimization if the known value
	    // is one, zero or negative one in the case of a CSEL.  We can always
	    // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
	    // FVal, respectively.
	    ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
	    if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
	        !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
	      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
	      // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
	      // "a != C ? x : a" to avoid materializing C.
	      // The constants are matched by node identity, not by value.
	      if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
	        TVal = LHS;
	      else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
	        FVal = LHS;
	    } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
	      assert(CTVal && CFVal && "Expected constant operands for CSNEG.");
	      // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
	      // avoid materializing C.
	      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
	      if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
	        Opcode = AArch64ISD::CSINV;
	        TVal = LHS;
	        FVal = DAG.getConstant(0, dl, FVal.getValueType());
	      }
	    }

	    SDValue CCVal;
	    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
	    EVT VT = TVal.getValueType();
	    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
	  }

	  // Now we know we're dealing with FP values.
	  assert((LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
	          LHS.getValueType() == MVT::f64) &&
	         "Unexpected floating-point type for SELECT_CC lowering");
	  assert(LHS.getValueType() == RHS.getValueType());
	  EVT VT = TVal.getValueType();
	  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

	  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
	  // clean.  Some of them require two CSELs to implement.
	  AArch64CC::CondCode CC1, CC2;
	  changeFPCCToAArch64CC(CC, CC1, CC2);

	  if (DAG.getTarget().Options.UnsafeFPMath) {
	    // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
	    // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
	    ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
	    if (RHSVal && RHSVal->isZero()) {
	      ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
	      ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);

	      if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
	          CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
	        TVal = LHS;
	      else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
	               CFVal && CFVal->isZero() &&
	               FVal.getValueType() == LHS.getValueType())
	        FVal = LHS;
	    }
	  }

	  // Emit first, and possibly only, CSEL.
	  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
	  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

	  // If we need a second CSEL, emit it, using the output of the first as the
	  // RHS.  We're effectively OR'ing the two CC's together.
	  if (CC2 != AArch64CC::AL) {
	    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
	    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
	  }

	  // Otherwise, return the output of the first CSEL.
	  return CS1;
	}

	/// Node-based overload: unpacks a SELECT_CC node and forwards to the
	/// operand-based LowerSELECT_CC.
	///
	/// Operand layout read here: 0 = LHS, 1 = RHS, 2 = true value,
	/// 3 = false value, 4 = condition code.
	SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  SDValue CmpLHS = Op.getOperand(0);
	  SDValue CmpRHS = Op.getOperand(1);
	  SDValue IfTrue = Op.getOperand(2);
	  SDValue IfFalse = Op.getOperand(3);
	  ISD::CondCode Pred = cast<CondCodeSDNode>(Op.getOperand(4))->get();
	  return LowerSELECT_CC(Pred, CmpLHS, CmpRHS, IfTrue, IfFalse, Loc, DAG);
	}

	/// Lower a SELECT node (operands: condition, true value, false value).
	///
	/// Scalable-vector results become a VSELECT on a splatted i1 predicate. A
	/// condition that is the overflow result of a *.with.overflow operation is
	/// selected directly on the overflow flag. Everything else is rewritten as
	/// a comparison and handed to LowerSELECT_CC.
	///
	/// \returns The lowered value, or an empty SDValue when the overflow
	///          operation works on a type that is not legal.
	SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDValue CCVal = Op->getOperand(0);
	  SDValue TVal = Op->getOperand(1);
	  SDValue FVal = Op->getOperand(2);
	  SDLoc DL(Op);

	  EVT Ty = Op.getValueType();
	  if (Ty.isScalableVector()) {
	    // Broadcast the scalar condition into a predicate vector.
	    SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
	    MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
	    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
	    return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
	  }

	  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
	  // instruction.
	  if (ISD::isOverflowIntrOpRes(CCVal)) {
	    // Only lower legal XALUO ops.
	    if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
	      return SDValue();

	    AArch64CC::CondCode OFCC;
	    SDValue Value, Overflow;
	    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
	    // Named distinctly from the outer CCVal, which is read just above.
	    SDValue OFCCVal = DAG.getConstant(OFCC, DL, MVT::i32);

	    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
	                       OFCCVal, Overflow);
	  }

	  // Lower it the same way as we would lower a SELECT_CC node.
	  ISD::CondCode CC;
	  SDValue LHS, RHS;
	  if (CCVal.getOpcode() == ISD::SETCC) {
	    LHS = CCVal.getOperand(0);
	    RHS = CCVal.getOperand(1);
	    CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
	  } else {
	    // An arbitrary condition value is treated as "value != 0".
	    LHS = CCVal;
	    RHS = DAG.getConstant(0, DL, CCVal.getValueType());
	    CC = ISD::SETNE;
	  }
	  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
	}

	/// Produce the address of a jump table.
	///
	/// Jump table entries are PC relative offsets, so only the table address is
	/// needed here; the addressing sequence depends on the code model.
	SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  JumpTableSDNode *Table = cast<JumpTableSDNode>(Op);
	  const auto Model = getTargetMachine().getCodeModel();

	  // The large code model sequence is not used on MachO.
	  if (Model == CodeModel::Large && !Subtarget->isTargetMachO())
	    return getAddrLarge(Table, DAG);

	  if (Model == CodeModel::Tiny)
	    return getAddrTiny(Table, DAG);

	  return getAddr(Table, DAG);
	}

	/// Lower a BR_JT node (operands: chain, jump table, entry index).
	///
	/// A JumpTableDest32 machine node is built from the table, the entry and the
	/// table's target index; its first result is the destination of an indirect
	/// branch (BRIND) on the incoming chain.
	SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
	                                          SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  SDValue InChain = Op.getOperand(0);
	  SDValue Table = Op.getOperand(1);
	  SDValue Index = Op.getOperand(2);

	  const int TableIdx = cast<JumpTableSDNode>(Table.getNode())->getIndex();
	  SDValue TableRef = DAG.getTargetJumpTable(TableIdx, MVT::i32);

	  SDNode *DestNode = DAG.getMachineNode(AArch64::JumpTableDest32, Loc,
	                                        MVT::i64, MVT::i64, Table, Index,
	                                        TableRef);
	  SDValue Target(DestNode, 0);
	  return DAG.getNode(ISD::BRIND, Loc, MVT::Other, InChain, Target);
	}

	/// Produce the address of a constant pool entry, choosing the addressing
	/// sequence from the code model (and, for the large model, from whether the
	/// target is MachO).
	SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  ConstantPoolSDNode *Pool = cast<ConstantPoolSDNode>(Op);
	  const auto Model = getTargetMachine().getCodeModel();

	  if (Model == CodeModel::Large) {
	    // Use the GOT for the large code model on iOS.
	    if (Subtarget->isTargetMachO())
	      return getGOT(Pool, DAG);
	    return getAddrLarge(Pool, DAG);
	  }

	  if (Model == CodeModel::Tiny)
	    return getAddrTiny(Pool, DAG);

	  return getAddr(Pool, DAG);
	}

	/// Produce the address of a basic block, choosing the addressing sequence
	/// from the code model.
	SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  BlockAddressSDNode *Block = cast<BlockAddressSDNode>(Op);
	  const auto Model = getTargetMachine().getCodeModel();

	  // The large code model sequence is not used on MachO.
	  if (Model == CodeModel::Large && !Subtarget->isTargetMachO())
	    return getAddrLarge(Block, DAG);

	  if (Model == CodeModel::Tiny)
	    return getAddrTiny(Block, DAG);

	  return getAddr(Block, DAG);
	}

	/// Lower VASTART for Darwin: store the address of the variadic stack area
	/// into the va_list object.
	///
	/// Operands read: 0 = chain, 1 = va_list address, 2 = source value used for
	/// the store's pointer info.
	SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
	                                                   SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  const DataLayout &Layout = DAG.getDataLayout();
	  AArch64FunctionInfo *AFI =
	      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

	  // Address of the first stack-passed variadic argument, converted to the
	  // in-memory pointer type before it is stored.
	  SDValue StackArea =
	      DAG.getFrameIndex(AFI->getVarArgsStackIndex(), getPointerTy(Layout));
	  StackArea = DAG.getZExtOrTrunc(StackArea, Loc, getPointerMemTy(Layout));

	  const Value *ListSV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  SDValue InChain = Op.getOperand(0);
	  SDValue ListAddr = Op.getOperand(1);
	  return DAG.getStore(InChain, Loc, StackArea, ListAddr,
	                      MachinePointerInfo(ListSV));
	}

	/// Lower VASTART for the Win64 calling convention: store the start of the
	/// variadic area into the va_list object.
	///
	/// The GPR save area is used when its recorded size is non-zero, otherwise
	/// the stack area is used.
	SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  AArch64FunctionInfo *AFI =
	      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

	  const int StartIndex = AFI->getVarArgsGPRSize() > 0
	                             ? AFI->getVarArgsGPRIndex()
	                             : AFI->getVarArgsStackIndex();
	  SDValue StartAddr =
	      DAG.getFrameIndex(StartIndex, getPointerTy(DAG.getDataLayout()));

	  const Value *ListSV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  SDValue InChain = Op.getOperand(0);
	  SDValue ListAddr = Op.getOperand(1);
	  return DAG.getStore(InChain, Loc, StartAddr, ListAddr,
	                      MachinePointerInfo(ListSV));
	}

	/// Lower VASTART for AAPCS by initializing the fields of the va_list struct.
	///
	/// The layout of the va_list struct is specified in the AArch64 Procedure Call
	/// Standard, section B.3. Fields written here:
	///   offset  0: void *__stack
	///   offset  8: void *__gr_top  (only when a GPR save area exists)
	///   offset 16: void *__vr_top  (only when an FPR save area exists)
	///   offset 24: int   __gr_offs
	///   offset 28: int   __vr_offs
	/// All stores hang off the incoming chain and are joined by a TokenFactor.
	SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDLoc DL(Op);

	  SDValue InChain = Op.getOperand(0);
	  SDValue ListBase = Op.getOperand(1);
	  const Value *ListSV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  SmallVector<SDValue, 4> Stores;

	  // Address of the va_list field located Offset bytes into the struct.
	  auto FieldAddr = [&](unsigned Offset) {
	    return DAG.getNode(ISD::ADD, DL, PtrVT, ListBase,
	                       DAG.getConstant(Offset, DL, PtrVT));
	  };

	  // void *__stack at offset 0
	  SDValue StackPtr = DAG.getFrameIndex(AFI->getVarArgsStackIndex(), PtrVT);
	  Stores.push_back(DAG.getStore(InChain, DL, StackPtr, ListBase,
	                                MachinePointerInfo(ListSV),
	                                /* Alignment = */ 8));

	  // void *__gr_top at offset 8
	  int GPRSize = AFI->getVarArgsGPRSize();
	  if (GPRSize > 0) {
	    SDValue GRTopAddr = FieldAddr(8);

	    // The top is the end of the GPR save area.
	    SDValue GRTop = DAG.getFrameIndex(AFI->getVarArgsGPRIndex(), PtrVT);
	    GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
	                        DAG.getConstant(GPRSize, DL, PtrVT));

	    Stores.push_back(DAG.getStore(InChain, DL, GRTop, GRTopAddr,
	                                  MachinePointerInfo(ListSV, 8),
	                                  /* Alignment = */ 8));
	  }

	  // void *__vr_top at offset 16
	  int FPRSize = AFI->getVarArgsFPRSize();
	  if (FPRSize > 0) {
	    SDValue VRTopAddr = FieldAddr(16);

	    // The top is the end of the FPR save area.
	    SDValue VRTop = DAG.getFrameIndex(AFI->getVarArgsFPRIndex(), PtrVT);
	    VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
	                        DAG.getConstant(FPRSize, DL, PtrVT));

	    Stores.push_back(DAG.getStore(InChain, DL, VRTop, VRTopAddr,
	                                  MachinePointerInfo(ListSV, 16),
	                                  /* Alignment = */ 8));
	  }

	  // int __gr_offs at offset 24
	  SDValue GROffsAddr = FieldAddr(24);
	  SDValue GROffs = DAG.getConstant(-GPRSize, DL, MVT::i32);
	  Stores.push_back(DAG.getStore(InChain, DL, GROffs, GROffsAddr,
	                                MachinePointerInfo(ListSV, 24),
	                                /* Alignment = */ 4));

	  // int __vr_offs at offset 28
	  SDValue VROffsAddr = FieldAddr(28);
	  SDValue VROffs = DAG.getConstant(-FPRSize, DL, MVT::i32);
	  Stores.push_back(DAG.getStore(InChain, DL, VROffs, VROffsAddr,
	                                MachinePointerInfo(ListSV, 28),
	                                /* Alignment = */ 4));

	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
	}

	/// Dispatch VASTART lowering: the Win64 calling convention is checked first,
	/// then Darwin targets, and AAPCS is the fallback.
	SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  const MachineFunction &CurFn = DAG.getMachineFunction();
	  const bool UsesWin64CC =
	      Subtarget->isCallingConvWin64(CurFn.getFunction().getCallingConv());

	  if (UsesWin64CC) {
	    return LowerWin64_VASTART(Op, DAG);
	  }

	  if (Subtarget->isTargetDarwin()) {
	    return LowerDarwin_VASTART(Op, DAG);
	  }

	  return LowerAAPCS_VASTART(Op, DAG);
	}

	/// Lower VACOPY to a memcpy of the whole va_list object.
	///
	/// Operands read: 0 = chain, 1 = destination, 2 = source, 3 and 4 = source
	/// values used for the destination and source pointer info.
	SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
	  // pointer.
	  SDLoc DL(Op);
	  unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
	  // Windows is sized like Darwin here: one pointer.
	  unsigned VaListSize =
	      (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) ? PtrSize
	                                                                    : 32;
	  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
	  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

	  return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
	                       DAG.getConstant(VaListSize, DL, MVT::i32),
	                       Align(PtrSize), false, false, false,
	                       MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
	}

	/// Lower VAARG for Darwin, where the va_list is a single pointer.
	///
	/// The current pointer is loaded from the va_list, aligned upwards when the
	/// requested alignment exceeds the minimum slot size, advanced by the
	/// argument's slot size and stored back; the argument is then loaded from the
	/// (aligned) old pointer. Scalar FP types other than f64 are loaded as f64
	/// and rounded to the requested type.
	///
	/// Operands read: 0 = chain, 1 = va_list address, 2 = source value,
	/// 3 = alignment constant.
	///
	/// \returns The loaded argument together with its output chain.
	SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetDarwin() &&
	         "automatic va_arg instruction only works on Darwin");

	  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  EVT VT = Op.getValueType();
	  SDLoc DL(Op);
	  SDValue Chain = Op.getOperand(0);
	  SDValue Addr = Op.getOperand(1);
	  MaybeAlign Align(Op.getConstantOperandVal(3));
	  unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
	  SDValue VAList =
	      DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
	  Chain = VAList.getValue(1);
	  VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);

	  if (Align && *Align > MinSlotSize) {
	    // Round the pointer up to the requested alignment:
	    // (p + align - 1) & -align.
	    VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
	                         DAG.getConstant(Align->value() - 1, DL, PtrVT));
	    VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
	                         DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
	  }

	  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
	  unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);

	  // Scalar integer and FP values smaller than 64 bits are implicitly extended
	  // up to 64 bits.  At the very least, we have to increase the striding of the
	  // vaargs list to match this, and for FP values we need to introduce
	  // FP_ROUND nodes as well.
	  if (VT.isInteger() && !VT.isVector())
	    ArgSize = std::max(ArgSize, MinSlotSize);
	  bool NeedFPTrunc = false;
	  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
	    ArgSize = 8;
	    NeedFPTrunc = true;
	  }

	  // Increment the pointer, VAList, to the next vaarg
	  SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
	                               DAG.getConstant(ArgSize, DL, PtrVT));
	  VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);

	  // Store the incremented VAList to the legalized pointer
	  SDValue APStore =
	      DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));

	  // Load the actual argument out of the pointer VAList
	  if (NeedFPTrunc) {
	    // Load the value as an f64.
	    SDValue WideFP =
	        DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
	    // Round the value down to the requested type.
	    SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
	                                   DAG.getIntPtrConstant(1, DL));
	    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
	    // Merge the rounded value with the chain output of the load.
	    return DAG.getMergeValues(Ops, DL);
	  }

	  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
	}

	/// Lower ISD::FRAMEADDR.
	///
	/// Operand 0 is the constant depth. The result starts as a copy of FP and is
	/// then loaded through once per requested level. On ILP32 the result is
	/// additionally wrapped in an AssertZext of the node's value type.
	SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  // Record on the frame info that the frame address is taken.
	  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);

	  SDLoc DL(Op);
	  EVT ResultVT = Op.getValueType();
	  const unsigned NumLevels =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

	  // Level 0 is the current frame pointer register.
	  SDValue Frame =
	      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);

	  // Each further level loads the next address through the previous one.
	  for (unsigned Level = 0; Level != NumLevels; ++Level)
	    Frame = DAG.getLoad(ResultVT, DL, DAG.getEntryNode(), Frame,
	                        MachinePointerInfo());

	  if (!Subtarget->isTargetILP32())
	    return Frame;

	  return DAG.getNode(ISD::AssertZext, DL, MVT::i64, Frame,
	                     DAG.getValueType(ResultVT));
	}

	/// Lower the sponentry node to a frame index.
	///
	/// Creates a fixed frame object (arguments 4, 0, false) and returns a
	/// FrameIndex node for it typed as the target pointer type.
	SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();

	  // NOTE(review): presumably (size, SP offset, immutable) per
	  // MachineFrameInfo::CreateFixedObject; confirm against the API.
	  const int FI = MFI.CreateFixedObject(4, 0, false);
	  return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
	}

	#define GET_REGISTER_MATCHER
	#include "AArch64GenAsmMatcher.inc"

	// FIXME? Maybe this could be a TableGen attribute on some registers and
	// this table could be generated automatically from RegInfo.
	//
	/// Resolve a register name to a Register.
	///
	/// Names matching X1..X28 are accepted only when the subtarget reports that
	/// X register as reserved; any name that does not resolve to a usable
	/// register is a fatal error.
	Register AArch64TargetLowering::
	getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
	  Register Reg = MatchRegisterName(RegName);

	  const bool InX1ToX28 = AArch64::X1 <= Reg && Reg <= AArch64::X28;
	  if (InX1ToX28) {
	    // The reservation query is keyed by the DWARF register number.
	    const MCRegisterInfo *RegInfo = Subtarget->getRegisterInfo();
	    const unsigned DwarfNum = RegInfo->getDwarfRegNum(Reg, false);
	    if (!Subtarget->isXRegisterReserved(DwarfNum))
	      Reg = 0;
	  }

	  if (!Reg)
	    report_fatal_error(Twine("Invalid register name \""
	                             + StringRef(RegName) + "\"."));
	  return Reg;
	}

	/// Lower the address-of-return-address node.
	///
	/// Marks the frame address as taken and returns FP + 8 in the node's value
	/// type.
	SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
	  FrameInfo.setFrameAddressIsTaken(true);

	  SDLoc DL(Op);
	  EVT ResultVT = Op.getValueType();

	  SDValue FramePtr =
	      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, ResultVT);
	  SDValue EightBytes =
	      DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));

	  return DAG.getNode(ISD::ADD, DL, ResultVT, FramePtr, EightBytes);
	}

	/// Lower ISD::RETURNADDR.
	///
	/// Operand 0 is the constant depth. Depth 0 yields a copy of LR (added as a
	/// function live-in); any other depth loads from the address 8 bytes above
	/// the frame address produced by LowerFRAMEADDR for the same node.
	SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MF.getFrameInfo().setReturnAddressIsTaken(true);

	  SDLoc DL(Op);
	  EVT ResultVT = Op.getValueType();
	  const unsigned Depth =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

	  if (Depth == 0) {
	    // Return LR, which contains the return address. Mark it an implicit
	    // live-in.
	    unsigned LinkReg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
	    return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LinkReg, ResultVT);
	  }

	  SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
	  SDValue SlotOffset =
	      DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
	  SDValue SlotAddr =
	      DAG.getNode(ISD::ADD, DL, ResultVT, FrameAddr, SlotOffset);
	  return DAG.getLoad(ResultVT, DL, DAG.getEntryNode(), SlotAddr,
	                     MachinePointerInfo());
	}

	/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
	/// i64 values and take a 2 x i64 value to shift plus a shift amount.
	///
	/// Operands are read as (Lo, Hi, ShAmt). Both result halves are produced by
	/// a CSEL between a "normal" result (ShAmt < VTBits) and a "big" result
	/// (ShAmt >= VTBits); the pair is returned as merged values {Lo, Hi}.
	SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
	SelectionDAG &DAG) const {
	assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	EVT VT = Op.getValueType();
	unsigned VTBits = VT.getSizeInBits();
	SDLoc dl(Op);
	SDValue ShOpLo = Op.getOperand(0);
	SDValue ShOpHi = Op.getOperand(1);
	SDValue ShAmt = Op.getOperand(2);
	// Arithmetic shift for SRA_PARTS, logical shift for SRL_PARTS.
	unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

	assert(Op.getOpcode() == ISD::SRA_PARTS \|\| Op.getOpcode() == ISD::SRL_PARTS);

	// Bits of Hi that move into Lo for a normal shift: Hi << (VTBits - ShAmt).
	SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
	DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
	SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);

	// Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
	// is "undef". We wanted 0, so CSEL it directly.
	SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
	ISD::SETEQ, dl, DAG);
	SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
	HiBitsForLo =
	DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
	HiBitsForLo, CCVal, Cmp);

	// ExtraShAmt = ShAmt - VTBits; it is >= 0 exactly for a "big" shift.
	SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
	DAG.getConstant(VTBits, dl, MVT::i64));

	SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
	SDValue LoForNormalShift =
	DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);

	// From here on Cmp/CCVal describe "ExtraShAmt >= 0" and are shared by the
	// Lo and Hi selections below.
	Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
	dl, DAG);
	CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
	SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
	SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
	LoForNormalShift, CCVal, Cmp);

	// AArch64 shifts larger than the register width are wrapped rather than
	// clamped, so we can't just emit "hi >> x".
	SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
	// For a big shift the high half is Hi shifted by VTBits - 1 for SRA, or
	// zero for SRL.
	SDValue HiForBigShift =
	Opc == ISD::SRA
	? DAG.getNode(Opc, dl, VT, ShOpHi,
	DAG.getConstant(VTBits - 1, dl, MVT::i64))
	: DAG.getConstant(0, dl, VT);
	SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
	HiForNormalShift, CCVal, Cmp);

	SDValue Ops[2] = { Lo, Hi };
	return DAG.getMergeValues(Ops, dl);
	}

	/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
	/// i64 values and take a 2 x i64 value to shift plus a shift amount.
	///
	/// Operands are read as (Lo, Hi, ShAmt). Both result halves are produced by
	/// a CSEL between a "normal" result (ShAmt < VTBits) and a "big" result
	/// (ShAmt >= VTBits); the pair is returned as merged values {Lo, Hi}.
	SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
	SelectionDAG &DAG) const {
	assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	EVT VT = Op.getValueType();
	unsigned VTBits = VT.getSizeInBits();
	SDLoc dl(Op);
	SDValue ShOpLo = Op.getOperand(0);
	SDValue ShOpHi = Op.getOperand(1);
	SDValue ShAmt = Op.getOperand(2);

	assert(Op.getOpcode() == ISD::SHL_PARTS);
	// Bits of Lo that move into Hi for a normal shift: Lo >> (VTBits - ShAmt).
	SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
	DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
	SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);

	// Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
	// is "undef". We wanted 0, so CSEL it directly.
	SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
	ISD::SETEQ, dl, DAG);
	SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
	LoBitsForHi =
	DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
	LoBitsForHi, CCVal, Cmp);

	// ExtraShAmt = ShAmt - VTBits; it is >= 0 exactly for a "big" shift.
	SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
	DAG.getConstant(VTBits, dl, MVT::i64));
	SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
	SDValue HiForNormalShift =
	DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);

	// For a big shift the high half is Lo shifted left by the excess amount.
	SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

	// From here on Cmp/CCVal describe "ExtraShAmt >= 0" and are shared by the
	// Hi and Lo selections below.
	Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
	dl, DAG);
	CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
	SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
	HiForNormalShift, CCVal, Cmp);

	// AArch64 shifts of larger than register sizes are wrapped rather than
	// clamped, so we can't just emit "lo << a" if a is too big.
	SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
	SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
	SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
	LoForNormalShift, CCVal, Cmp);

	SDValue Ops[2] = { Lo, Hi };
	return DAG.getMergeValues(Ops, dl);
	}

	/// Report whether an offset may be folded into the given global address.
	///
	/// Always answers false: offsets are folded in the DAG combine instead, so
	/// that the offset can be chosen intelligently based on the uses.
	bool AArch64TargetLowering::isOffsetFoldingLegal(
	    const GlobalAddressSDNode *GA) const {
	  return false;
	}

	bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
	bool OptForSize) const {
	bool IsLegal = false;
	// We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
	// 16-bit case when target has full fp16 support.
	// FIXME: We should be able to handle f128 as well with a clever lowering.
	const APInt ImmInt = Imm.bitcastToAPInt();
	if (VT == MVT::f64)
	IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 \|\| Imm.isPosZero();
	else if (VT == MVT::f32)
	IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 \|\| Imm.isPosZero();
	else if (VT == MVT::f16 && Subtarget->hasFullFP16())
	IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 \|\| Imm.isPosZero();
	// TODO: fmov h0, w0 is also legal, however on't have an isel pattern to
	// generate that fmov.

	// If we can not materialize in immediate field for fmov, check if the
	// value can be encoded as the immediate operand of a logical instruction.
	// The immediate value will be created with either MOVZ, MOVN, or ORR.
	if (!IsLegal && (VT == MVT::f64 \|\| VT == MVT::f32)) {
	// The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
	// however the mov+fmov sequence is always better because of the reduced
	// cache pressure. The timings are still the same if you consider
	// movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
	// movw+movk is fused). So we limit up to 2 instrdduction at most.
	SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
	AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
	Insn);
	unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
	IsLegal = Insn.size() <= Limit;
	}

	LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
	<< " imm value: "; Imm.dump(););
	return IsLegal;
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Optimization Hooks
	//===----------------------------------------------------------------------===//

	/// Build the initial estimate node (Opcode applied to Operand) used by the
	/// square-root and reciprocal estimate hooks.
	///
	/// Returns an empty SDValue unless the subtarget has NEON and the operand
	/// type is one of f64, v1f64, v2f64, f32, v1f32, v2f32 or v4f32. When
	/// ExtraSteps arrives as ReciprocalEstimate::Unspecified it is replaced by
	/// the default refinement count for the element type.
	static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
	                           SDValue Operand, SelectionDAG &DAG,
	                           int &ExtraSteps) {
	  EVT OperandVT = Operand.getValueType();

	  if (!ST->hasNEON())
	    return SDValue();

	  const bool IsSupportedType =
	      OperandVT == MVT::f64 || OperandVT == MVT::v1f64 ||
	      OperandVT == MVT::v2f64 || OperandVT == MVT::f32 ||
	      OperandVT == MVT::v1f32 || OperandVT == MVT::v2f32 ||
	      OperandVT == MVT::v4f32;
	  if (!IsSupportedType)
	    return SDValue();

	  if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
	    // For the reciprocal estimates, convergence is quadratic, so the number
	    // of digits is doubled after each iteration. In ARMv8, the accuracy of
	    // the initial estimate is 2^-8. Thus the number of extra steps to refine
	    // the result for float (23 mantissa bits) is 2 and for double (52
	    // mantissa bits) is 3.
	    const bool IsDouble = OperandVT.getScalarType() == MVT::f64;
	    ExtraSteps = IsDouble ? 3 : 2;
	  }

	  return DAG.getNode(Opcode, SDLoc(Operand), OperandVT, Operand);
	}

	/// Produce an estimate of sqrt(Operand), or of 1/sqrt(Operand) when
	/// Reciprocal is set, refined with ExtraSteps Newton iterations.
	///
	/// The estimate is only attempted when Enabled is
	/// ReciprocalEstimate::Enabled, or when it is Unspecified and the subtarget
	/// prefers rsqrt. Returns an empty SDValue otherwise, or when getEstimate
	/// rejects the operand. On success ExtraSteps is reset to 0 because all
	/// refinement has already been emitted here. UseOneConst is not touched.
	SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
	                                               SelectionDAG &DAG, int Enabled,
	                                               int &ExtraSteps,
	                                               bool &UseOneConst,
	                                               bool Reciprocal) const {
	  const bool WantEstimate =
	      Enabled == ReciprocalEstimate::Enabled ||
	      (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt());
	  if (!WantEstimate)
	    return SDValue();

	  SDValue Estimate =
	      getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, DAG, ExtraSteps);
	  if (!Estimate)
	    return SDValue();

	  SDLoc DL(Operand);
	  EVT VT = Operand.getValueType();

	  SDNodeFlags Flags;
	  Flags.setAllowReassociation(true);

	  // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
	  // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
	  for (int Iter = 0; Iter < ExtraSteps; ++Iter) {
	    SDValue Refine =
	        DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, Flags);
	    Refine = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Refine, Flags);
	    Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Refine, Flags);
	  }

	  if (!Reciprocal) {
	    // sqrt(X) is obtained as X * (1/sqrt(X)).
	    EVT CCVT =
	        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
	    SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
	    SDValue IsZero = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);

	    Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
	    // Correct the result if the operand is 0.0: select the operand itself.
	    const unsigned SelectOpc = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
	    Estimate = DAG.getNode(SelectOpc, DL, VT, IsZero, Operand, Estimate);
	  }

	  ExtraSteps = 0;
	  return Estimate;
	}

	/// Produce an estimate of 1/Operand refined with ExtraSteps Newton
	/// iterations.
	///
	/// Only attempted when Enabled is exactly ReciprocalEstimate::Enabled.
	/// Returns an empty SDValue otherwise, or when getEstimate rejects the
	/// operand. On success ExtraSteps is reset to 0 because all refinement has
	/// already been emitted here.
	SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
	                                                SelectionDAG &DAG, int Enabled,
	                                                int &ExtraSteps) const {
	  if (Enabled != ReciprocalEstimate::Enabled)
	    return SDValue();

	  SDValue Estimate =
	      getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, DAG, ExtraSteps);
	  if (!Estimate)
	    return SDValue();

	  SDLoc DL(Operand);
	  EVT VT = Operand.getValueType();

	  SDNodeFlags Flags;
	  Flags.setAllowReassociation(true);

	  // Newton reciprocal iteration: E * (2 - X * E)
	  // AArch64 reciprocal iteration instruction: (2 - M * N)
	  for (int Iter = 0; Iter < ExtraSteps; ++Iter) {
	    SDValue Refine =
	        DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, Estimate, Flags);
	    Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Refine, Flags);
	  }

	  ExtraSteps = 0;
	  return Estimate;
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Inline Assembly Support
	//===----------------------------------------------------------------------===//

	// Table of Constraints
	// TODO: This is the current set of constraints supported by ARM for the
	// compiler, not all of them may make sense.
	//
	// r - A general register
	// w - An FP/SIMD register of some size in the range v0-v31
	// x - An FP/SIMD register of some size in the range v0-v15
	// y - A scalable-vector register from the ZPR_3b register class
	// I - Constant that can be used with an ADD instruction
	// J - Constant that can be used with a SUB instruction
	// K - Constant that can be used with a 32-bit logical instruction
	// L - Constant that can be used with a 64-bit logical instruction
	// M - Constant that can be used as a 32-bit MOV immediate
	// N - Constant that can be used as a 64-bit MOV immediate
	// Q - A memory reference with base register and no offset
	// S - A symbolic address
	// Y - Floating point constant zero
	// Z - Integer constant zero
	// z - The zero register (xzr or wzr); the operand must be constant zero
	// Upl - A predicate register from the restricted PPR_3b register class
	// Upa - A predicate register from the PPR register class
	//
	// Note that general register operands will be output using their 64-bit x
	// register name, whatever the size of the variable, unless the asm operand
	// is prefixed by the %w modifier. Floating-point and SIMD register operands
	// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
	// %q modifier.
	/// Pick the concrete constraint ("r" or "w") that an "X" constraint is
	/// lowered to for a value of type ConstraintVT.
	///
	/// "w" is chosen only when the subtarget has FP-ARMv8 and the type is either
	/// floating point or a 64-bit/128-bit vector; everything else gets "r".
	const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
	  // At this point, we have to lower this constraint to something else, so we
	  // lower it to an "r" or "w". However, by doing this we will force the result
	  // to be in register, while the X constraint is much more permissive.
	  //
	  // Although we are correct (we are free to emit anything, without
	  // constraints), we might break use cases that would expect us to be more
	  // efficient and emit something else.
	  if (!Subtarget->hasFPARMv8())
	    return "r";

	  const bool WantsFPRegister =
	      ConstraintVT.isFloatingPoint() ||
	      (ConstraintVT.isVector() && (ConstraintVT.getSizeInBits() == 64 ||
	                                   ConstraintVT.getSizeInBits() == 128));
	  return WantsFPRegister ? "w" : "r";
	}

	/// Multi-letter predicate register constraints recognised by
	/// parsePredicateConstraint.
	enum PredicateConstraint {
	  Upl,    // "Upl": the restricted predicate class (PPR_3b).
	  Upa,    // "Upa": the full predicate class (PPR).
	  Invalid // Any other constraint string.
	};

	/// Translate a constraint string into a PredicateConstraint.
	///
	/// Recognises exactly "Upa" and "Upl"; every other string maps to
	/// PredicateConstraint::Invalid.
	static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
	  if (Constraint == "Upl")
	    return PredicateConstraint::Upl;
	  if (Constraint == "Upa")
	    return PredicateConstraint::Upa;
	  return PredicateConstraint::Invalid;
	}

	/// getConstraintType - Classify an inline-asm constraint string for this
	/// target.
	///
	/// Single-letter constraints are classified by the switch below; the
	/// multi-letter predicate constraints are register classes. Anything not
	/// handled here is deferred to TargetLowering::getConstraintType.
	AArch64TargetLowering::ConstraintType
	AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
	  if (Constraint.size() != 1) {
	    if (parsePredicateConstraint(Constraint) != PredicateConstraint::Invalid)
	      return C_RegisterClass;
	    return TargetLowering::getConstraintType(Constraint);
	  }

	  switch (Constraint[0]) {
	  case 'x':
	  case 'w':
	  case 'y':
	    return C_RegisterClass;

	  // An address with a single base register. Due to the way we
	  // currently handle addresses it is the same as 'r'.
	  case 'Q':
	    return C_Memory;

	  case 'I':
	  case 'J':
	  case 'K':
	  case 'L':
	  case 'M':
	  case 'N':
	  case 'Y':
	  case 'Z':
	    return C_Immediate;

	  case 'z':
	  case 'S': // A symbolic address
	    return C_Other;

	  default:
	    return TargetLowering::getConstraintType(Constraint);
	  }
	}

	/// Examine constraint type and operand type and determine a weight value.
	/// This object must already have been set up with the operand type
	/// and the current alternative constraint selected.
	///
	/// Returns CW_Default when no operand value is available, CW_Invalid when a
	/// target-specific constraint does not fit the operand, and otherwise the
	/// weight chosen in the switch below (or by the base class for constraints
	/// not handled here).
	TargetLowering::ConstraintWeight
	AArch64TargetLowering::getSingleConstraintMatchWeight(
	    AsmOperandInfo &info, const char *constraint) const {
	  // If we don't have a value, we can't do a match,
	  // but allow it at the lowest weight.
	  Value *Operand = info.CallOperandVal;
	  if (!Operand)
	    return CW_Default;

	  Type *OperandTy = Operand->getType();

	  // Look at the first letter of the constraint.
	  switch (*constraint) {
	  case 'x':
	  case 'w':
	  case 'y': {
	    const bool FitsFPRegister =
	        OperandTy->isFloatingPointTy() || OperandTy->isVectorTy();
	    return FitsFPRegister ? CW_Register : CW_Invalid;
	  }
	  case 'z':
	    return CW_Constant;
	  case 'U': {
	    const bool IsPredicate =
	        parsePredicateConstraint(constraint) != PredicateConstraint::Invalid;
	    return IsPredicate ? CW_Register : CW_Invalid;
	  }
	  default:
	    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
	  }
	}

	/// Map an inline-asm constraint string and value type to a
	/// (register, register class) pair.
	///
	/// Resolution order, as implemented below:
	///  1. Single-letter constraints 'r', 'w', 'x', 'y' and the multi-letter
	///     predicate constraints pick a register class (register number 0).
	///  2. "{cc}" (case-insensitive) maps to NZCV.
	///  3. Otherwise the generic TargetLowering implementation is consulted.
	///  4. If that finds nothing, "{vN}" with N in 0..31 is resolved by hand.
	/// Finally, on subtargets without FP-ARMv8 any result whose class is not
	/// covered by the GPR32all/GPR64all classes is rejected.
	std::pair<unsigned, const TargetRegisterClass *>
	AArch64TargetLowering::getRegForInlineAsmConstraint(
	const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
	if (Constraint.size() == 1) {
	switch (Constraint[0]) {
	case 'r':
	// General register: 64-bit class for 64-bit values, 32-bit class otherwise.
	if (VT.getSizeInBits() == 64)
	return std::make_pair(0U, &AArch64::GPR64commonRegClass);
	return std::make_pair(0U, &AArch64::GPR32commonRegClass);
	case 'w':
	// Without FP-ARMv8 leave the switch and fall back to the generic handling.
	if (!Subtarget->hasFPARMv8())
	break;
	if (VT.isScalableVector())
	return std::make_pair(0U, &AArch64::ZPRRegClass);
	// Fixed-size values: choose the FPR class matching the value width.
	if (VT.getSizeInBits() == 16)
	return std::make_pair(0U, &AArch64::FPR16RegClass);
	if (VT.getSizeInBits() == 32)
	return std::make_pair(0U, &AArch64::FPR32RegClass);
	if (VT.getSizeInBits() == 64)
	return std::make_pair(0U, &AArch64::FPR64RegClass);
	if (VT.getSizeInBits() == 128)
	return std::make_pair(0U, &AArch64::FPR128RegClass);
	break;
	// The instructions that this constraint is designed for can
	// only take 128-bit registers so just use that regclass.
	case 'x':
	if (!Subtarget->hasFPARMv8())
	break;
	if (VT.isScalableVector())
	return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
	if (VT.getSizeInBits() == 128)
	return std::make_pair(0U, &AArch64::FPR128_loRegClass);
	break;
	case 'y':
	// Only scalable vectors are handled for 'y'.
	if (!Subtarget->hasFPARMv8())
	break;
	if (VT.isScalableVector())
	return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
	break;
	}
	} else {
	// Multi-letter constraints: "Upl" selects the restricted predicate class,
	// "Upa" the full one.
	PredicateConstraint PC = parsePredicateConstraint(Constraint);
	if (PC != PredicateConstraint::Invalid) {
	assert(VT.isScalableVector());
	bool restricted = (PC == PredicateConstraint::Upl);
	return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
	: std::make_pair(0U, &AArch64::PPRRegClass);
	}
	}
	// The condition-code register is spelled "{cc}", compared case-insensitively.
	if (StringRef("{cc}").equals_lower(Constraint))
	return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);

	// Use the default implementation in TargetLowering to convert the register
	// constraint into a member of a register class.
	std::pair<unsigned, const TargetRegisterClass *> Res;
	Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

	// Not found as a standard register?
	if (!Res.second) {
	// Accept "{vN}" / "{VN}" where N is a one- or two-digit decimal number.
	unsigned Size = Constraint.size();
	if ((Size == 4 \|\| Size == 5) && Constraint[0] == '{' &&
	tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
	int RegNo;
	// getAsInteger returns true on failure.
	bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
	if (!Failed && RegNo >= 0 && RegNo <= 31) {
	// v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
	// By default we'll emit v0-v31 for this unless there's a modifier where
	// we'll emit the correct register as well.
	if (VT != MVT::Other && VT.getSizeInBits() == 64) {
	Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
	Res.second = &AArch64::FPR64RegClass;
	} else {
	Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
	Res.second = &AArch64::FPR128RegClass;
	}
	}
	}
	}

	// Without FP-ARMv8 only results in the general-purpose register classes are
	// allowed; anything else is reported as "no register".
	if (Res.second && !Subtarget->hasFPARMv8() &&
	!AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
	!AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
	return std::make_pair(0U, nullptr);

	return Res;
	}

	/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
	/// vector. If it is invalid, don't add anything to Ops.
	///
	/// Handles the single-letter constraints 'z', 'S' and the immediate
	/// constraints 'I', 'J', 'K', 'L', 'M', 'N'. A constraint handled here whose
	/// operand fails validation returns without touching Ops. Single-letter
	/// constraints not handled here are forwarded to the TargetLowering
	/// implementation; constraints of any other length are ignored.
	void AArch64TargetLowering::LowerAsmOperandForConstraint(
	SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
	SelectionDAG &DAG) const {
	SDValue Result;

	// Currently only support length 1 constraints.
	if (Constraint.length() != 1)
	return;

	char ConstraintLetter = Constraint[0];
	switch (ConstraintLetter) {
	default:
	// Leaves Result empty so the generic implementation is used below.
	break;

	// This set of constraints deal with valid constants for various instructions.
	// Validate and return a target constant for them if we can.
	case 'z': {
	// 'z' maps to xzr or wzr so it needs an input of 0.
	if (!isNullConstant(Op))
	return;

	// Pick the zero register by operand type: XZR for i64, WZR otherwise.
	if (Op.getValueType() == MVT::i64)
	Result = DAG.getRegister(AArch64::XZR, MVT::i64);
	else
	Result = DAG.getRegister(AArch64::WZR, MVT::i32);
	break;
	}
	case 'S': {
	// An absolute symbolic address or label reference.
	// Accepted operand kinds: global address, block address, external symbol.
	if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
	Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
	GA->getValueType(0));
	} else if (const BlockAddressSDNode *BA =
	dyn_cast<BlockAddressSDNode>(Op)) {
	Result =
	DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
	} else if (const ExternalSymbolSDNode *ES =
	dyn_cast<ExternalSymbolSDNode>(Op)) {
	Result =
	DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0));
	} else
	return;
	break;
	}

	case 'I':
	case 'J':
	case 'K':
	case 'L':
	case 'M':
	case 'N':
	// All immediate constraints require a constant operand.
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
	if (!C)
	return;

	// Grab the value and do some validation.
	// In the inner switch, "break" means the value is accepted and "return"
	// means it is rejected.
	uint64_t CVal = C->getZExtValue();
	switch (ConstraintLetter) {
	// The I constraint applies only to simple ADD or SUB immediate operands:
	// i.e. 0 to 4095 with optional shift by 12
	// The J constraint applies only to ADD or SUB immediates that would be
	// valid when negated, i.e. if [an add pattern] were to be output as a SUB
	// instruction [or vice versa], in other words -1 to -4095 with optional
	// left shift by 12.
	case 'I':
	if (isUInt<12>(CVal) \|\| isShiftedUInt<12, 12>(CVal))
	break;
	return;
	case 'J': {
	// Validate the negated value, but emit the original sign-extended value.
	uint64_t NVal = -C->getSExtValue();
	if (isUInt<12>(NVal) \|\| isShiftedUInt<12, 12>(NVal)) {
	CVal = C->getSExtValue();
	break;
	}
	return;
	}
	// The K and L constraints apply only to logical immediates, including
	// what used to be the MOVI alias for ORR (though the MOVI alias has now
	// been removed and MOV should be used). So these constraints have to
	// distinguish between bit patterns that are valid 32-bit or 64-bit
	// "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
	// not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
	// versa.
	case 'K':
	if (AArch64_AM::isLogicalImmediate(CVal, 32))
	break;
	return;
	case 'L':
	if (AArch64_AM::isLogicalImmediate(CVal, 64))
	break;
	return;
	// The M and N constraints are a superset of K and L respectively, for use
	// with the MOV (immediate) alias. As well as the logical immediates they
	// also match 32 or 64-bit immediates that can be loaded either using a
	// single MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
	// (M) or 64-bit 0x1234000000000000 (N) etc.
	// As a note some of this code is liberally stolen from the asm parser.
	case 'M': {
	// Must fit in 32 bits; then accept a logical immediate, or a value (or its
	// 32-bit complement) confined to a single 16-bit chunk.
	if (!isUInt<32>(CVal))
	return;
	if (AArch64_AM::isLogicalImmediate(CVal, 32))
	break;
	if ((CVal & 0xFFFF) == CVal)
	break;
	if ((CVal & 0xFFFF0000ULL) == CVal)
	break;
	uint64_t NCVal = ~(uint32_t)CVal;
	if ((NCVal & 0xFFFFULL) == NCVal)
	break;
	if ((NCVal & 0xFFFF0000ULL) == NCVal)
	break;
	return;
	}
	case 'N': {
	// Accept a 64-bit logical immediate, or a value (or its complement)
	// confined to a single 16-bit chunk.
	if (AArch64_AM::isLogicalImmediate(CVal, 64))
	break;
	if ((CVal & 0xFFFFULL) == CVal)
	break;
	if ((CVal & 0xFFFF0000ULL) == CVal)
	break;
	if ((CVal & 0xFFFF00000000ULL) == CVal)
	break;
	if ((CVal & 0xFFFF000000000000ULL) == CVal)
	break;
	uint64_t NCVal = ~CVal;
	if ((NCVal & 0xFFFFULL) == NCVal)
	break;
	if ((NCVal & 0xFFFF0000ULL) == NCVal)
	break;
	if ((NCVal & 0xFFFF00000000ULL) == NCVal)
	break;
	if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
	break;
	return;
	}
	default:
	return;
	}

	// All assembler immediates are 64-bit integers.
	Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
	break;
	}

	// A non-empty Result means one of the cases above produced the operand.
	if (Result.getNode()) {
	Ops.push_back(Result);
	return;
	}

	return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Advanced SIMD Support
	//===----------------------------------------------------------------------===//

	/// WidenVector - Given a value in the V64 register class, produce the
	/// equivalent value in the V128 register class.
	///
	/// The result type has the same element type and twice as many elements;
	/// the input is inserted at index 0 of an undef vector of that type.
	static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
	  SDLoc DL(V64Reg);
	  EVT NarrowVT = V64Reg.getValueType();
	  const MVT ElemVT = NarrowVT.getVectorElementType().getSimpleVT();
	  const unsigned NarrowCount = NarrowVT.getVectorNumElements();
	  const MVT WideVT = MVT::getVectorVT(ElemVT, 2 * NarrowCount);

	  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, DAG.getUNDEF(WideVT),
	                     V64Reg, DAG.getConstant(0, DL, MVT::i32));
	}

	/// getExtFactor - Determine the adjustment factor for the position when
	/// generating an "extract from vector registers" instruction.
	///
	/// The factor is the size of one element of V's vector type in bytes.
	static unsigned getExtFactor(SDValue &V) {
	  EVT VecVT = V.getValueType();
	  return VecVT.getVectorElementType().getSizeInBits() / 8;
	}

	/// NarrowVector - Given a value in the V128 register class, produce the
	/// equivalent value in the V64 register class.
	///
	/// The result type has the same element type and half as many elements; it
	/// is obtained by extracting the dsub subregister of the input.
	static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
	  SDLoc DL(V128Reg);
	  EVT WideVT = V128Reg.getValueType();
	  const MVT ElemVT = WideVT.getVectorElementType().getSimpleVT();
	  const unsigned WideCount = WideVT.getVectorNumElements();
	  const MVT HalfVT = MVT::getVectorVT(ElemVT, WideCount / 2);

	  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, HalfVT, V128Reg);
	}

	// Gather data to see if the operation can be modelled as a
	// shuffle in combination with VEXTs.
	SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
	SelectionDAG &DAG) const {
	assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
	LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
	SDLoc dl(Op);
	EVT VT = Op.getValueType();
	unsigned NumElts = VT.getVectorNumElements();

	struct ShuffleSourceInfo {
	SDValue Vec;
	unsigned MinElt;
	unsigned MaxElt;

	// We may insert some combination of BITCASTs and VEXT nodes to force Vec to
	// be compatible with the shuffle we intend to construct. As a result
	// ShuffleVec will be some sliding window into the original Vec.
	SDValue ShuffleVec;

	// Code should guarantee that element i in Vec starts at element "WindowBase
	// + i * WindowScale in ShuffleVec".
	int WindowBase;
	int WindowScale;

	ShuffleSourceInfo(SDValue Vec)
	: Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
	ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}

	bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
	};

	// First gather all vectors used as an immediate source for this BUILD_VECTOR
	// node.
	SmallVector<ShuffleSourceInfo, 2> Sources;
	for (unsigned i = 0; i < NumElts; ++i) {
	SDValue V = Op.getOperand(i);
	if (V.isUndef())
	continue;
	else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	!isa<ConstantSDNode>(V.getOperand(1))) {
	LLVM_DEBUG(
	dbgs() << "Reshuffle failed: "
	"a shuffle can only come from building a vector from "
	"various elements of other vectors, provided their "
	"indices are constant\n");
	return SDValue();
	}

	// Add this element source to the list if it's not already there.
	SDValue SourceVec = V.getOperand(0);
	auto Source = find(Sources, SourceVec);
	if (Source == Sources.end())
	Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

	// Update the minimum and maximum lane number seen.
	unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
	Source->MinElt = std::min(Source->MinElt, EltNo);
	Source->MaxElt = std::max(Source->MaxElt, EltNo);
	}

	if (Sources.size() > 2) {
	LLVM_DEBUG(
	dbgs() << "Reshuffle failed: currently only do something sane when at "
	"most two source vectors are involved\n");
	return SDValue();
	}

	// Find out the smallest element size among result and two sources, and use
	// it as element size to build the shuffle_vector.
	EVT SmallestEltTy = VT.getVectorElementType();
	for (auto &Source : Sources) {
	EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
	if (SrcEltTy.bitsLT(SmallestEltTy)) {
	SmallestEltTy = SrcEltTy;
	}
	}
	unsigned ResMultiplier =
	VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
	NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
	EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

	// If the source vector is too wide or too narrow, we may nevertheless be able
	// to construct a compatible shuffle either by concatenating it with UNDEF or
	// extracting a suitable range of elements.
	for (auto &Src : Sources) {
	EVT SrcVT = Src.ShuffleVec.getValueType();

	if (SrcVT.getSizeInBits() == VT.getSizeInBits())
	continue;

	// This stage of the search produces a source with the same element type as
	// the original, but with a total width matching the BUILD_VECTOR output.
	EVT EltVT = SrcVT.getVectorElementType();
	unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
	EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

	if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
	assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
	// We can pad out the smaller vector for free, so if it's part of a
	// shuffle...
	Src.ShuffleVec =
	DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
	DAG.getUNDEF(Src.ShuffleVec.getValueType()));
	continue;
	}

	assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());

	if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
	LLVM_DEBUG(
	dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
	return SDValue();
	}

	if (Src.MinElt >= NumSrcElts) {
	// The extraction can just take the second half
	Src.ShuffleVec =
	DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	DAG.getConstant(NumSrcElts, dl, MVT::i64));
	Src.WindowBase = -NumSrcElts;
	} else if (Src.MaxElt < NumSrcElts) {
	// The extraction can just take the first half
	Src.ShuffleVec =
	DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	DAG.getConstant(0, dl, MVT::i64));
	} else {
	// An actual VEXT is needed
	SDValue VEXTSrc1 =
	DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	DAG.getConstant(0, dl, MVT::i64));
	SDValue VEXTSrc2 =
	DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	DAG.getConstant(NumSrcElts, dl, MVT::i64));
	unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);

	Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
	VEXTSrc2,
	DAG.getConstant(Imm, dl, MVT::i32));
	Src.WindowBase = -Src.MinElt;
	}
	}

	// Another possible incompatibility occurs from the vector element types. We
	// can fix this by bitcasting the source vectors to the same type we intend
	// for the shuffle.
	for (auto &Src : Sources) {
	EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
	if (SrcEltTy == SmallestEltTy)
	continue;
	assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
	Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
	Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
	Src.WindowBase *= Src.WindowScale;
	}

	// Final sanity check before we try to actually produce a shuffle.
	LLVM_DEBUG(for (auto Src
	: Sources)
	assert(Src.ShuffleVec.getValueType() == ShuffleVT););

	// The stars all align, our next step is to produce the mask for the shuffle.
	SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
	int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
	for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
	SDValue Entry = Op.getOperand(i);
	if (Entry.isUndef())
	continue;

	auto Src = find(Sources, Entry.getOperand(0));
	int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

	// EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
	// trunc. So only std::min(SrcBits, DestBits) actually get defined in this
	// segment.
	EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
	int BitsDefined =
	std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
	int LanesDefined = BitsDefined / BitsPerShuffleLane;

	// This source is expected to fill ResMultiplier lanes of the final shuffle,
	// starting at the appropriate offset.
	int LaneMask = &Mask[i ResMultiplier];

	int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
	ExtractBase += NumElts * (Src - Sources.begin());
	for (int j = 0; j < LanesDefined; ++j)
	LaneMask[j] = ExtractBase + j;
	}

	// Final check before we try to produce nonsense...
	if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
	LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
	return SDValue();
	}

	SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
	for (unsigned i = 0; i < Sources.size(); ++i)
	ShuffleOps[i] = Sources[i].ShuffleVec;

	SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
	ShuffleOps[1], Mask);
	SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);

	LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
	dbgs() << "Reshuffle, creating node: "; V.dump(););

	return V;
	}

	/// Decide whether a shuffle whose two sources are the same vector is a
	/// rotation that a single EXT instruction can perform.
	///
	/// \param M   Shuffle mask; negative entries are UNDEF.
	/// \param VT  Vector type being shuffled; supplies the lane count.
	/// \param Imm Receives M[0] (the lane the rotation starts at) whenever M[0] is
	///            defined, even if a later lane then fails the check.
	/// \returns true when every defined lane continues the rotation begun at M[0].
	static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
	  const unsigned LaneCount = VT.getVectorNumElements();

	  // The rotation amount is read from lane 0, so lane 0 must be defined.
	  const int Head = M[0];
	  if (Head < 0)
	    return false;

	  Imm = Head;

	  // Walk the remaining lanes; the expected index advances by one per lane and
	  // folds back to zero when it lands exactly on the lane count.
	  for (unsigned Pos = 1, Want = Imm; Pos < LaneCount; ++Pos) {
	    Want = (Want + 1 == LaneCount) ? 0 : Want + 1;

	    const int Lane = M[Pos];
	    // UNDEF lanes match anything.
	    if (Lane >= 0 && static_cast<unsigned>(Lane) != Want)
	      return false;
	  }

	  return true;
	}

	/// Check if an EXT instruction can handle the shuffle mask when the vector
	/// sources of the shuffle are different.
	///
	/// \param M          Shuffle mask; -1 entries are UNDEF.
	/// \param VT         Vector type being shuffled; supplies the lane count.
	/// \param ReverseEXT Set to true when the two inputs must be swapped. It is
	///                   never cleared here, so the caller must initialise it.
	/// \param Imm        Receives the EXT start index (in elements) on success.
	/// \returns true if the defined lanes form one consecutive run modulo
	///          2 * NumElts; false otherwise, including for an all-UNDEF mask.
	static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
	                      unsigned &Imm) {
	  // Look for the first non-undef element.
	  const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });

	  // An all-UNDEF mask has no anchor element. Bail out rather than dereference
	  // the end iterator below.
	  if (FirstRealElt == M.end())
	    return false;

	  // Benefit from APInt to handle overflow when calculating expected element.
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
	  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
	  // The following shuffle indices must be the successive elements after the
	  // first real element. ExpectedElt advances on every lane, UNDEF or not.
	  const int *FirstWrongElt = std::find_if(
	      FirstRealElt + 1, M.end(),
	      [&](int Elt) { return Elt != ExpectedElt++ && Elt != -1; });
	  if (FirstWrongElt != M.end())
	    return false;

	  // The index of an EXT is the first element if it is not UNDEF.
	  // Watch out for the beginning UNDEFs. The EXT index should be the expected
	  // value of the first element. E.g.
	  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
	  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
	  // ExpectedElt is the last mask index plus 1.
	  Imm = ExpectedElt.getZExtValue();

	  // There are two different cases requiring to reverse input vectors.
	  // For example, for vector <4 x i32> we have the following cases,
	  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
	  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
	  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
	  // to reverse two input vectors.
	  if (Imm < NumElts)
	    ReverseEXT = true;
	  else
	    Imm -= NumElts;

	  return true;
	}

	/// isREVMask - Check if a vector shuffle corresponds to a REV
	/// instruction with the specified blocksize. (The order of the elements
	/// within each block of the vector is reversed.)
	///
	/// \param M         Shuffle mask; negative entries are UNDEF.
	/// \param VT        Vector type being shuffled.
	/// \param BlockSize Block width in bits; must be 16, 32 or 64.
	/// \returns true if every defined lane is the mirror image of its position
	///          inside its block. Always false for 64-bit elements.
	static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
	  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
	         "Only possible block sizes for REV are: 16, 32, 64");

	  unsigned EltSz = VT.getScalarSizeInBits();
	  if (EltSz == 64)
	    return false;

	  unsigned NumElts = VT.getVectorNumElements();
	  // In a reversal, lane 0 holds the last index of the first block, so
	  // M[0] + 1 is the number of elements per block.
	  unsigned BlockElts = M[0] + 1;
	  // If the first shuffle index is UNDEF, be optimistic.
	  if (M[0] < 0)
	    BlockElts = BlockSize / EltSz;

	  // The block must hold more than one element, and the block size implied by
	  // the mask must be the one requested.
	  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
	    return false;

	  for (unsigned i = 0; i < NumElts; ++i) {
	    if (M[i] < 0)
	      continue; // ignore UNDEF indices
	    // (i - i % BlockElts) is the first lane of i's block; the second term is
	    // the mirrored offset inside that block.
	    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
	      return false;
	  }

	  return true;
	}

	/// Check whether a two-source shuffle mask interleaves matching lanes of the
	/// two inputs, e.g. <0, 4, 1, 5> for a four-element vector.
	///
	/// \param M           Shuffle mask; negative entries are UNDEF.
	/// \param VT          Vector type being shuffled; supplies the lane count.
	/// \param WhichResult Set to 0 when M[0] == 0 and to 1 otherwise (an UNDEF
	///                    first lane therefore selects 1). Written even when the
	///                    function returns false, unless the lane count is odd.
	/// \returns true if all defined lanes follow the interleave pattern starting
	///          at lane WhichResult * NumElts / 2.
	static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  if (NumElts % 2 != 0)
	    return false;
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  unsigned Idx = WhichResult * NumElts / 2;
	  // Each pair of output lanes takes lane Idx of the first input followed by
	  // lane Idx of the second input (numbered Idx + NumElts).
	  for (unsigned i = 0; i != NumElts; i += 2) {
	    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
	      return false;
	    Idx += 1;
	  }

	  return true;
	}

	/// Check whether a two-source shuffle mask picks every second lane of the
	/// concatenated inputs, e.g. <0, 2, 4, 6> or <1, 3, 5, 7>.
	///
	/// \param M           Shuffle mask; negative entries are UNDEF.
	/// \param VT          Vector type being shuffled; supplies the lane count.
	/// \param WhichResult Set to 0 when M[0] == 0 and to 1 otherwise; written
	///                    regardless of the return value.
	/// \returns true if every defined lane i equals 2 * i + WhichResult.
	static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  const unsigned LaneCount = VT.getVectorNumElements();
	  WhichResult = M[0] == 0 ? 0 : 1;

	  for (unsigned Pos = 0; Pos != LaneCount; ++Pos) {
	    const int Lane = M[Pos];
	    // UNDEF lanes are compatible with any pattern.
	    if (Lane >= 0 && static_cast<unsigned>(Lane) != 2 * Pos + WhichResult)
	      return false;
	  }

	  return true;
	}

	/// Check whether a two-source shuffle mask pairs lane i + WhichResult of the
	/// first input with the same lane of the second input, e.g. <0, 4, 2, 6>.
	///
	/// \param M           Shuffle mask; negative entries are UNDEF.
	/// \param VT          Vector type being shuffled; supplies the lane count.
	/// \param WhichResult Set to 0 when M[0] == 0 and to 1 otherwise; left
	///                    untouched only when the lane count is odd.
	/// \returns true if all defined lanes follow the transpose pattern.
	static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  if (NumElts % 2 != 0)
	    return false;
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  for (unsigned i = 0; i < NumElts; i += 2) {
	    // The even output lane reads the first input, the odd one the second
	    // input (indices offset by NumElts).
	    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
	      return false;
	  }
	  return true;
	}

	/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
	/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
	/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
	///
	/// \param M           Shuffle mask; negative entries are UNDEF.
	/// \param VT          Vector type being shuffled; supplies the lane count.
	/// \param WhichResult Set to 0 when M[0] == 0 and to 1 otherwise; left
	///                    untouched only when the lane count is odd.
	/// \returns true if each defined pair of lanes both name the same source lane,
	///          counting up from WhichResult * NumElts / 2.
	static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  if (NumElts % 2 != 0)
	    return false;
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  unsigned Idx = WhichResult * NumElts / 2;
	  for (unsigned i = 0; i != NumElts; i += 2) {
	    // Both lanes of the pair refer to lane Idx of the single source.
	    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
	      return false;
	    Idx += 1;
	  }

	  return true;
	}

	/// isUZP_v_undef_Mask - single-source form of isUZPMask, matching the
	/// canonical "vector_shuffle v, undef" spelling of "vector_shuffle v, v".
	/// The mask looks like <0, 2, 0, 2> rather than <0, 2, 4, 6>.
	///
	/// \param M           Shuffle mask; negative entries are UNDEF.
	/// \param VT          Vector type being shuffled; supplies the lane count.
	/// \param WhichResult Set to 0 when M[0] == 0 and to 1 otherwise; written
	///                    regardless of the return value.
	/// \returns true if, in each half of the mask, every defined lane i equals
	///          2 * i + WhichResult.
	static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  const unsigned HalfLanes = VT.getVectorNumElements() / 2;
	  WhichResult = M[0] == 0 ? 0 : 1;

	  // Both halves of the mask repeat the same sequence, so the expected index
	  // depends only on the position within the half.
	  for (unsigned Pos = 0, End = 2 * HalfLanes; Pos != End; ++Pos) {
	    const int Lane = M[Pos];
	    const unsigned Want = WhichResult + 2 * (Pos % HalfLanes);
	    if (Lane >= 0 && static_cast<unsigned>(Lane) != Want)
	      return false;
	  }

	  return true;
	}

	/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
	/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
	/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
	///
	/// \param M           Shuffle mask; negative entries are UNDEF.
	/// \param VT          Vector type being shuffled; supplies the lane count.
	/// \param WhichResult Set to 0 when M[0] == 0 and to 1 otherwise; left
	///                    untouched only when the lane count is odd.
	/// \returns true if both lanes of every defined pair equal i + WhichResult,
	///          where i is the even lane's position.
	static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  if (NumElts % 2 != 0)
	    return false;
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  for (unsigned i = 0; i < NumElts; i += 2) {
	    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
	      return false;
	  }
	  return true;
	}

	/// Check whether a shuffle mask is an identity copy of one input with exactly
	/// one lane replaced, i.e. something a single lane insert can implement.
	///
	/// \param M                Shuffle mask; -1 entries are UNDEF and match both
	///                         inputs.
	/// \param NumInputElements Lane count of each input; must equal M.size().
	/// \param DstIsLeft        On success, true if the first input is the one
	///                         being copied, false if it is the second.
	/// \param Anomaly          On success, the position of the single lane that
	///                         does not come from the copied input.
	/// \returns true if exactly one lane deviates from the identity of either
	///          input; the first input is preferred when both qualify.
	static bool isINSMask(ArrayRef<int> M, int NumInputElements,
	                      bool &DstIsLeft, int &Anomaly) {
	  if (M.size() != static_cast<size_t>(NumInputElements))
	    return false;

	  // Count lanes that break the identity pattern for each input and remember
	  // where the most recent break was.
	  int LeftMisses = 0;
	  int RightMisses = 0;
	  int LeftOddLane = -1;
	  int RightOddLane = -1;

	  for (int Pos = 0; Pos < NumInputElements; ++Pos) {
	    const int Lane = M[Pos];
	    if (Lane == -1)
	      continue; // UNDEF is compatible with both inputs.

	    if (Lane != Pos) {
	      ++LeftMisses;
	      LeftOddLane = Pos;
	    }

	    if (Lane != Pos + NumInputElements) {
	      ++RightMisses;
	      RightOddLane = Pos;
	    }
	  }

	  if (LeftMisses == 1) {
	    DstIsLeft = true;
	    Anomaly = LeftOddLane;
	    return true;
	  }

	  if (RightMisses == 1) {
	    DstIsLeft = false;
	    Anomaly = RightOddLane;
	    return true;
	  }

	  return false;
	}

	/// Check whether a 128-bit shuffle mask describes a concatenation of the low
	/// halves of its inputs.
	///
	/// \param Mask     Shuffle mask; UNDEF lanes are not accepted.
	/// \param VT       Result vector type; anything other than 128 bits fails.
	/// \param SplitLHS When true, the upper half of the mask must be offset by a
	///                 further half vector, i.e. lane I must equal I + NumElts/2.
	/// \returns true if the lower half of the mask is the identity and the upper
	///          half matches the (possibly offset) identity.
	static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
	  if (VT.getSizeInBits() != 128)
	    return false;

	  const int Total = static_cast<int>(VT.getVectorNumElements());
	  const int Half = Total / 2;
	  const int UpperBias = SplitLHS ? Half : 0;

	  for (int Lane = 0; Lane != Total; ++Lane) {
	    // Lower half: plain identity. Upper half: identity plus the optional bias.
	    const int Want = Lane < Half ? Lane : Lane + UpperBias;
	    if (Mask[Lane] != Want)
	      return false;
	  }

	  return true;
	}

	/// Try to rewrite a VECTOR_SHUFFLE as a CONCAT_VECTORS of the low halves of
	/// its two operands.
	///
	/// \param Op  The shuffle node; must be a ShuffleVectorSDNode.
	/// \param DAG DAG used to create the replacement nodes.
	/// \returns the CONCAT_VECTORS node, or an empty SDValue if the operand
	///          element types differ from the result's or the mask is not a
	///          concatenation as defined by isConcatMask.
	static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  SDValue V0 = Op.getOperand(0);
	  SDValue V1 = Op.getOperand(1);
	  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();

	  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
	      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
	    return SDValue();

	  // A 128-bit first operand has to be split; isConcatMask then expects the
	  // upper half of the mask to be offset accordingly.
	  bool SplitV0 = V0.getValueSizeInBits() == 128;

	  if (!isConcatMask(Mask, VT, SplitV0))
	    return SDValue();

	  // Reduce each 128-bit operand to its low half before concatenating.
	  EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
	  if (SplitV0) {
	    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
	                     DAG.getConstant(0, DL, MVT::i64));
	  }
	  if (V1.getValueSizeInBits() == 128) {
	    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
	                     DAG.getConstant(0, DL, MVT::i64));
	  }
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
	}

	/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
	/// the specified operations to build the shuffle.
	///
	/// The entry is decoded as follows: bits [29:26] hold the operation, bits
	/// [25:13] the table index that produces the left operand and bits [12:0] the
	/// table index that produces the right operand. Both operands are generated
	/// recursively until an OP_COPY entry is reached.
	///
	/// \param PFEntry Entry taken from PerfectShuffleTable.
	/// \param LHS     First input of the original shuffle.
	/// \param RHS     Second input of the original shuffle.
	/// \param DAG     DAG used to create nodes.
	/// \param dl      Debug location attached to the created nodes.
	/// \returns the node computing the shuffle described by PFEntry.
	static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
	                                      SDValue RHS, SelectionDAG &DAG,
	                                      const SDLoc &dl) {
	  unsigned OpNum = (PFEntry >> 26) & 0x0F;
	  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
	  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);

	  enum {
	    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
	    OP_VREV,
	    OP_VDUP0,
	    OP_VDUP1,
	    OP_VDUP2,
	    OP_VDUP3,
	    OP_VEXT1,
	    OP_VEXT2,
	    OP_VEXT3,
	    OP_VUZPL, // VUZP, left result
	    OP_VUZPR, // VUZP, right result
	    OP_VZIPL, // VZIP, left result
	    OP_VZIPR, // VZIP, right result
	    OP_VTRNL, // VTRN, left result
	    OP_VTRNR  // VTRN, right result
	  };

	  if (OpNum == OP_COPY) {
	    // The IDs are the base-9 encodings of <0,1,2,3> (the LHS unchanged) and
	    // <4,5,6,7> (the RHS unchanged).
	    if (LHSID == (1 * 9 + 2) * 9 + 3)
	      return LHS;
	    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
	    return RHS;
	  }

	  SDValue OpLHS, OpRHS;
	  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
	  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
	  EVT VT = OpLHS.getValueType();

	  switch (OpNum) {
	  default:
	    llvm_unreachable("Unknown shuffle opcode!");
	  case OP_VREV:
	    // VREV divides the vector in half and swaps within the half.
	    if (VT.getVectorElementType() == MVT::i32 ||
	        VT.getVectorElementType() == MVT::f32)
	      return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
	    // vrev <4 x i16> -> REV32
	    if (VT.getVectorElementType() == MVT::i16 ||
	        VT.getVectorElementType() == MVT::f16 ||
	        VT.getVectorElementType() == MVT::bf16)
	      return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
	    // vrev <4 x i8> -> REV16
	    assert(VT.getVectorElementType() == MVT::i8);
	    return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
	  case OP_VDUP0:
	  case OP_VDUP1:
	  case OP_VDUP2:
	  case OP_VDUP3: {
	    // Pick the DUPLANE flavour that matches the element width.
	    EVT EltTy = VT.getVectorElementType();
	    unsigned Opcode;
	    if (EltTy == MVT::i8)
	      Opcode = AArch64ISD::DUPLANE8;
	    else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
	      Opcode = AArch64ISD::DUPLANE16;
	    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
	      Opcode = AArch64ISD::DUPLANE32;
	    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
	      Opcode = AArch64ISD::DUPLANE64;
	    else
	      llvm_unreachable("Invalid vector element type?");

	    // A 64-bit source is widened before being used as the DUPLANE operand.
	    if (VT.getSizeInBits() == 64)
	      OpLHS = WidenVector(OpLHS, DAG);
	    // The lane number is the distance of the opcode from OP_VDUP0.
	    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
	    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
	  }
	  case OP_VEXT1:
	  case OP_VEXT2:
	  case OP_VEXT3: {
	    // OP_VEXTn extracts starting n elements in; getExtFactor scales that
	    // element count into the immediate the EXT node expects.
	    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
	    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
	                       DAG.getConstant(Imm, dl, MVT::i32));
	  }
	  case OP_VUZPL:
	    return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VUZPR:
	    return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VZIPL:
	    return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VZIPR:
	    return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VTRNL:
	    return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VTRNR:
	    return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  }
	}

	/// Lower a vector shuffle to a NEON table-lookup intrinsic (tbl1 or tbl2).
	///
	/// \param Op          The shuffle node; operands 0 and 1 are the two inputs.
	/// \param ShuffleMask Element-granular shuffle mask for Op.
	/// \param DAG         DAG used to create nodes.
	/// \returns the table-lookup result bitcast back to Op's value type.
	static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
	SelectionDAG &DAG) {
	// Check to see if we can use the TBL instruction.
	SDValue V1 = Op.getOperand(0);
	SDValue V2 = Op.getOperand(1);
	SDLoc DL(Op);

	EVT EltVT = Op.getValueType().getVectorElementType();
	unsigned BytesPerElt = EltVT.getSizeInBits() / 8;

	// Expand the element mask into a byte mask: each element index becomes
	// BytesPerElt consecutive byte offsets.
	// NOTE(review): a negative (UNDEF) mask entry wraps in the unsigned
	// arithmetic below and yields a large offset - confirm this is intended.
	SmallVector<SDValue, 8> TBLMask;
	for (int Val : ShuffleMask) {
	for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
	unsigned Offset = Byte + Val * BytesPerElt;
	TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
	}
	}

	// The index vector is v8i8, or v16i8 when the result is 128 bits wide.
	MVT IndexVT = MVT::v8i8;
	unsigned IndexLen = 8;
	if (Op.getValueSizeInBits() == 128) {
	IndexVT = MVT::v16i8;
	IndexLen = 16;
	}

	// View both inputs as byte vectors.
	SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
	SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);

	SDValue Shuffle;
	if (V2.getNode()->isUndef()) {
	// Single live input: one table register suffices. A 64-bit input is
	// concatenated with itself to form the v16i8 table operand.
	if (IndexLen == 8)
	V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
	Shuffle = DAG.getNode(
	ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
	DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
	DAG.getBuildVector(IndexVT, DL,
	makeArrayRef(TBLMask.data(), IndexLen)));
	} else {
	if (IndexLen == 8) {
	// Two 64-bit inputs are concatenated into one v16i8 table, so tbl1 is
	// still enough.
	V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
	Shuffle = DAG.getNode(
	ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
	DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
	DAG.getBuildVector(IndexVT, DL,
	makeArrayRef(TBLMask.data(), IndexLen)));
	} else {
	// FIXME: We cannot, for the moment, emit a TBL2 instruction because we
	// cannot currently represent the register constraints on the input
	// table registers.
	// Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
	// DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
	// IndexLen));
	// Two 128-bit inputs: use the two-table tbl2 intrinsic instead.
	Shuffle = DAG.getNode(
	ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
	DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
	V2Cst, DAG.getBuildVector(IndexVT, DL,
	makeArrayRef(TBLMask.data(), IndexLen)));
	}
	}
	return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
	}

	/// Map a vector element type to the DUPLANE opcode of matching lane width.
	///
	/// \param EltType Element type; must be i8, i16/f16/bf16, i32/f32 or i64/f64.
	/// \returns AArch64ISD::DUPLANE8, DUPLANE16, DUPLANE32 or DUPLANE64. Any other
	///          element type is treated as unreachable.
	static unsigned getDUPLANEOp(EVT EltType) {
	  if (EltType == MVT::i8)
	    return AArch64ISD::DUPLANE8;
	  if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
	    return AArch64ISD::DUPLANE16;
	  if (EltType == MVT::i32 || EltType == MVT::f32)
	    return AArch64ISD::DUPLANE32;
	  if (EltType == MVT::i64 || EltType == MVT::f64)
	    return AArch64ISD::DUPLANE64;

	  llvm_unreachable("Invalid vector element type?");
	}

	/// Lower a VECTOR_SHUFFLE node to target-specific nodes.
	///
	/// The mask is tested against a fixed sequence of patterns and the first match
	/// wins: splat (DUP / DUPLANE), REV64/32/16, EXT, ZIP/UZP/TRN (two-source and
	/// single-source forms), concatenation, single-lane insert, the perfect
	/// shuffle table for four-element vectors, and finally a TBL lookup.
	///
	/// \param Op  The shuffle node; must be a ShuffleVectorSDNode.
	/// \param DAG DAG used to create nodes.
	/// \returns the lowered node.
	SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
	                                                   SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();

	  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());

	  // Convert shuffles that are directly supported on NEON to target-specific
	  // DAG nodes, instead of keeping them as shuffles and matching them again
	  // during code selection. This is more efficient and avoids the possibility
	  // of inconsistencies between legalization and selection.
	  ArrayRef<int> ShuffleMask = SVN->getMask();

	  SDValue V1 = Op.getOperand(0);
	  SDValue V2 = Op.getOperand(1);

	  if (SVN->isSplat()) {
	    int Lane = SVN->getSplatIndex();
	    // If this is undef splat, generate it via "just" vdup, if possible.
	    if (Lane == -1)
	      Lane = 0;

	    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
	      return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
	                         V1.getOperand(0));
	    // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
	    // constant. If so, we can just reference the lane's definition directly.
	    if (V1.getOpcode() == ISD::BUILD_VECTOR &&
	        !isa<ConstantSDNode>(V1.getOperand(Lane)))
	      return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));

	    // Otherwise, duplicate from the lane of the input vector.
	    unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());

	    // Try to eliminate a bitcasted extract subvector before a DUPLANE.
	    // On success LaneC is rebased onto the wide source vector and CastVT is
	    // the type that source must be bitcast to.
	    auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
	      // Match: dup (bitcast (extract_subv X, C)), LaneC
	      if (BitCast.getOpcode() != ISD::BITCAST ||
	          BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
	        return false;

	      // The extract index must align in the destination type. That may not
	      // happen if the bitcast is from narrow to wide type.
	      SDValue Extract = BitCast.getOperand(0);
	      unsigned ExtIdx = Extract.getConstantOperandVal(1);
	      unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
	      unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
	      unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
	      if (ExtIdxInBits % CastedEltBitWidth != 0)
	        return false;

	      // Update the lane value by offsetting with the scaled extract index.
	      LaneC += ExtIdxInBits / CastedEltBitWidth;

	      // Determine the casted vector type of the wide vector input.
	      // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
	      // Examples:
	      // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
	      // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
	      unsigned SrcVecNumElts =
	          Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
	      CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
	                                SrcVecNumElts);
	      return true;
	    };
	    MVT CastVT;
	    if (getScaledOffsetDup(V1, Lane, CastVT)) {
	      V1 = DAG.getBitcast(CastVT, V1.getOperand(0).getOperand(0));
	    } else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
	      // The lane is incremented by the index of the extract.
	      // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
	      Lane += V1.getConstantOperandVal(1);
	      V1 = V1.getOperand(0);
	    } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
	      // The lane is decremented if we are splatting from the 2nd operand.
	      // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
	      unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
	      Lane -= Idx * VT.getVectorNumElements() / 2;
	      V1 = WidenVector(V1.getOperand(Idx), DAG);
	    } else if (VT.getSizeInBits() == 64) {
	      // Widen the operand to 128-bit register with undef.
	      V1 = WidenVector(V1, DAG);
	    }
	    return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
	  }

	  // Element reversal within 64-, 32- or 16-bit blocks.
	  if (isREVMask(ShuffleMask, VT, 64))
	    return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
	  if (isREVMask(ShuffleMask, VT, 32))
	    return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
	  if (isREVMask(ShuffleMask, VT, 16))
	    return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);

	  // EXT: a contiguous window over the two inputs (or over V1 twice when V2 is
	  // undef). Imm is returned in elements and scaled by getExtFactor.
	  bool ReverseEXT = false;
	  unsigned Imm;
	  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
	    if (ReverseEXT)
	      std::swap(V1, V2);
	    Imm *= getExtFactor(V1);
	    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
	                       DAG.getConstant(Imm, dl, MVT::i32));
	  } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
	    Imm *= getExtFactor(V1);
	    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
	                       DAG.getConstant(Imm, dl, MVT::i32));
	  }

	  // Two-source ZIP / UZP / TRN; WhichResult selects the "1" or "2" variant.
	  unsigned WhichResult;
	  if (isZIPMask(ShuffleMask, VT, WhichResult)) {
	    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
	    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
	  }
	  if (isUZPMask(ShuffleMask, VT, WhichResult)) {
	    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
	    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
	  }
	  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
	    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
	    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
	  }

	  // Single-source forms of the same patterns: V1 is used for both operands.
	  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
	    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
	    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
	  }
	  if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
	    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
	    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
	  }
	  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
	    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
	    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
	  }

	  if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
	    return Concat;

	  // Identity copy of one input with a single lane replaced: lower to an
	  // extract of the source lane followed by an insert into the destination.
	  bool DstIsLeft;
	  int Anomaly;
	  int NumInputElements = V1.getValueType().getVectorNumElements();
	  if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
	    SDValue DstVec = DstIsLeft ? V1 : V2;
	    SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);

	    SDValue SrcVec = V1;
	    int SrcLane = ShuffleMask[Anomaly];
	    if (SrcLane >= NumInputElements) {
	      SrcVec = V2;
	      SrcLane -= VT.getVectorNumElements();
	    }
	    SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);

	    EVT ScalarVT = VT.getVectorElementType();

	    // Integer elements narrower than 32 bits are extracted as i32.
	    if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
	      ScalarVT = MVT::i32;

	    return DAG.getNode(
	        ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
	        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
	        DstLaneV);
	  }

	  // If the shuffle is not directly supported and it has 4 elements, use
	  // the PerfectShuffle-generated table to synthesize it from other shuffles.
	  unsigned NumElts = VT.getVectorNumElements();
	  if (NumElts == 4) {
	    // Each mask entry becomes a base-9 digit; 8 stands for UNDEF.
	    unsigned PFIndexes[4];
	    for (unsigned i = 0; i != 4; ++i) {
	      if (ShuffleMask[i] < 0)
	        PFIndexes[i] = 8;
	      else
	        PFIndexes[i] = ShuffleMask[i];
	    }

	    // Compute the index in the perfect shuffle table.
	    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
	                            PFIndexes[2] * 9 + PFIndexes[3];
	    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
	    // The top two bits of the entry hold its cost.
	    unsigned Cost = (PFEntry >> 30);

	    if (Cost <= 4)
	      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
	  }

	  return GenerateTBL(Op, ShuffleMask, DAG);
	}

	/// Lower a SPLAT_VECTOR node.
	///
	/// i1 elements take a dedicated path (an all-true predicate for constant one,
	/// otherwise a whilelo intrinsic). Every other supported element type becomes
	/// an AArch64ISD::DUP of the scalar, with integer scalars first brought to
	/// i32 or i64. An unsupported element type is a fatal error.
	///
	/// \param Op  The SPLAT_VECTOR node; operand 0 is the scalar to replicate.
	/// \param DAG DAG used to create nodes.
	/// \returns the lowered node.
	SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
	SelectionDAG &DAG) const {
	SDLoc dl(Op);
	EVT VT = Op.getValueType();
	EVT ElemVT = VT.getScalarType();

	SDValue SplatVal = Op.getOperand(0);

	// Extend input splat value where needed to fit into a GPR (32b or 64b only)
	// FPRs don't have this restriction.
	switch (ElemVT.getSimpleVT().SimpleTy) {
	case MVT::i1: {
	// The only legal i1 vectors are SVE vectors, so we can use SVE-specific
	// lowering code.
	if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
	if (ConstVal->isOne())
	return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
	// TODO: Add special case for constant false
	}
	// The general case of i1. There isn't any natural way to do this,
	// so we use some trickery with whilelo.
	// The bit is sign-extended into an i64 (0 or all-ones) and used as the
	// second whilelo operand, with constant 0 as the first.
	SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
	SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal,
	DAG.getValueType(MVT::i1));
	SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl,
	MVT::i64);
	return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
	DAG.getConstant(0, dl, MVT::i64), SplatVal);
	}
	// Integer elements up to 32 bits are passed as an i32 scalar.
	case MVT::i8:
	case MVT::i16:
	case MVT::i32:
	SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
	break;
	case MVT::i64:
	SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
	break;
	case MVT::f16:
	case MVT::bf16:
	case MVT::f32:
	case MVT::f64:
	// Fine as is
	break;
	default:
	report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
	}

	return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
	}

	/// Lower a DUPQ-lane operation: replicate one 128-bit block of a scalable
	/// vector across the whole result.
	///
	/// \param Op  The node being lowered; operand 1 is the source vector and
	///            operand 2 the index of the 128-bit block to replicate.
	/// \param DAG DAG used to create nodes.
	/// \returns the lowered node, or an empty SDValue when the result type is not
	///          a legal scalable vector whose known minimum size is
	///          AArch64::SVEBitsPerBlock.
	SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  SDLoc DL(Op);

	  EVT VT = Op.getValueType();
	  if (!isTypeLegal(VT) || !VT.isScalableVector())
	    return SDValue();

	  // Current lowering only supports the SVE-ACLE types.
	  if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
	    return SDValue();

	  // The DUPQ operation is independent of element type so normalise to i64s.
	  SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
	  SDValue Idx128 = Op.getOperand(2);

	  // DUPQ can be used when idx is in range.
	  auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
	  if (CIdx && (CIdx->getZExtValue() <= 3)) {
	    SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
	    SDNode *DUPQ =
	        DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
	    return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
	  }

	  // The ACLE says this must produce the same result as:
	  // svtbl(data, svadd_x(svptrue_b64(),
	  // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
	  // index * 2))
	  SDValue One = DAG.getConstant(1, DL, MVT::i64);
	  SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);

	  // create the vector 0,1,0,1,...
	  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
	  SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR,
	                           DL, MVT::nxv2i64, Zero, One);
	  SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);

	  // create the vector idx64,idx64+1,idx64,idx64+1,...
	  // (idx64 is the 128-bit index doubled, i.e. expressed in 64-bit lanes)
	  SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
	  SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
	  SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);

	  // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
	  SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
	  return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
	}


	/// Expand a constant-splat BUILD_VECTOR into full-width bit patterns.
	///
	/// \param BVN       The BUILD_VECTOR node to inspect.
	/// \param CnstBits  Accumulator that receives the splat value replicated
	///                  across the vector width. It is shifted and OR-ed into,
	///                  not reset, so the caller supplies the initial value.
	/// \param UndefBits Accumulator that receives (SplatBits ^ SplatUndef)
	///                  replicated the same way.
	/// \returns true if BVN is a constant splat (both accumulators updated),
	///          false otherwise (both left untouched).
	static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
	                               APInt &UndefBits) {
	  EVT VT = BVN->getValueType(0);
	  APInt SplatBits, SplatUndef;
	  unsigned SplatBitSize;
	  bool HasAnyUndefs;
	  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
	    unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;

	    // Shift in one copy of the splat pattern per repetition.
	    for (unsigned i = 0; i < NumSplats; ++i) {
	      CnstBits <<= SplatBitSize;
	      UndefBits <<= SplatBitSize;
	      CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
	      UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
	    }

	    return true;
	  }

	  return false;
	}

	/// Attempt to materialise a constant as a 64-bit splatted AdvSIMD modified
	/// immediate (type 10).
	///
	/// \param NewOp Opcode of the node to create.
	/// \param Op    Node being replaced; supplies the result type and location.
	/// \param DAG   DAG used to create nodes.
	/// \param Bits  Constant bit pattern to encode.
	/// \returns an NVCAST of the new node to Op's type, or an empty SDValue if
	///          the two 64-bit halves differ or the value is not a type-10
	///          immediate.
	static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
	                                  const APInt &Bits) {
	  // Both 64-bit halves must agree.
	  if (Bits.getHiBits(64) != Bits.getLoBits(64))
	    return SDValue();

	  const uint64_t Raw = Bits.zextOrTrunc(64).getZExtValue();
	  if (!AArch64_AM::isAdvSIMDModImmType10(Raw))
	    return SDValue();

	  EVT ResultVT = Op.getValueType();
	  MVT MovTy = MVT::f64;
	  if (ResultVT.getSizeInBits() == 128)
	    MovTy = MVT::v2i64;

	  const uint64_t Encoded = AArch64_AM::encodeAdvSIMDModImmType10(Raw);

	  SDLoc Loc(Op);
	  SDValue Mov =
	      DAG.getNode(NewOp, Loc, MovTy, DAG.getConstant(Encoded, Loc, MVT::i32));
	  return DAG.getNode(AArch64ISD::NVCAST, Loc, ResultVT, Mov);
	}

	/// Attempt to materialise a constant as a 32-bit splatted AdvSIMD modified
	/// immediate (types 1-4, which carry a shift of 0, 8, 16 or 24).
	///
	/// \param NewOp Opcode of the node to create.
	/// \param Op    Node being replaced; supplies the result type and location.
	/// \param DAG   DAG used to create nodes.
	/// \param Bits  Constant bit pattern to encode.
	/// \param LHS   Optional extra leading operand for the new node.
	/// \returns an NVCAST of the new node to Op's type, or an empty SDValue if
	///          the two 64-bit halves differ or no encoding applies.
	static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
	                                  const APInt &Bits,
	                                  const SDValue *LHS = nullptr) {
	  if (Bits.getHiBits(64) != Bits.getLoBits(64))
	    return SDValue();

	  const uint64_t Raw = Bits.zextOrTrunc(64).getZExtValue();

	  // Try the encodings in order; the first that fits decides the shift.
	  uint64_t Encoded;
	  uint64_t Shift;
	  if (AArch64_AM::isAdvSIMDModImmType1(Raw)) {
	    Encoded = AArch64_AM::encodeAdvSIMDModImmType1(Raw);
	    Shift = 0;
	  } else if (AArch64_AM::isAdvSIMDModImmType2(Raw)) {
	    Encoded = AArch64_AM::encodeAdvSIMDModImmType2(Raw);
	    Shift = 8;
	  } else if (AArch64_AM::isAdvSIMDModImmType3(Raw)) {
	    Encoded = AArch64_AM::encodeAdvSIMDModImmType3(Raw);
	    Shift = 16;
	  } else if (AArch64_AM::isAdvSIMDModImmType4(Raw)) {
	    Encoded = AArch64_AM::encodeAdvSIMDModImmType4(Raw);
	    Shift = 24;
	  } else {
	    return SDValue();
	  }

	  EVT ResultVT = Op.getValueType();
	  MVT MovTy = MVT::v2i32;
	  if (ResultVT.getSizeInBits() == 128)
	    MovTy = MVT::v4i32;

	  SDLoc Loc(Op);
	  SDValue Mov;
	  if (LHS) {
	    Mov = DAG.getNode(NewOp, Loc, MovTy, *LHS,
	                      DAG.getConstant(Encoded, Loc, MVT::i32),
	                      DAG.getConstant(Shift, Loc, MVT::i32));
	  } else {
	    Mov = DAG.getNode(NewOp, Loc, MovTy,
	                      DAG.getConstant(Encoded, Loc, MVT::i32),
	                      DAG.getConstant(Shift, Loc, MVT::i32));
	  }

	  return DAG.getNode(AArch64ISD::NVCAST, Loc, ResultVT, Mov);
	}

	/// Attempt to materialise a constant as a 16-bit splatted AdvSIMD modified
	/// immediate (types 5-6, which carry a shift of 0 or 8).
	///
	/// \param NewOp Opcode of the node to create.
	/// \param Op    Node being replaced; supplies the result type and location.
	/// \param DAG   DAG used to create nodes.
	/// \param Bits  Constant bit pattern to encode.
	/// \param LHS   Optional extra leading operand for the new node.
	/// \returns an NVCAST of the new node to Op's type, or an empty SDValue if
	///          the two 64-bit halves differ or no encoding applies.
	static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
	                                  const APInt &Bits,
	                                  const SDValue *LHS = nullptr) {
	  if (Bits.getHiBits(64) != Bits.getLoBits(64))
	    return SDValue();

	  const uint64_t Raw = Bits.zextOrTrunc(64).getZExtValue();

	  uint64_t Encoded;
	  uint64_t Shift;
	  if (AArch64_AM::isAdvSIMDModImmType5(Raw)) {
	    Encoded = AArch64_AM::encodeAdvSIMDModImmType5(Raw);
	    Shift = 0;
	  } else if (AArch64_AM::isAdvSIMDModImmType6(Raw)) {
	    Encoded = AArch64_AM::encodeAdvSIMDModImmType6(Raw);
	    Shift = 8;
	  } else {
	    return SDValue();
	  }

	  EVT ResultVT = Op.getValueType();
	  MVT MovTy = MVT::v4i16;
	  if (ResultVT.getSizeInBits() == 128)
	    MovTy = MVT::v8i16;

	  SDLoc Loc(Op);
	  SDValue Mov;
	  if (LHS) {
	    Mov = DAG.getNode(NewOp, Loc, MovTy, *LHS,
	                      DAG.getConstant(Encoded, Loc, MVT::i32),
	                      DAG.getConstant(Shift, Loc, MVT::i32));
	  } else {
	    Mov = DAG.getNode(NewOp, Loc, MovTy,
	                      DAG.getConstant(Encoded, Loc, MVT::i32),
	                      DAG.getConstant(Shift, Loc, MVT::i32));
	  }

	  return DAG.getNode(AArch64ISD::NVCAST, Loc, ResultVT, Mov);
	}

	/// Attempt to materialise a constant as a 32-bit splatted AdvSIMD modified
	/// immediate of the "shifted ones" kind (types 7-8).
	///
	/// \param NewOp Opcode of the node to create.
	/// \param Op    Node being replaced; supplies the result type and location.
	/// \param DAG   DAG used to create nodes.
	/// \param Bits  Constant bit pattern to encode.
	/// \returns an NVCAST of the new node to Op's type, or an empty SDValue if
	///          the two 64-bit halves differ or no encoding applies.
	static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
	                                    SelectionDAG &DAG, const APInt &Bits) {
	  if (Bits.getHiBits(64) != Bits.getLoBits(64))
	    return SDValue();

	  const uint64_t Raw = Bits.zextOrTrunc(64).getZExtValue();

	  // The shift operand passed on is 264 for type 7 and 272 for type 8.
	  uint64_t Encoded;
	  uint64_t Shift;
	  if (AArch64_AM::isAdvSIMDModImmType7(Raw)) {
	    Encoded = AArch64_AM::encodeAdvSIMDModImmType7(Raw);
	    Shift = 264;
	  } else if (AArch64_AM::isAdvSIMDModImmType8(Raw)) {
	    Encoded = AArch64_AM::encodeAdvSIMDModImmType8(Raw);
	    Shift = 272;
	  } else {
	    return SDValue();
	  }

	  EVT ResultVT = Op.getValueType();
	  MVT MovTy = MVT::v2i32;
	  if (ResultVT.getSizeInBits() == 128)
	    MovTy = MVT::v4i32;

	  SDLoc Loc(Op);
	  SDValue Mov = DAG.getNode(NewOp, Loc, MovTy,
	                            DAG.getConstant(Encoded, Loc, MVT::i32),
	                            DAG.getConstant(Shift, Loc, MVT::i32));
	  return DAG.getNode(AArch64ISD::NVCAST, Loc, ResultVT, Mov);
	}

	// Try to materialize an 8-bit splatted SIMD immediate.
	//
	// Bits must have identical 64-bit halves and be a modified-immediate type 9
	// value. The NewOp node is created on v16i8 (128-bit Op) or v8i8 and NVCAST
	// to Op's value type. Returns an empty SDValue when there is no match.
	static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
	                                 const APInt &Bits) {
	  if (Bits.getHiBits(64) != Bits.getLoBits(64))
	    return SDValue();

	  uint64_t Imm = Bits.zextOrTrunc(64).getZExtValue();
	  if (!AArch64_AM::isAdvSIMDModImmType9(Imm))
	    return SDValue();
	  Imm = AArch64_AM::encodeAdvSIMDModImmType9(Imm);

	  EVT ResultVT = Op.getValueType();
	  MVT ImmVT = (ResultVT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;

	  SDLoc Loc(Op);
	  SDValue Mov =
	      DAG.getNode(NewOp, Loc, ImmVT, DAG.getConstant(Imm, Loc, MVT::i32));
	  return DAG.getNode(AArch64ISD::NVCAST, Loc, ResultVT, Mov);
	}

	// Try to materialize an FP splatted SIMD immediate.
	//
	// Bits must have identical 64-bit halves. A type 11 value is emitted on
	// v4f32/v2f32 depending on whether Op is 128 bits wide; a type 12 value is
	// only considered for 128-bit Op and is emitted on v2f64. The NewOp node is
	// NVCAST to Op's value type. Returns an empty SDValue when there is no match.
	static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
	                                  const APInt &Bits) {
	  if (Bits.getHiBits(64) != Bits.getLoBits(64))
	    return SDValue();

	  uint64_t Imm = Bits.zextOrTrunc(64).getZExtValue();
	  EVT ResultVT = Op.getValueType();
	  const bool Is128Bit = (ResultVT.getSizeInBits() == 128);

	  MVT ImmVT;
	  if (AArch64_AM::isAdvSIMDModImmType11(Imm)) {
	    Imm = AArch64_AM::encodeAdvSIMDModImmType11(Imm);
	    ImmVT = Is128Bit ? MVT::v4f32 : MVT::v2f32;
	  } else if (Is128Bit && AArch64_AM::isAdvSIMDModImmType12(Imm)) {
	    Imm = AArch64_AM::encodeAdvSIMDModImmType12(Imm);
	    ImmVT = MVT::v2f64;
	  } else {
	    return SDValue();
	  }

	  SDLoc Loc(Op);
	  SDValue Mov =
	      DAG.getNode(NewOp, Loc, ImmVT, DAG.getConstant(Imm, Loc, MVT::i32));
	  return DAG.getNode(AArch64ISD::NVCAST, Loc, ResultVT, Mov);
	}

	// Fast check for a BUILD_VECTOR whose every operand is the very same
	// ConstantSDNode. Returns true and stores that constant's zero-extended value
	// in ConstVal on success; returns false (leaving ConstVal untouched) when
	// PotentialBVec is not a BuildVectorSDNode, its first operand is not a
	// constant, or any later operand is a different node.
	static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
	                                     uint64_t &ConstVal) {
	  auto *BuildVec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
	  if (!BuildVec)
	    return false;

	  auto *Lane0 = dyn_cast<ConstantSDNode>(BuildVec->getOperand(0));
	  if (!Lane0)
	    return false;

	  // Lanes are compared by node identity against lane 0.
	  const unsigned LaneCount = BuildVec->getValueType(0).getVectorNumElements();
	  for (unsigned Lane = 1; Lane < LaneCount; ++Lane) {
	    auto *LaneCst = dyn_cast<ConstantSDNode>(BuildVec->getOperand(Lane));
	    if (LaneCst != Lane0)
	      return false;
	  }

	  ConstVal = Lane0->getZExtValue();
	  return true;
	}

	// Returns the intrinsic ID carried in operand 0 of an INTRINSIC_WO_CHAIN
	// node, or Intrinsic::not_intrinsic for any other opcode or for an ID that is
	// not below Intrinsic::num_intrinsics.
	static unsigned getIntrinsicID(const SDNode *N) {
	  if (N->getOpcode() != ISD::INTRINSIC_WO_CHAIN)
	    return Intrinsic::not_intrinsic;

	  unsigned ID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
	  if (ID >= Intrinsic::num_intrinsics)
	    return Intrinsic::not_intrinsic;
	  return ID;
	}

	// Attempt to turn (or (and X, BvecC1), (lsl Y, C2)) into a vector shift-insert
	// (SLI X, Y, C2), or the logical-right-shift equivalent into SRI. X and Y
	// are vectors of matching type, BvecC1 is a BUILD_VECTOR splat of the
	// constant C1, C2 is a constant shift amount, and:
	// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
	// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
	// Either operand order of the OR is accepted. Returns an empty SDValue when
	// the pattern does not match.
	static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);
	  if (!VT.isVector())
	    return SDValue();

	  SDLoc DL(N);

	  // The mask side is an AND, or a BICi if the AND was already rewritten to use
	  // an immediate instead of a register.
	  auto IsMaskOpcode = [](unsigned Opc) {
	    switch (Opc) {
	    case ISD::AND:
	    case AArch64ISD::BICi:
	      return true;
	    default:
	      return false;
	    }
	  };
	  // The shift side is a shl/lshr that has already become
	  // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
	  auto IsShiftOpcode = [](unsigned Opc) {
	    switch (Opc) {
	    case AArch64ISD::VSHL:
	    case AArch64ISD::VLSHR:
	      return true;
	    default:
	      return false;
	    }
	  };

	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);
	  const unsigned Opc0 = Op0.getOpcode();
	  const unsigned Opc1 = Op1.getOpcode();

	  SDValue Mask;
	  SDValue Shift;
	  if (IsMaskOpcode(Opc0) && IsShiftOpcode(Opc1)) {
	    Mask = Op0;
	    Shift = Op1;
	  } else if (IsMaskOpcode(Opc1) && IsShiftOpcode(Opc0)) {
	    Mask = Op1;
	    Shift = Op0;
	  } else {
	    return SDValue();
	  }

	  const bool MaskIsAnd = Mask.getOpcode() == ISD::AND;
	  const bool ShiftsRight = Shift.getOpcode() == AArch64ISD::VLSHR;

	  // The shift amount has to be a constant.
	  auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
	  if (!ShiftAmtNode)
	    return SDValue();

	  uint64_t MaskBits;
	  if (MaskIsAnd) {
	    // The AND mask must be a splat of one constant.
	    if (!isAllConstantBuildVector(Mask.getOperand(1), MaskBits))
	      return SDValue();
	  } else {
	    // Rebuild the equivalent AND immediate from the two BICi immediates.
	    auto *BicImm = dyn_cast<ConstantSDNode>(Mask.getOperand(1));
	    auto *BicShift = dyn_cast<ConstantSDNode>(Mask.getOperand(2));
	    assert(BicImm && BicShift);
	    MaskBits = ~(BicImm->getZExtValue() << BicShift->getZExtValue());
	  }

	  // Check the mask against the bits the shifted operand leaves untouched,
	  // taking into account how far elements of this size can be shifted.
	  uint64_t ShiftAmt = ShiftAmtNode->getZExtValue();
	  unsigned EltBits = VT.getScalarSizeInBits();
	  if (ShiftAmt > EltBits)
	    return SDValue();

	  APInt ActualMask(EltBits, MaskBits);
	  APInt ExpectedMask = ShiftsRight ? APInt::getHighBitsSet(EltBits, ShiftAmt)
	                                   : APInt::getLowBitsSet(EltBits, ShiftAmt);
	  if (ActualMask != ExpectedMask)
	    return SDValue();

	  SDValue X = Mask.getOperand(0);
	  SDValue Y = Shift.getOperand(0);

	  unsigned InsertOpc = ShiftsRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
	  SDValue Inserted = DAG.getNode(InsertOpc, DL, VT, X, Y, Shift.getOperand(1));

	  LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
	  LLVM_DEBUG(N->dump(&DAG));
	  LLVM_DEBUG(dbgs() << "into: \n");
	  LLVM_DEBUG(Inserted->dump(&DAG));

	  ++NumShiftInserts;
	  return Inserted;
	}

	// Custom lowering for a vector OR. First tries to form a shift-insert via
	// tryLowerToSLI; otherwise, when one operand is a resolvable BUILD_VECTOR,
	// tries to fold it into an ORRi with a 32-bit or 16-bit modified immediate.
	// Returns Op unchanged when nothing applies.
	SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
	  if (SDValue ShiftInsert = tryLowerToSLI(Op.getNode(), DAG))
	    return ShiftInsert;

	  EVT VT = Op.getValueType();

	  // Locate the BUILD_VECTOR operand; OR commutes, so either side will do.
	  SDValue Other = Op.getOperand(0);
	  auto *ConstVec = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
	  if (!ConstVec) {
	    Other = Op.getOperand(1);
	    ConstVec = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
	  }
	  if (!ConstVec)
	    return Op;

	  APInt DefBits(VT.getSizeInBits(), 0);
	  APInt UndefBits(VT.getSizeInBits(), 0);
	  // We can always fall back to a non-immediate OR.
	  if (!resolveBuildVector(ConstVec, DefBits, UndefBits))
	    return Op;

	  // Try the defined bits first, then the pattern reported in UndefBits.
	  for (const APInt *Candidate : {&DefBits, &UndefBits}) {
	    if (SDValue Imm = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, *Candidate,
	                                         &Other))
	      return Imm;
	    if (SDValue Imm = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, *Candidate,
	                                         &Other))
	      return Imm;
	  }

	  return Op;
	}

	// Normalize the operands of a BUILD_VECTOR whose integer elements are at most
	// 16 bits wide: constant lanes are truncated to the element width and
	// re-emitted as i32 constants, undef lanes become i32 UNDEF, and any other
	// lane is expected to already be i32. Floating-point vectors and vectors with
	// wider elements are returned unchanged.
	static SDValue NormalizeBuildVector(SDValue Op,
	                                    SelectionDAG &DAG) {
	  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
	  EVT VecVT = Op.getValueType();
	  EVT LaneVT = VecVT.getVectorElementType();

	  if (LaneVT.isFloatingPoint())
	    return Op;
	  if (LaneVT.getSizeInBits() > 16)
	    return Op;

	  SDLoc Loc(Op);
	  SmallVector<SDValue, 16> NewLanes;
	  for (SDValue Elt : Op->ops()) {
	    // For integer vectors, type legalization would have promoted the
	    // operands already. Otherwise, if Op is a floating-point splat
	    // (with operands cast to integers), then the only possibilities
	    // are constants and UNDEFs.
	    if (auto *EltCst = dyn_cast<ConstantSDNode>(Elt)) {
	      APInt Truncated(LaneVT.getSizeInBits(), EltCst->getZExtValue());
	      Elt = DAG.getConstant(Truncated.getZExtValue(), Loc, MVT::i32);
	    } else if (Elt.getNode()->isUndef()) {
	      Elt = DAG.getUNDEF(MVT::i32);
	    } else {
	      assert(Elt.getValueType() == MVT::i32 &&
	             "Unexpected BUILD_VECTOR operand type");
	    }
	    NewLanes.push_back(Elt);
	  }
	  return DAG.getBuildVector(VecVT, Loc, NewLanes);
	}

	// Try to materialize the constant BUILD_VECTOR Op with a single immediate
	// move. The bit pattern from resolveBuildVector is tried as a MOVI/FMOV
	// style immediate, then inverted as an MVNI style immediate; the same two
	// attempts are then repeated with the UndefBits pattern. Returns an empty
	// SDValue when no encoding fits or the vector cannot be resolved.
	static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
	  EVT VT = Op.getValueType();

	  APInt DefBits(VT.getSizeInBits(), 0);
	  APInt UndefBits(VT.getSizeInBits(), 0);
	  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
	  if (!resolveBuildVector(BVN, DefBits, UndefBits))
	    return SDValue();

	  // Encodings that produce Imm directly, in order of preference.
	  auto TryMoveForms = [&](const APInt &Imm) -> SDValue {
	    if (SDValue V = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, Imm))
	      return V;
	    if (SDValue V = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, Imm))
	      return V;
	    if (SDValue V = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, Imm))
	      return V;
	    if (SDValue V = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, Imm))
	      return V;
	    if (SDValue V = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, Imm))
	      return V;
	    return tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, Imm);
	  };

	  // Encodings that take the bitwise complement of the wanted value.
	  auto TryInvertedForms = [&](const APInt &NotImm) -> SDValue {
	    if (SDValue V = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, NotImm))
	      return V;
	    if (SDValue V = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, NotImm))
	      return V;
	    return tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotImm);
	  };

	  if (SDValue V = TryMoveForms(DefBits))
	    return V;
	  if (SDValue V = TryInvertedForms(~DefBits))
	    return V;
	  if (SDValue V = TryMoveForms(UndefBits))
	    return V;
	  return TryInvertedForms(~UndefBits);
	}

	// Custom lowering for BUILD_VECTOR. The strategies below are attempted in
	// order, and the first one that applies wins:
	//   - integer all-zeros / all-ones constant splats are returned as-is;
	//   - ConstantBuildVector (single immediate move);
	//   - UNDEF when no lane is defined;
	//   - SCALAR_TO_VECTOR when only lane 0 is defined;
	//   - UZP1 / UZP2 when every lane is an EXTRACT_VECTOR_ELT picking the even
	//     or odd lanes of one vector;
	//   - DUP / DUPLANE for splats (FP constant splats are retried as integers);
	//   - splat of the single constant followed by INSERT_VECTOR_ELT for the
	//     non-constant lanes;
	//   - ReconstructShuffle for vectors of at least four elements;
	//   - a chain of INSERT_VECTOR_ELT.
	// An empty SDValue is returned to request the default expansion.
	SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
	SelectionDAG &DAG) const {
	EVT VT = Op.getValueType();

	// Try to build a simple constant vector.
	Op = NormalizeBuildVector(Op, DAG);
	if (VT.isInteger()) {
	// Certain vector constants, used to express things like logical NOT and
	// arithmetic NEG, are passed through unmodified. This allows special
	// patterns for these operations to match, which will lower these constants
	// to whatever is proven necessary.
	BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
	if (BVN->isConstant())
	if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
	unsigned BitSize = VT.getVectorElementType().getSizeInBits();
	APInt Val(BitSize,
	Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
	if (Val.isNullValue() \|\| Val.isAllOnesValue())
	return Op;
	}
	}

	if (SDValue V = ConstantBuildVector(Op, DAG))
	return V;

	// Scan through the operands to find some interesting properties we can
	// exploit:
	// 1) If only one value is used, we can use a DUP, or
	// 2) if only the low element is not undef, we can just insert that, or
	// 3) if only one constant value is used (w/ some non-constant lanes),
	// we can splat the constant value into the whole vector then fill
	// in the non-constant lanes.
	// 4) FIXME: If different constant values are used, but we can intelligently
	// select the values we'll be overwriting for the non-constant
	// lanes such that we can directly materialize the vector
	// some other way (MOVI, e.g.), we can be sneaky.
	// 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
	SDLoc dl(Op);
	unsigned NumElts = VT.getVectorNumElements();
	bool isOnlyLowElement = true;
	bool usesOnlyOneValue = true;
	bool usesOnlyOneConstantValue = true;
	bool isConstant = true;
	bool AllLanesExtractElt = true;
	unsigned NumConstantLanes = 0;
	// Value is the first non-undef lane seen; ConstantValue is the first
	// constant lane seen.
	SDValue Value;
	SDValue ConstantValue;
	for (unsigned i = 0; i < NumElts; ++i) {
	SDValue V = Op.getOperand(i);
	// Note: this check runs before the undef skip, so an undef lane also
	// clears AllLanesExtractElt.
	if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	AllLanesExtractElt = false;
	if (V.isUndef())
	continue;
	if (i > 0)
	isOnlyLowElement = false;
	if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
	isConstant = false;

	if (isa<ConstantSDNode>(V) \|\| isa<ConstantFPSDNode>(V)) {
	++NumConstantLanes;
	if (!ConstantValue.getNode())
	ConstantValue = V;
	else if (ConstantValue != V)
	usesOnlyOneConstantValue = false;
	}

	if (!Value.getNode())
	Value = V;
	else if (V != Value)
	usesOnlyOneValue = false;
	}

	// Every lane was undef.
	if (!Value.getNode()) {
	LLVM_DEBUG(
	dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
	return DAG.getUNDEF(VT);
	}

	// Convert BUILD_VECTOR where all elements but the lowest are undef into
	// SCALAR_TO_VECTOR, except for when we have a single-element constant vector
	// as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
	if (isOnlyLowElement && !(NumElts == 1 && isa<ConstantSDNode>(Value))) {
	LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
	"SCALAR_TO_VECTOR node\n");
	return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
	}

	if (AllLanesExtractElt) {
	SDNode *Vector = nullptr;
	bool Even = false;
	bool Odd = false;
	// Check whether the extract elements match the Even pattern <0,2,4,...> or
	// the Odd pattern <1,3,5,...>.
	for (unsigned i = 0; i < NumElts; ++i) {
	SDValue V = Op.getOperand(i);
	const SDNode *N = V.getNode();
	if (!isa<ConstantSDNode>(N->getOperand(1)))
	break;
	SDValue N0 = N->getOperand(0);

	// All elements are extracted from the same vector.
	if (!Vector) {
	Vector = N0.getNode();
	// Check that the type of EXTRACT_VECTOR_ELT matches the type of
	// BUILD_VECTOR.
	if (VT.getVectorElementType() !=
	N0.getValueType().getVectorElementType())
	break;
	} else if (Vector != N0.getNode()) {
	Odd = false;
	Even = false;
	break;
	}

	// Extracted values are either at Even indices <0,2,4,...> or at Odd
	// indices <1,3,5,...>.
	uint64_t Val = N->getConstantOperandVal(1);
	if (Val == 2 * i) {
	Even = true;
	continue;
	}
	if (Val - 1 == 2 * i) {
	Odd = true;
	continue;
	}

	// Something does not match: abort.
	Odd = false;
	Even = false;
	break;
	}
	if (Even \|\| Odd) {
	// Split the source vector into its low and high halves (starting at
	// element 0 and element NumElts) to feed UZP1/UZP2.
	SDValue LHS =
	DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
	DAG.getConstant(0, dl, MVT::i64));
	SDValue RHS =
	DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
	DAG.getConstant(NumElts, dl, MVT::i64));

	// A mix of even and odd matches (both flags set) falls through to the
	// strategies below.
	if (Even && !Odd)
	return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
	RHS);
	if (Odd && !Even)
	return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
	RHS);
	}
	}

	// Use DUP for non-constant splats. For f32 constant splats, reduce to
	// i32 and try again.
	if (usesOnlyOneValue) {
	if (!isConstant) {
	if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Value.getValueType() != VT) {
	LLVM_DEBUG(
	dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
	return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
	}

	// This is actually a DUPLANExx operation, which keeps everything vectory.

	// From here on Value refers to the vector being extracted from, not the
	// extracted element.
	SDValue Lane = Value.getOperand(1);
	Value = Value.getOperand(0);
	if (Value.getValueSizeInBits() == 64) {
	LLVM_DEBUG(
	dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
	"widening it\n");
	Value = WidenVector(Value, DAG);
	}

	unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
	return DAG.getNode(Opcode, dl, VT, Value, Lane);
	}

	if (VT.getVectorElementType().isFloatingPoint()) {
	SmallVector<SDValue, 8> Ops;
	EVT EltTy = VT.getVectorElementType();
	assert ((EltTy == MVT::f16 \|\| EltTy == MVT::bf16 \|\| EltTy == MVT::f32 \|\|
	EltTy == MVT::f64) && "Unsupported floating-point vector type");
	LLVM_DEBUG(
	dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
	"BITCASTS, and try again\n");
	// Bitcast every lane to the same-width integer type and lower the
	// resulting integer BUILD_VECTOR recursively.
	MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
	for (unsigned i = 0; i < NumElts; ++i)
	Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
	EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
	SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
	LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
	Val.dump(););
	Val = LowerBUILD_VECTOR(Val, DAG);
	// If the recursive lowering produced nothing, fall through to the
	// remaining strategies.
	if (Val.getNode())
	return DAG.getNode(ISD::BITCAST, dl, VT, Val);
	}
	}

	// If there was only one constant value used and for more than one lane,
	// start by splatting that value, then replace the non-constant lanes. This
	// is better than the default, which will perform a separate initialization
	// for each lane.
	if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
	// Firstly, try to materialize the splat constant.
	SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
	Val = ConstantBuildVector(Vec, DAG);
	if (!Val) {
	// Otherwise, materialize the constant and splat it.
	Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
	DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
	}

	// Now insert the non-constant lanes.
	for (unsigned i = 0; i < NumElts; ++i) {
	SDValue V = Op.getOperand(i);
	SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
	if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V))
	// Note that type legalization likely mucked about with the VT of the
	// source operand, so we may have to convert it here before inserting.
	Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
	}
	return Val;
	}

	// This will generate a load from the constant pool.
	if (isConstant) {
	LLVM_DEBUG(
	dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
	"expansion\n");
	return SDValue();
	}

	// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
	if (NumElts >= 4) {
	if (SDValue shuffle = ReconstructShuffle(Op, DAG))
	return shuffle;
	}

	// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
	// know the default expansion would otherwise fall back on something even
	// worse. For a vector with one or two non-undef values, that's
	// scalar_to_vector for the elements followed by a shuffle (provided the
	// shuffle is valid for the target) and materialization element by element
	// on the stack followed by a load for everything else.
	if (!isConstant && !usesOnlyOneValue) {
	LLVM_DEBUG(
	dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
	"of INSERT_VECTOR_ELT\n");

	SDValue Vec = DAG.getUNDEF(VT);
	SDValue Op0 = Op.getOperand(0);
	unsigned i = 0;

	// Use SCALAR_TO_VECTOR for lane zero to
	// a) Avoid a RMW dependency on the full vector register, and
	// b) Allow the register coalescer to fold away the copy if the
	// value is already in an S or D register, and we're forced to emit an
	// INSERT_SUBREG that we can't fold anywhere.
	//
	// We also allow types like i8 and i16 which are illegal scalar but legal
	// vector element types. After type-legalization the inserted value is
	// extended (i32) and it is safe to cast them to the vector type by ignoring
	// the upper bits of the lowest lane (e.g. v8i8, v4i16).
	if (!Op0.isUndef()) {
	LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
	Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
	++i;
	}
	LLVM_DEBUG(if (i < NumElts) dbgs()
	<< "Creating nodes for the other vector elements:\n";);
	// Insert the remaining defined lanes one at a time; undef lanes are left
	// as they are.
	for (; i < NumElts; ++i) {
	SDValue V = Op.getOperand(i);
	if (V.isUndef())
	continue;
	SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
	Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
	}
	return Vec;
	}

	LLVM_DEBUG(
	dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
	"better alternative\n");
	return SDValue();
	}

	// Custom lowering for INSERT_VECTOR_ELT with a constant, in-range lane.
	// The listed 128-bit vector types are returned unchanged; the listed 64-bit
	// types are widened with WidenVector, the insertion is done on the wide
	// vector, and the result is narrowed again. Anything else (including a
	// non-constant or out-of-range lane) yields an empty SDValue.
	SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
	                                                      SelectionDAG &DAG) const {
	  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");

	  EVT VT = Op.getOperand(0).getValueType();

	  // Check for non-constant or out of range lane.
	  auto *LaneCst = dyn_cast<ConstantSDNode>(Op.getOperand(2));
	  if (!LaneCst)
	    return SDValue();
	  if (LaneCst->getZExtValue() >= VT.getVectorNumElements())
	    return SDValue();

	  // Insertion/extraction are legal for V128 types.
	  for (MVT::SimpleValueType Legal128 :
	       {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32,
	        MVT::v2f64, MVT::v8f16, MVT::v8bf16})
	    if (VT == Legal128)
	      return Op;

	  // Only these V64 types are handled below.
	  bool IsHandledV64 = false;
	  for (MVT::SimpleValueType Narrow64 :
	       {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64, MVT::v2f32,
	        MVT::v4f16, MVT::v4bf16})
	    if (VT == Narrow64) {
	      IsHandledV64 = true;
	      break;
	    }
	  if (!IsHandledV64)
	    return SDValue();

	  // For V64 types, we perform insertion by expanding the value
	  // to a V128 type and perform the insertion on that.
	  SDLoc DL(Op);
	  SDValue Widened = WidenVector(Op.getOperand(0), DAG);
	  EVT WidenedVT = Widened.getValueType();

	  SDValue WideInsert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WidenedVT,
	                                   Widened, Op.getOperand(1), Op.getOperand(2));
	  // Re-narrow the resultant vector.
	  return NarrowVector(WideInsert, DAG);
	}

	// Custom lowering for EXTRACT_VECTOR_ELT with a constant, in-range lane.
	// The listed 128-bit vector types are returned unchanged; the listed 64-bit
	// types are widened with WidenVector and the element is extracted from the
	// wide vector, with i8/i16 elements extracted as i32. Anything else
	// (including a non-constant or out-of-range lane) yields an empty SDValue.
	SDValue
	AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");

	  EVT VT = Op.getOperand(0).getValueType();

	  // Check for non-constant or out of range lane.
	  auto *LaneCst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!LaneCst)
	    return SDValue();
	  if (LaneCst->getZExtValue() >= VT.getVectorNumElements())
	    return SDValue();

	  // Insertion/extraction are legal for V128 types.
	  for (MVT::SimpleValueType Legal128 :
	       {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32,
	        MVT::v2f64, MVT::v8f16, MVT::v8bf16})
	    if (VT == Legal128)
	      return Op;

	  // Only these V64 types are handled below.
	  bool IsHandledV64 = false;
	  for (MVT::SimpleValueType Narrow64 :
	       {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64, MVT::v2f32,
	        MVT::v4f16, MVT::v4bf16})
	    if (VT == Narrow64) {
	      IsHandledV64 = true;
	      break;
	    }
	  if (!IsHandledV64)
	    return SDValue();

	  // For V64 types, we perform extraction by expanding the value
	  // to a V128 type and perform the extraction on that.
	  SDLoc DL(Op);
	  SDValue Widened = WidenVector(Op.getOperand(0), DAG);
	  EVT WidenedVT = Widened.getValueType();

	  // i8 and i16 elements are extracted as i32.
	  EVT ResultTy = WidenedVT.getVectorElementType();
	  if (ResultTy == MVT::i16)
	    ResultTy = MVT::i32;
	  else if (ResultTy == MVT::i8)
	    ResultTy = MVT::i32;

	  // For extractions, we just return the result directly.
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResultTy, Widened,
	                     Op.getOperand(1));
	}

	// Custom lowering for EXTRACT_SUBVECTOR producing a fixed-length vector.
	// Returns Op unchanged for the forms that are matched later (index 0 of a
	// packed scalable vector, index 0 of a source of at most 128 bits, or a
	// 64-bit result starting at bit 64 of the source); otherwise returns an
	// empty SDValue.
	SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
	                                                      SelectionDAG &DAG) const {
	  assert(Op.getValueType().isFixedLengthVector() &&
	         "Only cases that extract a fixed length vector are supported!");

	  EVT SrcVT = Op.getOperand(0).getValueType();
	  unsigned StartElt = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
	  unsigned ResultBits = Op.getValueSizeInBits();
	  const bool FromStart = (StartElt == 0);

	  if (SrcVT.isScalableVector()) {
	    // This will be matched by custom code during ISelDAGToDAG.
	    return (FromStart && isPackedVectorType(SrcVT, DAG)) ? Op : SDValue();
	  }

	  // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
	  if (FromStart && SrcVT.getSizeInBits() <= 128)
	    return Op;

	  // If this is extracting the upper 64-bits of a 128-bit vector, we match
	  // that directly.
	  if (ResultBits == 64 && StartElt * SrcVT.getScalarSizeInBits() == 64)
	    return Op;

	  return SDValue();
	}

	// Custom lowering for INSERT_SUBVECTOR into a scalable vector. Op is kept
	// only when a fixed-length, SVE-lowered, packed subvector is inserted at
	// index 0 into an undef vector; every other case returns an empty SDValue.
	SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  assert(Op.getValueType().isScalableVector() &&
	         "Only expect to lower inserts into scalable vectors!");

	  EVT SubVT = Op.getOperand(1).getValueType();
	  unsigned InsertIdx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();

	  // We don't have any patterns for scalable vector yet.
	  if (SubVT.isScalableVector())
	    return SDValue();
	  if (!useSVEForFixedLengthVectorVT(SubVT))
	    return SDValue();

	  // This will be matched by custom code during ISelDAGToDAG.
	  const bool Matchable = InsertIdx == 0 && isPackedVectorType(SubVT, DAG) &&
	                         Op.getOperand(0).isUndef();
	  return Matchable ? Op : SDValue();
	}

	// Reports whether shuffle mask M on type VT is considered legal: either a
	// 4-element 64/128-bit shuffle whose perfect-shuffle table cost is at most 4,
	// or a mask recognised by one of the splat/REV/EXT/TRN/UZP/ZIP/INS/concat
	// predicates. Fixed-length types that need SVE are never legal.
	bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
	  // Currently no fixed length shuffles that require SVE are legal.
	  if (useSVEForFixedLengthVectorVT(VT))
	    return false;

	  if (VT.getVectorNumElements() == 4 &&
	      !(!VT.is128BitVector() && !VT.is64BitVector())) {
	    // Undef mask entries (negative) use index 8 in the table.
	    unsigned Slot[4];
	    for (unsigned Idx = 0; Idx != 4; ++Idx)
	      Slot[Idx] = M[Idx] < 0 ? 8 : M[Idx];

	    // Compute the index in the perfect shuffle table (base-9 digits).
	    unsigned TableIdx = ((Slot[0] * 9 + Slot[1]) * 9 + Slot[2]) * 9 + Slot[3];
	    unsigned Entry = PerfectShuffleTable[TableIdx];
	    // The cost lives in the top two bits of the entry.
	    unsigned EntryCost = Entry >> 30;

	    if (EntryCost <= 4)
	      return true;
	  }

	  // Out-parameters of the mask predicates; their values are not needed here.
	  bool DummyBool;
	  int DummyInt;
	  unsigned DummyUnsigned;

	  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT))
	    return true;
	  if (isREVMask(M, VT, 64))
	    return true;
	  if (isREVMask(M, VT, 32))
	    return true;
	  if (isREVMask(M, VT, 16))
	    return true;
	  if (isEXTMask(M, VT, DummyBool, DummyUnsigned))
	    return true;
	  // FIXME: Port TBL support from ARM (isTBLMask(M, VT)).
	  if (isTRNMask(M, VT, DummyUnsigned))
	    return true;
	  if (isUZPMask(M, VT, DummyUnsigned))
	    return true;
	  if (isZIPMask(M, VT, DummyUnsigned))
	    return true;
	  if (isTRN_v_undef_Mask(M, VT, DummyUnsigned))
	    return true;
	  if (isUZP_v_undef_Mask(M, VT, DummyUnsigned))
	    return true;
	  if (isZIP_v_undef_Mask(M, VT, DummyUnsigned))
	    return true;
	  if (isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt))
	    return true;
	  return isConcatMask(M, VT, VT.getSizeInBits() == 128);
	}

	/// getVShiftImm - Check whether Op (looking through any BITCASTs) is a
	/// build_vector usable as the immediate operand of a vector shift, i.e. a
	/// constant splat whose splat size does not exceed ElementBits. On success
	/// the sign-extended splat value is stored in Cnt and true is returned.
	static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
	  // Ignore bit_converts.
	  while (Op.getOpcode() == ISD::BITCAST)
	    Op = Op.getOperand(0);

	  auto *BuildVec = dyn_cast<BuildVectorSDNode>(Op.getNode());
	  if (!BuildVec)
	    return false;

	  APInt SplatBits, SplatUndef;
	  unsigned SplatBitSize;
	  bool HasAnyUndefs;
	  if (!BuildVec->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
	                                 HasAnyUndefs, ElementBits))
	    return false;
	  if (SplatBitSize > ElementBits)
	    return false;

	  Cnt = SplatBits.getSExtValue();
	  return true;
	}

	/// isVShiftLImm - Check whether Op is a valid build_vector immediate for a
	/// vector shift left, storing the shift count in Cnt. The accepted range is:
	/// 0 <= Value < ElementBits for a left shift; or
	/// 0 <= Value <= ElementBits for a long left shift.
	static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
	  assert(VT.isVector() && "vector shift count is not a vector type");
	  int64_t EltBits = VT.getScalarSizeInBits();
	  if (!getVShiftImm(Op, EltBits, Cnt))
	    return false;
	  if (Cnt < 0)
	    return false;
	  // A long shift allows one more than a regular shift.
	  int64_t Compared = isLong ? Cnt - 1 : Cnt;
	  return Compared < EltBits;
	}

	/// isVShiftRImm - Check whether Op is a valid build_vector immediate for a
	/// vector shift right, storing the shift count in Cnt. The accepted range is:
	/// 1 <= Value <= ElementBits for a right shift; or
	/// 1 <= Value <= ElementBits/2 for a narrow right shift.
	static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
	  assert(VT.isVector() && "vector shift count is not a vector type");
	  int64_t EltBits = VT.getScalarSizeInBits();
	  if (!getVShiftImm(Op, EltBits, Cnt))
	    return false;
	  if (Cnt < 1)
	    return false;
	  int64_t UpperBound = isNarrow ? EltBits / 2 : EltBits;
	  return Cnt <= UpperBound;
	}

	// Custom lowering for TRUNCATE.
	//
	// i1 results are lowered to `(x & 1) != 0`. Fixed-length vectors that use SVE
	// are forwarded to LowerFixedLengthVectorTruncateToSVE. For the remaining
	// fixed-length vectors this tries to form urhadd(OpA, OpB) from
	// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)).
	// That expression starts life as
	// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)); by the time we get
	// here the srl has become AArch64ISD::VLSHR and ((OpA + OpB + 1) >> 1) has
	// been rewritten to (OpB - (~OpA)). The sign-extended variant of the same
	// pattern produces srhadd(OpA, OpB). When nothing matches, Op is returned
	// unchanged.
	SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  EVT VT = Op.getValueType();

	  if (VT.getScalarType() == MVT::i1) {
	    // Lower i1 truncate to `(x & 1) != 0`.
	    SDLoc dl(Op);
	    EVT SrcVT = Op.getOperand(0).getValueType();
	    SDValue Zero = DAG.getConstant(0, dl, SrcVT);
	    SDValue One = DAG.getConstant(1, dl, SrcVT);
	    SDValue LowBit = DAG.getNode(ISD::AND, dl, SrcVT, Op.getOperand(0), One);
	    return DAG.getSetCC(dl, VT, LowBit, Zero, ISD::SETNE);
	  }

	  if (!VT.isVector())
	    return Op;
	  if (VT.isScalableVector())
	    return Op;

	  if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
	    return LowerFixedLengthVectorTruncateToSVE(Op, DAG);

	  // Since we are looking for a right shift by a constant value of 1 and we are
	  // operating on types at least 16 bits in length (sign/zero extended OpA and
	  // OpB, which are at least 8 bits), it follows that the truncate will always
	  // discard the shifted-in bit and therefore the right shift will be logical
	  // regardless of the signedness of OpA and OpB.
	  SDValue ShiftNode = Op.getOperand(0);
	  if (ShiftNode.getOpcode() != AArch64ISD::VLSHR)
	    return Op;

	  // The right shift must use an immediate value of 1.
	  if (ShiftNode.getConstantOperandVal(1) != 1)
	    return Op;

	  SDValue SubNode = ShiftNode->getOperand(0);
	  if (SubNode.getOpcode() != ISD::SUB)
	    return Op;

	  SDValue XorNode = SubNode.getOperand(1);
	  if (XorNode.getOpcode() != ISD::XOR)
	    return Op;

	  // Both inputs must be extended the same way: both zext or both sext.
	  SDValue ExtA = XorNode.getOperand(0);
	  SDValue ExtB = SubNode.getOperand(0);
	  const unsigned ExtAOpc = ExtA.getOpcode();
	  if (ExtAOpc != ExtB.getOpcode())
	    return Op;
	  if (ExtAOpc != ISD::ZERO_EXTEND && ExtAOpc != ISD::SIGN_EXTEND)
	    return Op;

	  // The truncate must bring the result back to the type of the original
	  // operands, OpA and OpB.
	  SDValue OpA = ExtA.getOperand(0);
	  SDValue OpB = ExtB.getOperand(0);
	  EVT OpAVT = OpA.getValueType();
	  assert(ExtA.getValueType() == ExtB.getValueType());
	  if (VT != OpAVT)
	    return Op;
	  if (OpAVT != OpB.getValueType())
	    return Op;

	  // The XOR's right hand side must be a splat of all ones.
	  uint64_t XorSplat;
	  if (!isAllConstantBuildVector(XorNode.getOperand(1), XorSplat))
	    return Op;

	  unsigned EltBits = VT.getScalarSizeInBits();
	  if (!APInt(EltBits, XorSplat).isAllOnesValue())
	    return Op;

	  SDLoc DL(Op);
	  unsigned HalvingAddOpc = (ExtAOpc == ISD::SIGN_EXTEND) ? AArch64ISD::SRHADD
	                                                         : AArch64ISD::URHADD;
	  return DAG.getNode(HalvingAddOpc, DL, VT, OpA, OpB);
	}

	// Custom lowering for vector SHL / SRA / SRL. Nodes whose shift amount is not
	// a vector are returned unchanged. Scalable vectors go through
	// LowerToPredicatedOp. For fixed-length vectors, a valid splat-constant
	// amount becomes VSHL / VASHR / VLSHR with an immediate; otherwise the shift
	// is expressed with the ushl / sshl intrinsics, negating the amount for
	// right shifts.
	SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
	                                                      SelectionDAG &DAG) const {
	  EVT VT = Op.getValueType();
	  SDLoc DL(Op);
	  int64_t Cnt;

	  if (!Op.getOperand(1).getValueType().isVector())
	    return Op;
	  unsigned EltSize = VT.getScalarSizeInBits();

	  const unsigned ShiftOpc = Op.getOpcode();

	  if (ShiftOpc == ISD::SHL) {
	    if (VT.isScalableVector())
	      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_MERGE_OP1);

	    // Left shift immediate
	    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
	      return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
	                         DAG.getConstant(Cnt, DL, MVT::i32));

	    // Left shift register
	    return DAG.getNode(
	        ISD::INTRINSIC_WO_CHAIN, DL, VT,
	        DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32),
	        Op.getOperand(0), Op.getOperand(1));
	  }

	  if (ShiftOpc != ISD::SRA && ShiftOpc != ISD::SRL)
	    llvm_unreachable("unexpected shift opcode");

	  const bool IsArithmetic = (ShiftOpc == ISD::SRA);

	  if (VT.isScalableVector()) {
	    unsigned PredOpc = IsArithmetic ? AArch64ISD::SRA_MERGE_OP1
	                                    : AArch64ISD::SRL_MERGE_OP1;
	    return LowerToPredicatedOp(Op, DAG, PredOpc);
	  }

	  // Right shift immediate
	  if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
	    unsigned ImmOpc = IsArithmetic ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
	    return DAG.getNode(ImmOpc, DL, VT, Op.getOperand(0),
	                       DAG.getConstant(Cnt, DL, MVT::i32));
	  }

	  // Right shift register. Note, there is not a shift right register
	  // instruction, but the shift left register instruction takes a signed
	  // value, where negative numbers specify a right shift.
	  unsigned IntrinsicOpc = IsArithmetic ? Intrinsic::aarch64_neon_sshl
	                                       : Intrinsic::aarch64_neon_ushl;
	  // negate the shift amount
	  SDValue NegatedAmt = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
	  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
	                     DAG.getConstant(IntrinsicOpc, DL, MVT::i32),
	                     Op.getOperand(0), NegatedAmt);
	}

	/// Emit one AArch64 vector comparison node of type \p VT for \p LHS and
	/// \p RHS under condition \p CC.
	///
	/// If \p RHS is a build-vector that resolves to an all-zero constant, the
	/// compare-against-zero node variants are used where one exists. Conditions
	/// without a direct node are formed by swapping the operands of the opposite
	/// comparison, or (for NE) by inverting an equality comparison.
	///
	/// \returns the comparison node, or an empty SDValue when \p CC is not
	/// handled. For floating-point LT this includes the case where \p NoNans is
	/// false.
	static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
	                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
	                                    const SDLoc &dl, SelectionDAG &DAG) {
	  const EVT OperandVT = LHS.getValueType();
	  assert(VT.getSizeInBits() == OperandVT.getSizeInBits() &&
	         "function only supposed to emit natural comparisons");

	  // Work out whether the right-hand side is a constant zero vector.
	  BuildVectorSDNode *RHSBuildVec = dyn_cast<BuildVectorSDNode>(RHS.getNode());
	  APInt ConstantBits(VT.getSizeInBits(), 0);
	  APInt UndefinedBits(VT.getSizeInBits(), 0);
	  const bool RHSIsConstant =
	      RHSBuildVec &&
	      resolveBuildVector(RHSBuildVec, ConstantBits, UndefinedBits);
	  const bool RHSIsZero = RHSIsConstant && (ConstantBits == 0);

	  // Use the unary compare-with-zero node when RHS is zero, otherwise the
	  // binary node with the operands in the order given.
	  auto CompareOrZero = [&](unsigned ZeroOpc, unsigned Opc, SDValue A,
	                           SDValue B) -> SDValue {
	    if (RHSIsZero)
	      return DAG.getNode(ZeroOpc, dl, VT, LHS);
	    return DAG.getNode(Opc, dl, VT, A, B);
	  };

	  if (OperandVT.getVectorElementType().isFloatingPoint()) {
	    switch (CC) {
	    case AArch64CC::EQ:
	      return CompareOrZero(AArch64ISD::FCMEQz, AArch64ISD::FCMEQ, LHS, RHS);
	    case AArch64CC::NE:
	      // There is no "not equal" node: invert the equality result.
	      return DAG.getNode(
	          AArch64ISD::NOT, dl, VT,
	          CompareOrZero(AArch64ISD::FCMEQz, AArch64ISD::FCMEQ, LHS, RHS));
	    case AArch64CC::GE:
	      return CompareOrZero(AArch64ISD::FCMGEz, AArch64ISD::FCMGE, LHS, RHS);
	    case AArch64CC::GT:
	      return CompareOrZero(AArch64ISD::FCMGTz, AArch64ISD::FCMGT, LHS, RHS);
	    case AArch64CC::LS:
	      return CompareOrZero(AArch64ISD::FCMLEz, AArch64ISD::FCMGE, RHS, LHS);
	    case AArch64CC::LT:
	      if (!NoNans)
	        return SDValue();
	      // If we ignore NaNs then we can use the MI implementation.
	      LLVM_FALLTHROUGH;
	    case AArch64CC::MI:
	      return CompareOrZero(AArch64ISD::FCMLTz, AArch64ISD::FCMGT, RHS, LHS);
	    default:
	      return SDValue();
	    }
	  }

	  // Integer comparisons.
	  switch (CC) {
	  case AArch64CC::EQ:
	    return CompareOrZero(AArch64ISD::CMEQz, AArch64ISD::CMEQ, LHS, RHS);
	  case AArch64CC::NE:
	    return DAG.getNode(
	        AArch64ISD::NOT, dl, VT,
	        CompareOrZero(AArch64ISD::CMEQz, AArch64ISD::CMEQ, LHS, RHS));
	  case AArch64CC::GE:
	    return CompareOrZero(AArch64ISD::CMGEz, AArch64ISD::CMGE, LHS, RHS);
	  case AArch64CC::GT:
	    return CompareOrZero(AArch64ISD::CMGTz, AArch64ISD::CMGT, LHS, RHS);
	  case AArch64CC::LE:
	    return CompareOrZero(AArch64ISD::CMLEz, AArch64ISD::CMGE, RHS, LHS);
	  case AArch64CC::LT:
	    return CompareOrZero(AArch64ISD::CMLTz, AArch64ISD::CMGT, RHS, LHS);
	  // The unsigned conditions never use a compare-with-zero variant here.
	  case AArch64CC::LS:
	    return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
	  case AArch64CC::LO:
	    return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
	  case AArch64CC::HI:
	    return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
	  case AArch64CC::HS:
	    return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
	  default:
	    return SDValue();
	  }
	}

	/// Custom-lower a vector SETCC node.
	///
	/// For scalable-vector results, floating-point compares are returned
	/// unchanged and all others are lowered to the predicated SETCC_MERGE_ZERO
	/// node. Fixed-width integer compares map onto a single vector comparison.
	/// Fixed-width floating-point compares may need two comparisons OR'ed
	/// together and/or a final inversion, as reported by
	/// changeVectorFPCCToAArch64CC.
	///
	/// \returns the lowered value, or an empty SDValue when no comparison could
	/// be emitted (including f16 vectors other than 4 elements when full FP16
	/// support is not available).
	SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  if (Op.getValueType().isScalableVector()) {
	    if (Op.getOperand(0).getValueType().isFloatingPoint())
	      return Op;
	    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
	  }

	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  // The comparison itself is emitted with integer lanes of the operand width.
	  EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
	  SDLoc dl(Op);

	  if (LHS.getValueType().getVectorElementType().isInteger()) {
	    assert(LHS.getValueType() == RHS.getValueType());
	    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
	    SDValue Cmp =
	        EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
	    return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
	  }

	  const bool FullFP16 =
	      static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();

	  // Make v4f16 (only) fcmp operations utilise vector instructions
	  // v8f16 support will be a little more complicated
	  if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
	    if (LHS.getValueType().getVectorNumElements() == 4) {
	      // Widen both operands to v4f32 and compare there.
	      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
	      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
	      SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
	      DAG.ReplaceAllUsesWith(Op, NewSetcc);
	      CmpVT = MVT::v4i32;
	    } else
	      return SDValue();
	  }

	  assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
	         LHS.getValueType().getVectorElementType() != MVT::f128);

	  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
	  // clean. Some of them require two branches to implement.
	  AArch64CC::CondCode CC1, CC2;
	  bool ShouldInvert;
	  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);

	  bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
	  SDValue Cmp =
	      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
	  if (!Cmp.getNode())
	    return SDValue();

	  // A second condition (anything other than AL) is OR'ed into the first.
	  if (CC2 != AArch64CC::AL) {
	    SDValue Cmp2 =
	        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
	    if (!Cmp2.getNode())
	      return SDValue();

	    Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
	  }

	  Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());

	  if (ShouldInvert)
	    Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());

	  return Cmp;
	}

	/// Build the node \p Op over the vector operand of \p ScalarOp (its operand
	/// 0), then extract element 0 of that result as a value of \p ScalarOp's
	/// type.
	static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
	                                  SelectionDAG &DAG) {
	  SDValue Vec = ScalarOp.getOperand(0);
	  SDValue Reduced = DAG.getNode(Op, DL, Vec.getSimpleValueType(), Vec);
	  SDValue LaneZero = DAG.getConstant(0, DL, MVT::i64);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(),
	                     Reduced, LaneZero);
	}

	/// Lower a VECREDUCE_* node.
	///
	/// Integer add/min/max reductions are emitted through getReductionSDNode
	/// with the matching AArch64ISD across-lanes opcode. FMAX/FMIN are emitted as
	/// the aarch64_neon_fmaxnmv / aarch64_neon_fminnmv intrinsics and assert that
	/// the node carries the no-NaNs flag. Any other opcode is unreachable.
	SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  SDLoc dl(Op);

	  // Emit a chainless intrinsic call taking the reduced vector as argument.
	  auto EmitFPIntrinsic = [&](unsigned IntrinsicID) -> SDValue {
	    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
	                       DAG.getConstant(IntrinsicID, dl, MVT::i32),
	                       Op.getOperand(0));
	  };

	  unsigned AcrossLanesOpc;
	  switch (Op.getOpcode()) {
	  case ISD::VECREDUCE_FMAX:
	    assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
	    return EmitFPIntrinsic(Intrinsic::aarch64_neon_fmaxnmv);
	  case ISD::VECREDUCE_FMIN:
	    assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
	    return EmitFPIntrinsic(Intrinsic::aarch64_neon_fminnmv);
	  case ISD::VECREDUCE_ADD:
	    AcrossLanesOpc = AArch64ISD::UADDV;
	    break;
	  case ISD::VECREDUCE_SMAX:
	    AcrossLanesOpc = AArch64ISD::SMAXV;
	    break;
	  case ISD::VECREDUCE_SMIN:
	    AcrossLanesOpc = AArch64ISD::SMINV;
	    break;
	  case ISD::VECREDUCE_UMAX:
	    AcrossLanesOpc = AArch64ISD::UMAXV;
	    break;
	  case ISD::VECREDUCE_UMIN:
	    AcrossLanesOpc = AArch64ISD::UMINV;
	    break;
	  default:
	    llvm_unreachable("Unhandled reduction");
	  }

	  return getReductionSDNode(AcrossLanesOpc, dl, Op, DAG);
	}

	/// Rewrite an atomic load-sub as an atomic load-add of the negated operand.
	///
	/// \returns an empty SDValue when the subtarget lacks LSE; otherwise the new
	/// ATOMIC_LOAD_ADD node using the original chain, pointer and memory operand.
	SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
	  if (!ST.hasLSE())
	    return SDValue();

	  // LSE has an atomic load-add instruction, but not a load-sub.
	  SDLoc dl(Op);
	  const MVT VT = Op.getSimpleValueType();
	  AtomicSDNode *Atomic = cast<AtomicSDNode>(Op.getNode());

	  // Negate the operand: 0 - RHS.
	  SDValue Zero = DAG.getConstant(0, dl, VT);
	  SDValue Negated = DAG.getNode(ISD::SUB, dl, VT, Zero, Op.getOperand(2));

	  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, Atomic->getMemoryVT(),
	                       Op.getOperand(0), Op.getOperand(1), Negated,
	                       Atomic->getMemOperand());
	}

	/// Rewrite an atomic load-and as an atomic load-clear of the inverted operand.
	///
	/// \returns an empty SDValue when the subtarget lacks LSE; otherwise the new
	/// ATOMIC_LOAD_CLR node using the original chain, pointer and memory operand.
	SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
	  if (!ST.hasLSE())
	    return SDValue();

	  // LSE has an atomic load-clear instruction, but not a load-and.
	  SDLoc dl(Op);
	  const MVT VT = Op.getSimpleValueType();
	  AtomicSDNode *Atomic = cast<AtomicSDNode>(Op.getNode());

	  // Invert the operand: all-ones XOR RHS.
	  SDValue AllOnes = DAG.getConstant(-1ULL, dl, VT);
	  SDValue Inverted = DAG.getNode(ISD::XOR, dl, VT, AllOnes, Op.getOperand(2));

	  return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, Atomic->getMemoryVT(),
	                       Op.getOperand(0), Op.getOperand(1), Inverted,
	                       Atomic->getMemOperand());
	}

	/// Emit the call to `__chkstk` used for a Windows dynamic stack allocation.
	///
	/// \p Size is an in/out parameter: it is shifted right by 4 before being
	/// copied into X15 for the call, and on return it holds that shifted value
	/// shifted left by 4 again. \p Chain is the incoming chain.
	///
	/// \returns the first result of the emitted call node (its chain).
	SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
	SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
	SDLoc dl(Op);
	EVT PtrVT = getPointerTy(DAG.getDataLayout());
	SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);

	// Register mask for the stack-probe call; updated in place when the
	// subtarget reports a custom calling convention.
	const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
	if (Subtarget->hasCustomCallingConv())
	TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);

	// Pass Size >> 4 in X15.
	Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
	DAG.getConstant(4, dl, MVT::i64));
	Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
	// The call is glued to the X15 copy via Chain.getValue(1).
	Chain =
	DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
	Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
	DAG.getRegisterMask(Mask), Chain.getValue(1));
	// To match the actual intent better, we should read the output from X15 here
	// again (instead of potentially spilling it to the stack), but rereading Size
	// from X15 here doesn't work at -O0, since it thinks that X15 is undefined
	// here.

	// Scale the shifted size back up by 16 for the caller.
	Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
	DAG.getConstant(4, dl, MVT::i64));
	return Chain;
	}

	/// Lower DYNAMIC_STACKALLOC; asserts that the target is Windows.
	///
	/// Operand 0 is the chain, operand 1 the size and operand 2 a constant
	/// alignment. In both paths the new stack pointer is computed as SP - Size,
	/// masked with the negated alignment when one is given, and written back to
	/// SP. When the function carries the "no-stack-arg-probe" attribute nothing
	/// else is emitted; otherwise the sequence is wrapped in
	/// CALLSEQ_START/CALLSEQ_END and preceded by the stack-probe call from
	/// LowerWindowsDYNAMIC_STACKALLOC, which may rewrite Size.
	///
	/// \returns merged values {new SP, chain}.
	SDValue
	AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
	SelectionDAG &DAG) const {
	assert(Subtarget->isTargetWindows() &&
	"Only Windows alloca probing supported");
	SDLoc dl(Op);
	// Get the inputs.
	SDNode *Node = Op.getNode();
	SDValue Chain = Op.getOperand(0);
	SDValue Size = Op.getOperand(1);
	MaybeAlign Align =
	cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
	EVT VT = Node->getValueType(0);

	// Probing disabled for this function: just move SP.
	if (DAG.getMachineFunction().getFunction().hasFnAttribute(
	"no-stack-arg-probe")) {
	SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
	Chain = SP.getValue(1);
	SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
	// Mask SP with the negated alignment value.
	if (Align)
	SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
	DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
	Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
	SDValue Ops[2] = {SP, Chain};
	return DAG.getMergeValues(Ops, dl);
	}

	Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

	// Emits the probe call; Size is passed by reference and may be updated.
	Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);

	SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
	Chain = SP.getValue(1);
	SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
	// Mask SP with the negated alignment value.
	if (Align)
	SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
	DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
	Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);

	Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
	DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

	SDValue Ops[2] = {SP, Chain};
	return DAG.getMergeValues(Ops, dl);
	}

	/// Lower a VSCALE node whose result type is not i64: build the i64 VSCALE
	/// with the multiplier sign-extended to 64 bits, then zero-extend or truncate
	/// the result to the requested type.
	SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  const EVT ResultVT = Op.getValueType();
	  assert(ResultVT != MVT::i64 && "Expected illegal VSCALE node");

	  SDLoc DL(Op);
	  APInt Multiplier = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
	  SDValue VScale64 = DAG.getVScale(DL, MVT::i64, Multiplier.sextOrSelf(64));
	  return DAG.getZExtOrTrunc(VScale64, DL, ResultVT);
	}

	/// Fill in \p Info for an `aarch64_sve_st<N>` intrinsic call \p CI.
	///
	/// The memory type is a vector with the scalar type of the first argument
	/// and \p NumVecs times its element count; the pointer is the last call
	/// argument. Always returns true.
	template <unsigned NumVecs>
	static bool setInfoSVEStN(AArch64TargetLowering::IntrinsicInfo &Info,
	                          const CallInst &CI) {
	  Info.opc = ISD::INTRINSIC_VOID;

	  // The first vector argument determines the element type and count.
	  const EVT VecVT = EVT::getEVT(CI.getArgOperand(0)->getType());
	  ElementCount SingleEC = VecVT.getVectorElementCount();
	#ifndef NDEBUG
	  // Check the assumption that all input vectors are the same type.
	  for (unsigned ArgIdx = 0; ArgIdx < NumVecs; ++ArgIdx)
	    assert(VecVT == EVT::getEVT(CI.getArgOperand(ArgIdx)->getType()) &&
	           "Invalid type.");
	#endif

	  // memVT is `NumVecs * VT`.
	  Info.memVT = EVT::getVectorVT(CI.getType()->getContext(),
	                                VecVT.getScalarType(), SingleEC * NumVecs);
	  Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1);
	  Info.offset = 0;
	  Info.align.reset();
	  Info.flags = MachineMemOperand::MOStore;
	  return true;
	}

	/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
	/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
	/// specified in the intrinsic calls.
	///
	/// Fills \p Info (opcode, memory type, pointer operand, offset, alignment and
	/// flags) for the SVE stN, NEON ldN/stN, load/store-exclusive and SVE
	/// non-temporal intrinsics handled below.
	///
	/// \returns true when \p Intrinsic was recognised and \p Info was filled in,
	/// false otherwise.
	bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	                                               const CallInst &I,
	                                               MachineFunction &MF,
	                                               unsigned Intrinsic) const {
	  auto &DL = I.getModule()->getDataLayout();
	  switch (Intrinsic) {
	  case Intrinsic::aarch64_sve_st2:
	    return setInfoSVEStN<2>(Info, I);
	  case Intrinsic::aarch64_sve_st3:
	    return setInfoSVEStN<3>(Info, I);
	  case Intrinsic::aarch64_sve_st4:
	    return setInfoSVEStN<4>(Info, I);
	  case Intrinsic::aarch64_neon_ld2:
	  case Intrinsic::aarch64_neon_ld3:
	  case Intrinsic::aarch64_neon_ld4:
	  case Intrinsic::aarch64_neon_ld1x2:
	  case Intrinsic::aarch64_neon_ld1x3:
	  case Intrinsic::aarch64_neon_ld1x4:
	  case Intrinsic::aarch64_neon_ld2lane:
	  case Intrinsic::aarch64_neon_ld3lane:
	  case Intrinsic::aarch64_neon_ld4lane:
	  case Intrinsic::aarch64_neon_ld2r:
	  case Intrinsic::aarch64_neon_ld3r:
	  case Intrinsic::aarch64_neon_ld4r: {
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    // Conservatively set memVT to the entire set of vectors loaded.
	    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
	    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
	    // The pointer is the last call argument.
	    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
	    Info.offset = 0;
	    Info.align.reset();
	    // volatile loads with NEON intrinsics not supported
	    Info.flags = MachineMemOperand::MOLoad;
	    return true;
	  }
	  case Intrinsic::aarch64_neon_st2:
	  case Intrinsic::aarch64_neon_st3:
	  case Intrinsic::aarch64_neon_st4:
	  case Intrinsic::aarch64_neon_st1x2:
	  case Intrinsic::aarch64_neon_st1x3:
	  case Intrinsic::aarch64_neon_st1x4:
	  case Intrinsic::aarch64_neon_st2lane:
	  case Intrinsic::aarch64_neon_st3lane:
	  case Intrinsic::aarch64_neon_st4lane: {
	    Info.opc = ISD::INTRINSIC_VOID;
	    // Conservatively set memVT to the entire set of vectors stored.
	    // Only the leading run of vector-typed arguments is counted.
	    unsigned NumElts = 0;
	    for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
	      Type *ArgTy = I.getArgOperand(ArgI)->getType();
	      if (!ArgTy->isVectorTy())
	        break;
	      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
	    }
	    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
	    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
	    Info.offset = 0;
	    Info.align.reset();
	    // volatile stores with NEON intrinsics not supported
	    Info.flags = MachineMemOperand::MOStore;
	    return true;
	  }
	  case Intrinsic::aarch64_ldaxr:
	  case Intrinsic::aarch64_ldxr: {
	    // Memory type and alignment come from the pointee type of argument 0.
	    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::getVT(PtrTy->getElementType());
	    Info.ptrVal = I.getArgOperand(0);
	    Info.offset = 0;
	    Info.align = DL.getABITypeAlign(PtrTy->getElementType());
	    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
	    return true;
	  }
	  case Intrinsic::aarch64_stlxr:
	  case Intrinsic::aarch64_stxr: {
	    // Memory type and alignment come from the pointee type of argument 1.
	    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::getVT(PtrTy->getElementType());
	    Info.ptrVal = I.getArgOperand(1);
	    Info.offset = 0;
	    Info.align = DL.getABITypeAlign(PtrTy->getElementType());
	    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
	    return true;
	  }
	  case Intrinsic::aarch64_ldaxp:
	  case Intrinsic::aarch64_ldxp:
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::i128;
	    Info.ptrVal = I.getArgOperand(0);
	    Info.offset = 0;
	    Info.align = Align(16);
	    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
	    return true;
	  case Intrinsic::aarch64_stlxp:
	  case Intrinsic::aarch64_stxp:
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::i128;
	    Info.ptrVal = I.getArgOperand(2);
	    Info.offset = 0;
	    Info.align = Align(16);
	    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
	    return true;
	  case Intrinsic::aarch64_sve_ldnt1: {
	    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::getVT(I.getType());
	    Info.ptrVal = I.getArgOperand(1);
	    Info.offset = 0;
	    Info.align = DL.getABITypeAlign(PtrTy->getElementType());
	    Info.flags = MachineMemOperand::MOLoad;
	    if (Intrinsic == Intrinsic::aarch64_sve_ldnt1)
	      Info.flags |= MachineMemOperand::MONonTemporal;
	    return true;
	  }
	  case Intrinsic::aarch64_sve_stnt1: {
	    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::getVT(I.getOperand(0)->getType());
	    Info.ptrVal = I.getArgOperand(2);
	    Info.offset = 0;
	    Info.align = DL.getABITypeAlign(PtrTy->getElementType());
	    Info.flags = MachineMemOperand::MOStore;
	    if (Intrinsic == Intrinsic::aarch64_sve_stnt1)
	      Info.flags |= MachineMemOperand::MONonTemporal;
	    return true;
	  }
	  default:
	    break;
	  }

	  return false;
	}

	/// Decide whether narrowing \p Load to \p NewVT is worthwhile.
	///
	/// Defers first to the TargetLoweringBase implementation. Extending loads
	/// are then always accepted. A non-extending load is rejected only when its
	/// base pointer is `add x, (shl y, C)` with a single-use shift whose constant
	/// amount equals log2 of the loaded size in bytes.
	bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
	                                                  ISD::LoadExtType ExtTy,
	                                                  EVT NewVT) const {
	  // TODO: This may be worth removing. Check regression tests for diffs.
	  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
	    return false;

	  // If we're reducing the load width in order to avoid having to use an extra
	  // instruction to do extension then it's probably a good idea.
	  if (ExtTy != ISD::NON_EXTLOAD)
	    return true;

	  // Don't reduce load width if it would prevent us from combining a shift into
	  // the offset.
	  MemSDNode *MemNode = dyn_cast<MemSDNode>(Load);
	  assert(MemNode);
	  const SDValue &Addr = MemNode->getBasePtr();
	  if (Addr.getOpcode() != ISD::ADD)
	    return true;

	  const SDValue &ScaledIndex = Addr.getOperand(1);
	  if (ScaledIndex.getOpcode() != ISD::SHL)
	    return true;
	  if (!ScaledIndex.hasOneUse())
	    return true;
	  if (ScaledIndex.getOperand(1).getOpcode() != ISD::Constant)
	    return true;

	  // The shift can be combined if it matches the size of the value being
	  // loaded (and so reducing the width would make it not match).
	  const uint64_t ShiftAmount = ScaledIndex.getConstantOperandVal(1);
	  const uint64_t LoadBytes = MemNode->getMemoryVT().getSizeInBits() / 8;
	  return ShiftAmount != Log2_32(LoadBytes);
	}

	// Truncations from 64-bit GPR to 32-bit GPR is free.
	bool AArch64TargetLowering::isTruncateFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 > NumBits2;
	}
	// Value-type overload: returns true when both types are scalar integers and
	// VT1 is strictly wider than VT2. Vector types are never free.
	bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
	  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
	    return false;
	  unsigned NumBits1 = VT1.getSizeInBits();
	  unsigned NumBits2 = VT2.getSizeInBits();
	  return NumBits1 > NumBits2;
	}

	/// Check if it is profitable to hoist instruction in then/else to if.
	/// Not profitable if I and its user can form a FMA instruction
	/// because we prefer FMSUB/FMADD.
	///
	/// \returns true (hoisting is fine) unless \p I is an FMul with exactly one
	/// use, that use is an FAdd or FSub, FMA is faster than separate mul+add and
	/// legal or custom for the type, and fusion is permitted by the target
	/// options (fast FP-op fusion or unsafe FP math).
	bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
	  if (I->getOpcode() != Instruction::FMul)
	    return true;

	  if (!I->hasOneUse())
	    return true;

	  Instruction *User = I->user_back();

	  if (User &&
	      !(User->getOpcode() == Instruction::FSub ||
	        User->getOpcode() == Instruction::FAdd))
	    return true;

	  const TargetOptions &Options = getTargetMachine().Options;
	  const Function *F = I->getFunction();
	  const DataLayout &DL = F->getParent()->getDataLayout();
	  Type *Ty = User->getOperand(0)->getType();

	  return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
	           isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
	           (Options.AllowFPOpFusion == FPOpFusion::Fast ||
	            Options.UnsafeFPMath));
	}

	// All 32-bit GPR operations implicitly zero the high-half of the corresponding
	// 64-bit GPR.
	bool AArch64TargetLowering::isZExtFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 == 32 && NumBits2 == 64;
	}
	// Value-type overload: returns true only for a scalar integer extension from
	// exactly 32 to exactly 64 bits. Vector types are never free.
	bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
	  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
	    return false;
	  unsigned NumBits1 = VT1.getSizeInBits();
	  unsigned NumBits2 = VT2.getSizeInBits();
	  return NumBits1 == 32 && NumBits2 == 64;
	}

	/// Return true if zero-extending \p Val to \p VT2 is free: either the
	/// type-based check already says so, or \p Val is a LOAD of a simple scalar
	/// integer of at most 32 bits being extended to a simple scalar integer.
	bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
	  const EVT SrcVT = Val.getValueType();
	  if (isZExtFree(SrcVT, VT2))
	    return true;

	  if (Val.getOpcode() != ISD::LOAD)
	    return false;

	  auto IsSimpleScalarInt = [](EVT VT) {
	    return VT.isSimple() && !VT.isVector() && VT.isInteger();
	  };

	  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
	  return IsSimpleScalarInt(SrcVT) && IsSimpleScalarInt(VT2) &&
	         SrcVT.getSizeInBits() <= 32;
	}

	/// Return true if the extension \p Ext is free for every one of its uses.
	///
	/// FP extensions and vector-typed extensions are never free. Otherwise each
	/// use must be a shift-left by a constant, a GEP index whose implied scaling
	/// shift is between 1 and 4, or a truncation back to the extension's source
	/// type; any other user makes the extension not free.
	bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
	  if (isa<FPExtInst>(Ext))
	    return false;

	  // Vector types are not free.
	  if (Ext->getType()->isVectorTy())
	    return false;

	  for (const Use &U : Ext->uses()) {
	    // The extension is free if we can fold it with a left shift in an
	    // addressing mode or an arithmetic operation: add, sub, and cmp.

	    // Is there a shift?
	    const Instruction *Instr = cast<Instruction>(U.getUser());

	    // Is this a constant shift?
	    switch (Instr->getOpcode()) {
	    case Instruction::Shl:
	      if (!isa<ConstantInt>(Instr->getOperand(1)))
	        return false;
	      break;
	    case Instruction::GetElementPtr: {
	      gep_type_iterator GTI = gep_type_begin(Instr);
	      auto &DL = Ext->getModule()->getDataLayout();
	      // Step to the type indexed by this use's operand position.
	      std::advance(GTI, U.getOperandNo()-1);
	      Type *IdxTy = GTI.getIndexedType();
	      // This extension will end up with a shift because of the scaling factor.
	      // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
	      // Get the shift amount based on the scaling factor:
	      // log2(sizeof(IdxTy)) - log2(8).
	      uint64_t ShiftAmt =
	          countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3;
	      // Is the constant foldable in the shift of the addressing mode?
	      // I.e., shift amount is between 1 and 4 inclusive.
	      if (ShiftAmt == 0 || ShiftAmt > 4)
	        return false;
	      break;
	    }
	    case Instruction::Trunc:
	      // Check if this is a noop.
	      // trunc(sext ty1 to ty2) to ty1.
	      if (Instr->getType() == Ext->getOperand(0)->getType())
	        continue;
	      LLVM_FALLTHROUGH;
	    default:
	      return false;
	    }

	    // At this point we can use the bfm family, so this extension is free
	    // for that use.
	  }
	  return true;
	}

	/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
	/// or upper half of the vector elements.
	static bool areExtractShuffleVectors(Value Op1, Value Op2) {
	auto areTypesHalfed = [](Value FullV, Value HalfV) {
	auto *FullTy = FullV->getType();
	auto *HalfTy = HalfV->getType();
	return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
	2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
	};

	auto extractHalf = [](Value FullV, Value HalfV) {
	auto *FullVT = cast<FixedVectorType>(FullV->getType());
	auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
	return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
	};

	ArrayRef<int> M1, M2;
	Value S1Op1, S2Op1;
	if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) \|\|
	!match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
	return false;

	// Check that the operands are half as wide as the result and we extract
	// half of the elements of the input vectors.
	if (!areTypesHalfed(S1Op1, Op1) \|\| !areTypesHalfed(S2Op1, Op2) \|\|
	!extractHalf(S1Op1, Op1) \|\| !extractHalf(S2Op1, Op2))
	return false;

	// Check the mask extracts either the lower or upper half of vector
	// elements.
	int M1Start = -1;
	int M2Start = -1;
	int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
	if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) \|\|
	!ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) \|\|
	M1Start != M2Start \|\| (M1Start != 0 && M2Start != (NumElements / 2)))
	return false;

	return true;
	}

	/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
	/// of the vector elements.
	static bool areExtractExts(Value Ext1, Value Ext2) {
	auto areExtDoubled = [](Instruction *Ext) {
	return Ext->getType()->getScalarSizeInBits() ==
	2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
	};

	if (!match(Ext1, m_ZExtOrSExt(m_Value())) \|\|
	!match(Ext2, m_ZExtOrSExt(m_Value())) \|\|
	!areExtDoubled(cast<Instruction>(Ext1)) \|\|
	!areExtDoubled(cast<Instruction>(Ext2)))
	return false;

	return true;
	}

	/// Check if Op could be used with vmull_high_p64 intrinsic.
	///
	/// That is: \p Op is an extractelement with constant index 1 from a fixed
	/// vector of exactly two elements.
	static bool isOperandOfVmullHighP64(Value *Op) {
	  Value *SourceVec = nullptr;
	  ConstantInt *LaneIdx = nullptr;
	  if (!match(Op, m_ExtractElt(m_Value(SourceVec), m_ConstantInt(LaneIdx))))
	    return false;

	  if (!(LaneIdx->getValue() == 1))
	    return false;

	  auto *SourceTy = dyn_cast<FixedVectorType>(SourceVec->getType());
	  return SourceTy && SourceTy->getNumElements() == 2;
	}

	/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
	static bool areOperandsOfVmullHighP64(Value Op1, Value Op2) {
	return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
	}

	/// Check if sinking \p I's operands to I's basic block is profitable, because
	/// the operands can be folded into a target instruction, e.g.
	/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
	bool AArch64TargetLowering::shouldSinkOperands(
	Instruction I, SmallVectorImpl<Use > &Ops) const {
	if (!I->getType()->isVectorTy())
	return false;

	if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
	switch (II->getIntrinsicID()) {
	case Intrinsic::aarch64_neon_umull:
	if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
	return false;
	Ops.push_back(&II->getOperandUse(0));
	Ops.push_back(&II->getOperandUse(1));
	return true;

	case Intrinsic::aarch64_neon_pmull64:
	if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
	II->getArgOperand(1)))
	return false;
	Ops.push_back(&II->getArgOperandUse(0));
	Ops.push_back(&II->getArgOperandUse(1));
	return true;

	default:
	return false;
	}
	}

	switch (I->getOpcode()) {
	case Instruction::Sub:
	case Instruction::Add: {
	if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
	return false;

	// If the exts' operands extract either the lower or upper elements, we
	// can sink them too.
	auto Ext1 = cast<Instruction>(I->getOperand(0));
	auto Ext2 = cast<Instruction>(I->getOperand(1));
	if (areExtractShuffleVectors(Ext1, Ext2)) {
	Ops.push_back(&Ext1->getOperandUse(0));
	Ops.push_back(&Ext2->getOperandUse(0));
	}

	Ops.push_back(&I->getOperandUse(0));
	Ops.push_back(&I->getOperandUse(1));

	return true;
	}
	default:
	return false;
	}
	return false;
	}

	/// Return true if \p LoadedType can be loaded as a pair: a simple integer or
	/// floating-point type of 32 or 64 bits.
	///
	/// \p RequiredAligment is set to 1 whenever the type passes the
	/// simple/integer-or-FP check, even if the size check then fails; it is left
	/// untouched otherwise.
	bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
	                                          Align &RequiredAligment) const {
	  if (!LoadedType.isSimple() ||
	      (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
	    return false;
	  // Cyclone supports unaligned accesses.
	  RequiredAligment = Align(1);
	  unsigned NumBits = LoadedType.getSizeInBits();
	  return NumBits == 32 || NumBits == 64;
	}

	/// A helper function for determining the number of interleaved accesses we
	/// will generate when lowering accesses of the given type.
	///
	/// This is the size of \p VecTy in bits divided by 128, rounded up.
	unsigned
	AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
	                                                 const DataLayout &DL) const {
	  const uint64_t SizeInBits = DL.getTypeSizeInBits(VecTy);
	  const uint64_t RoundedUp = SizeInBits + 127;
	  return RoundedUp / 128;
	}

	/// Return target-specific memory-operand flags for \p I: MOStridedAccess when
	/// the subtarget's processor family is Falkor and \p I carries the
	/// FALKOR_STRIDED_ACCESS_MD metadata, MONone otherwise.
	MachineMemOperand::Flags
	AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
	  const bool IsFalkor =
	      Subtarget->getProcFamily() == AArch64Subtarget::Falkor;
	  if (IsFalkor && I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
	    return MOStridedAccess;

	  return MachineMemOperand::MONone;
	}

	/// Return true if \p VecTy can be used for an interleaved access: at least
	/// two elements, an element size of 8, 16, 32 or 64 bits, and a total size of
	/// 64 bits or a multiple of 128 bits.
	bool AArch64TargetLowering::isLegalInterleavedAccessType(
	    VectorType *VecTy, const DataLayout &DL) const {

	  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
	  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

	  // Ensure the number of vector elements is greater than 1.
	  if (cast<FixedVectorType>(VecTy)->getNumElements() < 2)
	    return false;

	  // Ensure the element type is legal.
	  if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
	    return false;

	  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
	  // 128 will be split into multiple interleaved accesses.
	  return VecSize == 64 || VecSize % 128 == 0;
	}

	/// Lower an interleaved load into a ldN intrinsic.
	///
	/// E.g. Lower an interleaved load (Factor = 2):
	/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
	/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
	/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
	///
	/// Into:
	/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
	/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
	/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
	bool AArch64TargetLowering::lowerInterleavedLoad(
	LoadInst LI, ArrayRef<ShuffleVectorInst > Shuffles,
	ArrayRef<unsigned> Indices, unsigned Factor) const {
	assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
	"Invalid interleave factor");
	assert(!Shuffles.empty() && "Empty shufflevector input");
	assert(Shuffles.size() == Indices.size() &&
	"Unmatched number of shufflevectors and indices");

	const DataLayout &DL = LI->getModule()->getDataLayout();

	VectorType *VTy = Shuffles[0]->getType();

	// Skip if we do not have NEON and skip illegal vector types. We can
	// "legalize" wide vector types into multiple interleaved accesses as long as
	// the vector types are divisible by 128.
	if (!Subtarget->hasNEON() \|\| !isLegalInterleavedAccessType(VTy, DL))
	return false;

	unsigned NumLoads = getNumInterleavedAccesses(VTy, DL);

	auto *FVTy = cast<FixedVectorType>(VTy);

	// A pointer vector can not be the return type of the ldN intrinsics. Need to
	// load integer vectors first and then convert to pointer vectors.
	Type *EltTy = FVTy->getElementType();
	if (EltTy->isPointerTy())
	FVTy =
	FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());

	IRBuilder<> Builder(LI);

	// The base address of the load.
	Value *BaseAddr = LI->getPointerOperand();

	if (NumLoads > 1) {
	// If we're going to generate more than one load, reset the sub-vector type
	// to something legal.
	FVTy = FixedVectorType::get(FVTy->getElementType(),
	FVTy->getNumElements() / NumLoads);

	// We will compute the pointer operand of each load from the original base
	// address using GEPs. Cast the base address to a pointer to the scalar
	// element type.
	BaseAddr = Builder.CreateBitCast(
	BaseAddr,
	FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
	}

	Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace());
	Type *Tys[2] = {FVTy, PtrTy};
	static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
	Intrinsic::aarch64_neon_ld3,
	Intrinsic::aarch64_neon_ld4};
	Function *LdNFunc =
	Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

	// Holds sub-vectors extracted from the load intrinsic return values. The
	// sub-vectors are associated with the shufflevector instructions they will
	// replace.
	DenseMap<ShuffleVectorInst , SmallVector<Value , 4>> SubVecs;

	for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {

	// If we're generating more than one load, compute the base address of
	// subsequent loads as an offset from the previous.
	if (LoadCount > 0)
	BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr,
	FVTy->getNumElements() * Factor);

	CallInst *LdN = Builder.CreateCall(
	LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");

	// Extract and store the sub-vectors returned by the load intrinsic.
	for (unsigned i = 0; i < Shuffles.size(); i++) {
	ShuffleVectorInst *SVI = Shuffles[i];
	unsigned Index = Indices[i];

	Value *SubVec = Builder.CreateExtractValue(LdN, Index);

	// Convert the integer vector to pointer vector if the element is pointer.
	if (EltTy->isPointerTy())
	SubVec = Builder.CreateIntToPtr(
	SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
	FVTy->getNumElements()));
	SubVecs[SVI].push_back(SubVec);
	}
	}

	// Replace uses of the shufflevector instructions with the sub-vectors
	// returned by the load intrinsic. If a shufflevector instruction is
	// associated with more than one sub-vector, those sub-vectors will be
	// concatenated into a single wide vector.
	for (ShuffleVectorInst *SVI : Shuffles) {
	auto &SubVec = SubVecs[SVI];
	auto *WideVec =
	SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
	SVI->replaceAllUsesWith(WideVec);
	}

	return true;
	}

	/// Lower an interleaved store into a stN intrinsic.
	///
	/// E.g. Lower an interleaved store (Factor = 3):
	/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
	/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
	/// store <12 x i32> %i.vec, <12 x i32>* %ptr
	///
	/// Into:
	/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
	/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
	/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
	/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
	///
	/// Note that the new shufflevectors will be removed and we'll only generate one
	/// st3 instruction in CodeGen.
	///
	/// Example for a more general valid mask (Factor 3). Lower:
	/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
	/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
	/// store <12 x i32> %i.vec, <12 x i32>* %ptr
	///
	/// Into:
	/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
	/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
	/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
	/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
	bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
	                                                  ShuffleVectorInst *SVI,
	                                                  unsigned Factor) const {
	  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
	         "Invalid interleave factor");

	  auto *VecTy = cast<FixedVectorType>(SVI->getType());
	  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

	  // Number of elements each of the Factor interleaved sub-vectors contributes.
	  unsigned LaneLen = VecTy->getNumElements() / Factor;
	  Type *EltTy = VecTy->getElementType();
	  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);

	  const DataLayout &DL = SI->getModule()->getDataLayout();

	  // Skip if we do not have NEON and skip illegal vector types. We can
	  // "legalize" wide vector types into multiple interleaved accesses as long as
	  // the vector types are divisible by 128.
	  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
	    return false;

	  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

	  Value *Op0 = SVI->getOperand(0);
	  Value *Op1 = SVI->getOperand(1);
	  IRBuilder<> Builder(SI);

	  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
	  // vectors to integer vectors.
	  if (EltTy->isPointerTy()) {
	    Type *IntTy = DL.getIntPtrType(EltTy);
	    unsigned NumOpElts =
	        cast<FixedVectorType>(Op0->getType())->getNumElements();

	    // Convert to the corresponding integer vector.
	    auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
	    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
	    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

	    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
	  }

	  // The base address of the store.
	  Value *BaseAddr = SI->getPointerOperand();

	  if (NumStores > 1) {
	    // If we're going to generate more than one store, reset the lane length
	    // and sub-vector type to something legal.
	    LaneLen /= NumStores;
	    SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);

	    // We will compute the pointer operand of each store from the original base
	    // address using GEPs. Cast the base address to a pointer to the scalar
	    // element type.
	    BaseAddr = Builder.CreateBitCast(
	        BaseAddr,
	        SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
	  }

	  auto Mask = SVI->getShuffleMask();

	  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
	  Type *Tys[2] = {SubVecTy, PtrTy};
	  static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
	                                             Intrinsic::aarch64_neon_st3,
	                                             Intrinsic::aarch64_neon_st4};
	  Function *StNFunc =
	      Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

	  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {

	    SmallVector<Value *, 5> Ops;

	    // Split the shufflevector operands into sub vectors for the new stN call.
	    for (unsigned i = 0; i < Factor; i++) {
	      // Mask position of lane 0 of sub-vector i within this store's slice.
	      unsigned IdxI = StoreCount * LaneLen * Factor + i;
	      if (Mask[IdxI] >= 0) {
	        Ops.push_back(Builder.CreateShuffleVector(
	            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
	      } else {
	        // Lane 0 is undef: look for the first defined lane j of the same
	        // sub-vector. Lane j lives at mask position IdxI + j * Factor, and
	        // the sub-vector starts j elements before the value found there.
	        // (Indexing with the StoreCount offset applied twice would read the
	        // wrong slot, or past the end of the mask, for StoreCount > 0.)
	        unsigned StartMask = 0;
	        for (unsigned j = 1; j < LaneLen; j++) {
	          unsigned IdxJ = IdxI + j * Factor;
	          if (Mask[IdxJ] >= 0) {
	            StartMask = Mask[IdxJ] - j;
	            break;
	          }
	        }
	        // Note: Filling undef gaps with random elements is ok, since
	        // those elements were being written anyway (with undefs).
	        // In the case of all undefs we're defaulting to using elems from 0
	        // Note: StartMask cannot be negative, it's checked in
	        // isReInterleaveMask
	        Ops.push_back(Builder.CreateShuffleVector(
	            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
	      }
	    }

	    // If we are generating more than one store, we compute the base address
	    // of subsequent stores as an offset from the previous.
	    if (StoreCount > 0)
	      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
	                                            BaseAddr, LaneLen * Factor);

	    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
	    Builder.CreateCall(StNFunc, Ops);
	  }
	  return true;
	}

	// Lower an SVE structured load intrinsic returning a tuple type to target
	// specific intrinsic taking the same input but returning a multi-result value
	// of the split tuple type.
	//
	// E.g. Lowering an LD3:
	//
	// call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
	// <vscale x 4 x i1> %pred,
	// <vscale x 4 x i32>* %addr)
	//
	// Output DAG:
	//
	// t0: ch = EntryToken
	// t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
	// t4: i64,ch = CopyFromReg t0, Register:i64 %1
	// t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
	// t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
	//
	// This is called pre-legalization to avoid widening/splitting issues with
	// non-power-of-2 tuple types used for LD3, such as nxv12i32.
	SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
	                                                  ArrayRef<SDValue> LoadOps,
	                                                  EVT VT, SelectionDAG &DAG,
	                                                  const SDLoc &DL) const {
	  assert(VT.isScalableVector() && "Can only lower scalable vectors");

	  // Select the tuple size N and the target node for this intrinsic. A switch
	  // is used instead of a mutable static map so that an unexpected intrinsic
	  // is diagnosed rather than silently inserting {0, 0} (and then dividing by
	  // zero below), and so that no shared state is written at run time.
	  unsigned N, Opcode;
	  switch (Intrinsic) {
	  case Intrinsic::aarch64_sve_ld2:
	    N = 2;
	    Opcode = AArch64ISD::SVE_LD2_MERGE_ZERO;
	    break;
	  case Intrinsic::aarch64_sve_ld3:
	    N = 3;
	    Opcode = AArch64ISD::SVE_LD3_MERGE_ZERO;
	    break;
	  case Intrinsic::aarch64_sve_ld4:
	    N = 4;
	    Opcode = AArch64ISD::SVE_LD4_MERGE_ZERO;
	    break;
	  default:
	    llvm_unreachable("unexpected SVE structured load intrinsic");
	  }

	  assert(VT.getVectorElementCount().Min % N == 0 &&
	         "invalid tuple vector type!");

	  // Type of each of the N vectors that make up the tuple.
	  EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
	                                 VT.getVectorElementCount() / N);
	  assert(isTypeLegal(SplitVT));

	  // The node produces N split vectors followed by the chain.
	  SmallVector<EVT, 5> VTs(N, SplitVT);
	  VTs.push_back(MVT::Other); // Chain
	  SDVTList NodeTys = DAG.getVTList(VTs);

	  // Concatenate the N vector results back into the tuple type the intrinsic
	  // was declared to return.
	  SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
	  SmallVector<SDValue, 4> PseudoLoadOps;
	  for (unsigned I = 0; I < N; ++I)
	    PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
	}

	EVT AArch64TargetLowering::getOptimalMemOpType(
	    const MemOp &Op, const AttributeList &FuncAttributes) const {
	  // NEON and FP types are only candidates when the function does not carry
	  // the NoImplicitFloat attribute.
	  const bool ImplicitFloatOK =
	      !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
	  const bool NEONUsable = Subtarget->hasNEON() && ImplicitFloatOK;
	  const bool FPUsable = Subtarget->hasFPARMv8() && ImplicitFloatOK;

	  // Memsets below 32 bytes are not worth AdvSIMD: materializing the v2i64
	  // value plus a store with a restrictive addressing mode costs as much as
	  // plain i64 stores.
	  const bool IsSmallMemset = Op.isMemset() && Op.size() < 32;

	  // A type is usable if the operation is aligned to AlignCheck, or if
	  // misaligned accesses of that type are both allowed and fast.
	  auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) -> bool {
	    if (!Op.isAligned(AlignCheck)) {
	      bool IsFast;
	      if (!allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
	                                          &IsFast))
	        return false;
	      return IsFast;
	    }
	    return true;
	  };

	  if (!IsSmallMemset) {
	    if (NEONUsable && Op.isMemset() &&
	        AlignmentIsAcceptable(MVT::v2i64, Align(16)))
	      return MVT::v2i64;
	    if (FPUsable && AlignmentIsAcceptable(MVT::f128, Align(16)))
	      return MVT::f128;
	  }
	  if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
	    return MVT::i64;
	  if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
	    return MVT::i32;
	  return MVT::Other;
	}

	LLT AArch64TargetLowering::getOptimalMemOpLLT(
	    const MemOp &Op, const AttributeList &FuncAttributes) const {
	  // NEON and FP types are only candidates when the function does not carry
	  // the NoImplicitFloat attribute.
	  const bool ImplicitFloatOK =
	      !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
	  const bool NEONUsable = Subtarget->hasNEON() && ImplicitFloatOK;
	  const bool FPUsable = Subtarget->hasFPARMv8() && ImplicitFloatOK;

	  // Memsets below 32 bytes are not worth AdvSIMD: materializing the v2i64
	  // value plus a store with a restrictive addressing mode costs as much as
	  // plain i64 stores.
	  const bool IsSmallMemset = Op.isMemset() && Op.size() < 32;

	  // A type is usable if the operation is aligned to AlignCheck, or if
	  // misaligned accesses of that type are both allowed and fast.
	  auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) -> bool {
	    if (!Op.isAligned(AlignCheck)) {
	      bool IsFast;
	      if (!allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
	                                          &IsFast))
	        return false;
	      return IsFast;
	    }
	    return true;
	  };

	  if (!IsSmallMemset) {
	    if (NEONUsable && Op.isMemset() &&
	        AlignmentIsAcceptable(MVT::v2i64, Align(16)))
	      return LLT::vector(2, 64);
	    if (FPUsable && AlignmentIsAcceptable(MVT::f128, Align(16)))
	      return LLT::scalar(128);
	  }
	  if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
	    return LLT::scalar(64);
	  if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
	    return LLT::scalar(32);
	  return LLT();
	}

	// An add immediate is legal when its magnitude fits in 12 bits, either as-is
	// or shifted left by 12.
	bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
	  // The magnitude of INT64_MIN is not representable, so reject it before
	  // calling std::abs.
	  if (Immed == std::numeric_limits<int64_t>::min()) {
	    LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
	                      << ": avoid UB for INT64_MIN\n");
	    return false;
	  }
	  // ADD and SUB share an encoding, so only the magnitude matters.
	  Immed = std::abs(Immed);
	  const bool FitsUnshifted = (Immed >> 12) == 0;
	  const bool FitsShiftedBy12 = (Immed & 0xfff) == 0 && (Immed >> 24) == 0;
	  const bool IsLegal = FitsUnshifted || FitsShiftedBy12;
	  LLVM_DEBUG(dbgs() << "Is " << Immed
	                    << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
	  return IsLegal;
	}

	// Integer compares are emitted as ADDS/SUBS, so a compare immediate is legal
	// exactly when it is a legal add immediate.
	bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
	  const bool LegalAsAddImm = isLegalAddImmediate(Immed);
	  return LegalAsAddImm;
	}

	/// isLegalAddressingMode - Decide whether the addressing mode described by
	/// AM can be used directly by a load/store of type Ty on this target.
	bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
	                                                  const AddrMode &AM, Type *Ty,
	                                                  unsigned AS, Instruction *I) const {
	  // The five basic AArch64 addressing modes are:
	  //   reg
	  //   reg + 9-bit signed offset
	  //   reg + SIZE_IN_BYTES * 12-bit unsigned offset
	  //   reg1 + reg2
	  //   reg + SIZE_IN_BYTES * reg

	  // A global can never serve as the base.
	  if (AM.BaseGV)
	    return false;

	  // reg + reg + imm cannot be encoded.
	  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
	    return false;

	  // FIXME: Update this method to support scalable addressing modes.
	  if (isa<ScalableVectorType>(Ty))
	    return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;

	  // Access size in bytes; left at 0 for unsized types and for types whose
	  // bit width is not a power of two.
	  uint64_t AccessBytes = 0;
	  if (Ty->isSized()) {
	    const uint64_t AccessBits = DL.getTypeSizeInBits(Ty);
	    AccessBytes = isPowerOf2_64(AccessBits) ? AccessBits / 8 : 0;
	  }

	  // Register-offset forms: reg1 + reg2 and reg1 + SIZE_IN_BYTES * reg2.
	  if (AM.Scale)
	    return AM.Scale == 1 ||
	           (AM.Scale > 0 && (uint64_t)AM.Scale == AccessBytes);

	  // Immediate-offset forms: reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12.
	  const int64_t Offset = AM.BaseOffs;

	  // 9-bit signed offset.
	  if (isInt<9>(Offset))
	    return true;

	  // 12-bit unsigned scaled offset: needs a known access size and a positive
	  // offset that is within range after scaling.
	  if (!AccessBytes || Offset <= 0)
	    return false;
	  if ((Offset / AccessBytes) > (1LL << 12) - 1)
	    return false;

	  // The offset must also be a multiple of the access size, which is a power
	  // of two here.
	  const unsigned Shift = Log2_64(AccessBytes);
	  return (Offset >> Shift) << Shift == Offset;
	}

	// Hook answering whether GEP offset splitting should be considered on this
	// target. This implementation unconditionally answers yes.
	bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
	// Consider splitting large offset of struct or array.
	return true;
	}

	int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
	                                                const AddrMode &AM, Type *Ty,
	                                                unsigned AS) const {
	  // Scaled register offsets are not free. Load latencies by operand form:
	  //   Rt, [Xn, Xm]                  : 4
	  //   Rt, [Xn, Xm, lsl #imm]        : Rn: 4, Rm: 5
	  //   Rt, [Xn, Wm, <extend> #imm]   : same as the lsl form
	  //
	  // Returns -1 for an illegal addressing mode; otherwise 1 when a scale
	  // other than 0 or 1 is applied to the index register, and 0 when not.
	  if (!isLegalAddressingMode(DL, AM, Ty, AS))
	    return -1;

	  const bool UsesScaledIndex = AM.Scale != 0 && AM.Scale != 1;
	  return UsesScaledIndex ? 1 : 0;
	}

	bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
	    const MachineFunction &MF, EVT VT) const {
	  // Only the element type matters; extended (non-simple) types never qualify.
	  VT = VT.getScalarType();
	  if (!VT.isSimple())
	    return false;

	  // Yes for single and double precision, no for everything else.
	  const auto ScalarTy = VT.getSimpleVT().SimpleTy;
	  return ScalarTy == MVT::f32 || ScalarTy == MVT::f64;
	}

	bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
	Type *Ty) const {
	switch (Ty->getScalarType()->getTypeID()) {
	case Type::FloatTyID:
	case Type::DoubleTyID:
	return true;
	default:
	return false;
	}
	}

	const MCPhysReg *
	AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
	  // Although LR is callee-saved, every call site must be treated as
	  // clobbering it. Listing LR here makes it an implicit-def on stackmaps and
	  // patchpoints, which take their implicit-defs from the scratch registers.
	  // The list is terminated by 0.
	  static const MCPhysReg ScratchRegs[] = {AArch64::X16, AArch64::X17,
	                                          AArch64::LR, 0};
	  return ScratchRegs;
	}

	bool
	AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
	CombineLevel Level) const {
	N = N->getOperand(0).getNode();
	EVT VT = N->getValueType(0);
	// If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
	// it with shift to let it be lowered to UBFX.
	if (N->getOpcode() == ISD::AND && (VT == MVT::i32 \|\| VT == MVT::i64) &&
	isa<ConstantSDNode>(N->getOperand(1))) {
	uint64_t TruncMask = N->getConstantOperandVal(1);
	if (isMask_64(TruncMask) &&
	N->getOperand(0).getOpcode() == ISD::SRL &&
	isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
	return false;
	}
	return true;
	}

	// Returns true when materializing the integer constant Imm of type Ty in a
	// register is cheap enough (zero, a logical immediate, or a MOVZ/MOVN plus
	// at most one MOVK) to prefer over loading it from memory.
	bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
	                                                              Type *Ty) const {
	  assert(Ty->isIntegerTy());

	  unsigned BitSize = Ty->getPrimitiveSizeInBits();
	  if (BitSize == 0)
	    return false;

	  int64_t Val = Imm.getSExtValue();
	  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
	    return true;

	  // Negative values are built with MOVN, so measure the complement instead.
	  if (Val < 0)
	    Val = ~Val;
	  if (BitSize == 32)
	    Val &= (1LL << 32) - 1;

	  // The complement of -1 is zero, which a single MOVN produces. Handle it
	  // here: countLeadingZeros(0) is 64, and `63 - 64` would wrap around in
	  // unsigned arithmetic and wrongly report the constant as expensive.
	  if (Val == 0)
	    return true;

	  // Index of the highest 16-bit chunk that holds a set bit.
	  unsigned LZ = countLeadingZeros((uint64_t)Val);
	  unsigned Shift = (63 - LZ) / 16;
	  // MOVZ is free so return true for one or fewer MOVK.
	  return Shift < 3;
	}

	bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
	                                                    unsigned Index) const {
	  // EXTRACT_SUBVECTOR must be legal or custom for the result type, and the
	  // extract must start at element 0 or at element ResVT.getVectorNumElements().
	  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
	    return false;

	  if (Index == 0)
	    return true;
	  return Index == ResVT.getVectorNumElements();
	}

	/// Rewrite a vector sign-bit test written as
	///   xor (sra X, elt_size(X)-1), -1
	/// into the single comparison
	///   cmge X, X, #0
	/// Returns an empty SDValue when the pattern does not match.
	static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
	                                         const AArch64Subtarget *Subtarget) {
	  EVT ResVT = N->getValueType(0);
	  if (!Subtarget->hasNEON() || !ResVT.isVector())
	    return SDValue();

	  // The xor must be a 'not' (second operand all-ones) applied to a
	  // single-use arithmetic shift right.
	  SDValue Sra = N->getOperand(0);
	  SDValue AllOnes = N->getOperand(1);
	  const bool IsNotOfSingleUseSra =
	      Sra.getOpcode() == AArch64ISD::VASHR && Sra.hasOneUse() &&
	      ISD::isBuildVectorAllOnes(AllOnes.getNode());
	  if (!IsNotOfSingleUseSra)
	    return SDValue();

	  // The shift amount must be a constant equal to the element width minus
	  // one, i.e. the shift smears the sign bit across each element.
	  auto *Amount = dyn_cast<ConstantSDNode>(Sra.getOperand(1));
	  if (!Amount)
	    return SDValue();
	  EVT EltVT = Sra.getValueType().getVectorElementType();
	  if (Amount->getZExtValue() != EltVT.getSizeInBits() - 1)
	    return SDValue();

	  return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), ResVT, Sra.getOperand(0));
	}

	// Recognize the integer abs idiom XOR(ADD(X, Y), Y) with Y = SRA(X, size(X)-1)
	// and emit it as SUBS + CSEL. Returns an empty SDValue on mismatch.
	static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);

	  SDValue AddNode = N->getOperand(0);
	  SDValue SignSmear = N->getOperand(1);
	  SDLoc DL(N);

	  // Outer node: integer XOR.
	  if (!VT.isInteger() || N->getOpcode() != ISD::XOR)
	    return SDValue();
	  // First operand: ADD whose second operand is the XOR's second operand.
	  if (AddNode.getOpcode() != ISD::ADD || AddNode.getOperand(1) != SignSmear)
	    return SDValue();
	  // Second operand: SRA of the same value that feeds the ADD.
	  if (SignSmear.getOpcode() != ISD::SRA ||
	      SignSmear.getOperand(0) != AddNode.getOperand(0))
	    return SDValue();
	  // The SRA must shift by a constant equal to the bit width minus one.
	  auto *ShiftAmt = dyn_cast<ConstantSDNode>(SignSmear.getOperand(1));
	  if (!ShiftAmt)
	    return SDValue();
	  const bool ShiftsBySizeMinusOne =
	      ShiftAmt->getAPIntValue() == VT.getSizeInBits() - 1;
	  if (!ShiftsBySizeMinusOne)
	    return SDValue();

	  SDValue X = AddNode.getOperand(0);
	  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), X);
	  // SUBS X, #0 produces the flags; CSEL keeps X when the PL condition holds
	  // and takes the negated value otherwise.
	  SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
	                            X, DAG.getConstant(0, DL, VT));
	  return DAG.getNode(AArch64ISD::CSEL, DL, VT, X, Neg,
	                     DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
	                     SDValue(Cmp.getNode(), 1));
	}

	// XOR combine entry point: does nothing before operation legalization, then
	// tries the vector sign-bit compare fold and falls back to the integer abs
	// fold.
	static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const AArch64Subtarget *Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
	  if (Cmp)
	    return Cmp;

	  return performIntegerAbsCombine(N, DAG);
	}

	// Expand a signed division by (plus or minus) a power of two into
	// compare + add + CSEL + arithmetic shift, negating the result for a
	// negative divisor. Intermediate nodes are appended to Created.
	SDValue
	AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
	                                     SelectionDAG &DAG,
	                                     SmallVectorImpl<SDNode *> &Created) const {
	  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
	  if (isIntDivCheap(N->getValueType(0), Attr))
	    return SDValue(N, 0); // Lower SDIV as SDIV

	  // fold (sdiv X, pow2): only for i32/i64 with a divisor of magnitude 2^k.
	  EVT VT = N->getValueType(0);
	  const bool IsI32OrI64 = VT == MVT::i32 || VT == MVT::i64;
	  if (!IsI32OrI64 || !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
	    return SDValue();

	  SDLoc DL(N);
	  SDValue Dividend = N->getOperand(0);
	  unsigned Log2Div = Divisor.countTrailingZeros();
	  SDValue Zero = DAG.getConstant(0, DL, VT);
	  SDValue Bias = DAG.getConstant((1ULL << Log2Div) - 1, DL, VT);

	  // Select (Dividend < 0) ? Dividend + (Pow2 - 1) : Dividend.
	  SDValue CCVal;
	  SDValue Cmp = getAArch64Cmp(Dividend, Zero, ISD::SETLT, CCVal, DAG, DL);
	  SDValue Biased = DAG.getNode(ISD::ADD, DL, VT, Dividend, Bias);
	  SDValue Selected =
	      DAG.getNode(AArch64ISD::CSEL, DL, VT, Biased, Dividend, CCVal, Cmp);

	  Created.push_back(Cmp.getNode());
	  Created.push_back(Biased.getNode());
	  Created.push_back(Selected.getNode());

	  // Divide by pow2.
	  SDValue Quotient = DAG.getNode(ISD::SRA, DL, VT, Selected,
	                                 DAG.getConstant(Log2Div, DL, MVT::i64));

	  // A negative divisor additionally needs the quotient negated.
	  if (!Divisor.isNonNegative()) {
	    Created.push_back(Quotient.getNode());
	    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Quotient);
	  }
	  return Quotient;
	}

	// True when S is a call to one of the SVE cntb/cnth/cntw/cntd intrinsics.
	static bool IsSVECntIntrinsic(SDValue S) {
	  const unsigned IID = getIntrinsicID(S.getNode());
	  return IID == Intrinsic::aarch64_sve_cntb ||
	         IID == Intrinsic::aarch64_sve_cnth ||
	         IID == Intrinsic::aarch64_sve_cntw ||
	         IID == Intrinsic::aarch64_sve_cntd;
	}

	// DAG combine for a multiply whose RHS is a constant. When the constant has
	// one of the forms 2^N + 1, 2^N - 1, (2^N + 1) * 2^M, -(2^N - 1) or
	// -(2^N + 1), the multiply is rewritten as shift + add/sub (plus an optional
	// trailing shift or negation). Returns an empty SDValue when no rewrite is
	// performed, including whenever it runs before operation legalization.
	static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const AArch64Subtarget *Subtarget) {
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	// The below optimizations require a constant RHS.
	if (!isa<ConstantSDNode>(N->getOperand(1)))
	return SDValue();

	SDValue N0 = N->getOperand(0);
	ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
	const APInt &ConstValue = C->getAPIntValue();

	// Allow the scaling to be folded into the `cnt` instruction by preventing
	// the scaling to be obscured here. This makes it easier to pattern match.
	if (IsSVECntIntrinsic(N0) \|\|
	(N0->getOpcode() == ISD::TRUNCATE &&
	(IsSVECntIntrinsic(N0->getOperand(0)))))
	if (ConstValue.sge(1) && ConstValue.sle(16))
	return SDValue();

	// Multiplication of a power of two plus/minus one can be done more
	// cheaply as a shift+add/sub. For now, this is true unilaterally. If
	// future CPUs have a cheaper MADD instruction, this may need to be
	// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
	// 64-bit is 5 cycles, so this is always a win.
	// More aggressively, some multiplications N0 * C can be lowered to
	// shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
	// e.g. 6=3*2=(2+1)*2.
	// TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
	// which equals to (1+2)*16-(1+2).
	// TrailingZeroes is used to test if the mul can be lowered to
	// shift+add+shift.
	unsigned TrailingZeroes = ConstValue.countTrailingZeros();
	if (TrailingZeroes) {
	// Conservatively do not lower to shift+add+shift if the mul might be
	// folded into smul or umul.
	if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) \|\|
	isZeroExtended(N0.getNode(), DAG)))
	return SDValue();
	// Conservatively do not lower to shift+add+shift if the mul might be
	// folded into madd or msub.
	if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD \|\|
	N->use_begin()->getOpcode() == ISD::SUB))
	return SDValue();
	}
	// Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
	// and shift+add+shift.
	APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);

	// Both are assigned on every path that does not return early below.
	unsigned ShiftAmt, AddSubOpc;
	// Is the shifted value the LHS operand of the add/sub?
	bool ShiftValUseIsN0 = true;
	// Do we need to negate the result?
	bool NegateResult = false;

	if (ConstValue.isNonNegative()) {
	// (mul x, 2^N + 1) => (add (shl x, N), x)
	// (mul x, 2^N - 1) => (sub (shl x, N), x)
	// (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
	APInt SCVMinus1 = ShiftedConstValue - 1;
	APInt CVPlus1 = ConstValue + 1;
	if (SCVMinus1.isPowerOf2()) {
	ShiftAmt = SCVMinus1.logBase2();
	AddSubOpc = ISD::ADD;
	} else if (CVPlus1.isPowerOf2()) {
	ShiftAmt = CVPlus1.logBase2();
	AddSubOpc = ISD::SUB;
	} else
	return SDValue();
	} else {
	// (mul x, -(2^N - 1)) => (sub x, (shl x, N))
	// (mul x, -(2^N + 1)) => - (add (shl x, N), x)
	APInt CVNegPlus1 = -ConstValue + 1;
	APInt CVNegMinus1 = -ConstValue - 1;
	if (CVNegPlus1.isPowerOf2()) {
	ShiftAmt = CVNegPlus1.logBase2();
	AddSubOpc = ISD::SUB;
	ShiftValUseIsN0 = false;
	} else if (CVNegMinus1.isPowerOf2()) {
	ShiftAmt = CVNegMinus1.logBase2();
	AddSubOpc = ISD::ADD;
	NegateResult = true;
	} else
	return SDValue();
	}

	SDLoc DL(N);
	EVT VT = N->getValueType(0);
	SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
	DAG.getConstant(ShiftAmt, DL, MVT::i64));

	// Order the add/sub operands: the shifted value is the LHS unless
	// ShiftValUseIsN0 was cleared above.
	SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
	SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
	SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
	assert(!(NegateResult && TrailingZeroes) &&
	"NegateResult and TrailingZeroes cannot both be true for now.");
	// Negate the result.
	if (NegateResult)
	return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
	// Shift the result.
	if (TrailingZeroes)
	return DAG.getNode(ISD::SHL, DL, VT, Res,
	DAG.getConstant(TrailingZeroes, DL, MVT::i64));
	return Res;
	}

	// Vector comparisons yield 0 or -1 per lane, so a unary operation applied to
	// a compare result masked by a constant can instead be applied to the
	// constant:
	//   UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
	//     AND(VECTOR_CMP(x,y), constant2), with constant2 = UNARYOP(constant)
	// Returns an empty SDValue when the pattern does not match.
	static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
	                                                         SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);
	  SDValue AndOp = N->getOperand(0);

	  // Require a vector result, an AND operand whose first input is a SETCC,
	  // and matching total bit widths on both sides of the unary operation.
	  if (!VT.isVector() || AndOp->getOpcode() != ISD::AND ||
	      AndOp->getOperand(0)->getOpcode() != ISD::SETCC ||
	      VT.getSizeInBits() != AndOp->getValueType(0).getSizeInBits())
	    return SDValue();

	  // The mask must be a constant build vector. Non-constant splats could be
	  // handled too, but that would not remove any operation; it would only add
	  // a scalar step before moving to the vector unit.
	  auto *MaskBV = dyn_cast<BuildVectorSDNode>(AndOp->getOperand(1));
	  if (!MaskBV || !MaskBV->isConstant())
	    return SDValue();

	  SDLoc DL(N);
	  EVT IntVT = MaskBV->getValueType(0);
	  // Apply the unary operation to the constant itself.
	  SDValue SourceConst =
	      DAG.getNode(N->getOpcode(), DL, VT, SDValue(MaskBV, 0));
	  // The AND is done on the integer vector type, so bitcast in and back out.
	  SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
	  SDValue NewAnd =
	      DAG.getNode(ISD::AND, DL, IntVT, AndOp->getOperand(0), MaskConst);
	  return DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
	}

	// Combine for integer-to-FP conversions: first tries the vector
	// compare-and-mask fold, then turns "integer load + convert" into
	// "FP load + AdvSIMD scalar convert" for same-width f32/f64 results.
	static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
	                                     const AArch64Subtarget *Subtarget) {
	  // Vectors only: fold the conversion away when its input is a compare
	  // result masked by a constant.
	  if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
	    return Res;

	  EVT VT = N->getValueType(0);
	  if (VT != MVT::f32 && VT != MVT::f64)
	    return SDValue();

	  // Source and destination must have the same width.
	  SDValue Src = N->getOperand(0);
	  if (VT.getSizeInBits() != Src.getValueSizeInBits())
	    return SDValue();

	  // The source must be a normal, single-use load and NEON must be available.
	  if (!Subtarget->hasNEON() || !ISD::isNormalLoad(Src.getNode()) ||
	      !Src.hasOneUse())
	    return SDValue();

	  // Do not change the width of a volatile load.
	  LoadSDNode *IntLoad = cast<LoadSDNode>(Src);
	  if (IntLoad->isVolatile())
	    return SDValue();

	  // Load straight into an FP register and use a {S|U}CVTF on it. This removes
	  // an "integer-to-vector-move" UOP and improves throughput.
	  SDValue FPLoad =
	      DAG.getLoad(VT, SDLoc(N), IntLoad->getChain(), IntLoad->getBasePtr(),
	                  IntLoad->getPointerInfo(), IntLoad->getAlignment(),
	                  IntLoad->getMemOperand()->getFlags());

	  // Keep successors of the original load ordered after the new one by
	  // switching them to the new chain.
	  DAG.ReplaceAllUsesOfValueWith(SDValue(IntLoad, 1), FPLoad.getValue(1));

	  const bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP;
	  unsigned Opcode = IsSigned ? AArch64ISD::SITOF : AArch64ISD::UITOF;
	  return DAG.getNode(Opcode, SDLoc(N), VT, FPLoad);
	}

	/// Fold "fp-to-int (fmul X, splat(2^C))" on vectors into a single
	/// floating-point to fixed-point conversion with C fractional bits.
	/// Returns an empty SDValue when the fold does not apply.
	static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     const AArch64Subtarget *Subtarget) {
	  if (!Subtarget->hasNEON())
	    return SDValue();

	  if (!N->getValueType(0).isSimple())
	    return SDValue();

	  // The converted value must be a simple-typed vector FMUL.
	  SDValue Mul = N->getOperand(0);
	  EVT MulVT = Mul.getValueType();
	  if (!MulVT.isVector() || !MulVT.isSimple() || Mul.getOpcode() != ISD::FMUL)
	    return SDValue();

	  // Its second operand must be a build vector.
	  auto *ScaleBV = dyn_cast<BuildVectorSDNode>(Mul->getOperand(1));
	  if (!ScaleBV)
	    return SDValue();

	  // Only 32- and 64-bit float elements are handled.
	  MVT FloatTy = Mul.getSimpleValueType().getVectorElementType();
	  uint32_t FloatBits = FloatTy.getSizeInBits();
	  if (FloatBits != 32 && FloatBits != 64)
	    return SDValue();

	  // Only 16-, 32- and 64-bit integer elements are handled.
	  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
	  uint32_t IntBits = IntTy.getSizeInBits();
	  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
	    return SDValue();

	  // Avoid conversions where iN is larger than the float (e.g., float -> i64).
	  if (IntBits > FloatBits)
	    return SDValue();

	  // The multiplier must be a splat of 2^C with 1 <= C <= MaxLog2.
	  BitVector UndefElements;
	  const int32_t MaxLog2 = IntBits == 64 ? 64 : 32;
	  const int32_t C =
	      ScaleBV->getConstantFPSplatPow2ToLog2Int(&UndefElements, MaxLog2 + 1);
	  if (C == -1 || C == 0 || C > MaxLog2)
	    return SDValue();

	  // Pick the integer vector type matching the float element width and the
	  // lane count; only 2 and 4 lanes are supported.
	  MVT ResTy;
	  switch (MulVT.getVectorNumElements()) {
	  case 2:
	    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
	    break;
	  case 4:
	    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
	    break;
	  default:
	    return SDValue();
	  }

	  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
	    return SDValue();

	  assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
	         "Illegal vector type after legalization");

	  SDLoc DL(N);
	  const bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
	  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
	                                      : Intrinsic::aarch64_neon_vcvtfp2fxu;
	  SDValue FixConv =
	      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
	                  DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
	                  Mul->getOperand(0), DAG.getConstant(C, DL, MVT::i32));

	  // A narrower integer result is produced with an extra truncate.
	  if (IntBits < FloatBits)
	    FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);

	  return FixConv;
	}

	/// Fold "fdiv (int-to-fp X), splat(2^C)" on vectors into a single
	/// fixed-point to floating-point conversion with C fractional bits.
	/// Returns an empty SDValue when the fold does not apply.
	static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
	                                  TargetLowering::DAGCombinerInfo &DCI,
	                                  const AArch64Subtarget *Subtarget) {
	  if (!Subtarget->hasNEON())
	    return SDValue();

	  // The dividend must be a simple-typed vector produced by SINT_TO_FP or
	  // UINT_TO_FP. The opcode is verified before operand 0 of the dividend is
	  // inspected, so that operand is only read on a node known to have one.
	  SDValue Op = N->getOperand(0);
	  unsigned Opc = Op->getOpcode();
	  if (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)
	    return SDValue();
	  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
	      !Op.getOperand(0).getValueType().isSimple())
	    return SDValue();

	  // The divisor must be a build vector.
	  SDValue ConstVec = N->getOperand(1);
	  if (!isa<BuildVectorSDNode>(ConstVec))
	    return SDValue();

	  // Only 16-, 32- and 64-bit integer elements are handled.
	  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
	  int32_t IntBits = IntTy.getSizeInBits();
	  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
	    return SDValue();

	  // Only 32- and 64-bit float elements are handled.
	  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
	  int32_t FloatBits = FloatTy.getSizeInBits();
	  if (FloatBits != 32 && FloatBits != 64)
	    return SDValue();

	  // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
	  if (IntBits > FloatBits)
	    return SDValue();

	  // The divisor must be a splat of 2^C with 1 <= C <= FloatBits.
	  BitVector UndefElements;
	  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
	  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
	  if (C == -1 || C == 0 || C > FloatBits)
	    return SDValue();

	  // Pick the integer vector type matching the float element width and the
	  // lane count; only 2 and 4 lanes are supported.
	  MVT ResTy;
	  unsigned NumLanes = Op.getValueType().getVectorNumElements();
	  switch (NumLanes) {
	  default:
	    return SDValue();
	  case 2:
	    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
	    break;
	  case 4:
	    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
	    break;
	  }

	  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDLoc DL(N);
	  SDValue ConvInput = Op.getOperand(0);
	  bool IsSigned = Opc == ISD::SINT_TO_FP;
	  // A narrower integer input is first extended to the float element width,
	  // using the signedness of the original conversion.
	  if (IntBits < FloatBits)
	    ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
	                            ResTy, ConvInput);

	  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
	                                      : Intrinsic::aarch64_neon_vcvtfxu2fp;
	  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
	                     DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
	                     DAG.getConstant(C, DL, MVT::i32));
	}

	/// An EXTR is two shifts ORed together. This helper classifies one of them:
	/// for a SHL or SRL by a constant it reports the shifted value in Src, the
	/// amount in ShiftAmount, and sets FromHi for SRL (clears it for SHL).
	/// Returns false for any other node. Note that FromHi is already written
	/// when the opcode matches but the shift amount is not a constant.
	static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
	                         bool &FromHi) {
	  switch (N.getOpcode()) {
	  case ISD::SHL:
	    FromHi = false;
	    break;
	  case ISD::SRL:
	    FromHi = true;
	    break;
	  default:
	    return false;
	  }

	  if (!isa<ConstantSDNode>(N.getOperand(1)))
	    return false;

	  ShiftAmount = N->getConstantOperandVal(1);
	  Src = N->getOperand(0);
	  return true;
	}

	/// EXTR extracts a contiguous run of bits from two registers treated as a
	/// high/low pair. This looks for
	/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
	/// with an EXTR. TableGen cannot express this because the two immediates
	/// depend on each other.
	static SDValue tryCombineToEXTR(SDNode *N,
	                                TargetLowering::DAGCombinerInfo &DCI) {
	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);

	  assert(N->getOpcode() == ISD::OR && "Unexpected root");

	  if (VT != MVT::i32 && VT != MVT::i64)
	    return SDValue();

	  // Classify both OR operands; each must be a shift by a constant.
	  SDValue First, Second;
	  uint32_t FirstShift = 0, SecondShift = 0;
	  bool FirstIsSrl = false, SecondIsSrl = false;
	  if (!findEXTRHalf(N->getOperand(0), First, FirstShift, FirstIsSrl))
	    return SDValue();
	  if (!findEXTRHalf(N->getOperand(1), Second, SecondShift, SecondIsSrl))
	    return SDValue();

	  // One half must be a SHL and the other a SRL; two shifts in the same
	  // direction do not form an EXTR.
	  if (FirstIsSrl == SecondIsSrl)
	    return SDValue();

	  // The two shift amounts must add up to the register width.
	  if (FirstShift + SecondShift != VT.getSizeInBits())
	    return SDValue();

	  // Normalize so that First is the SHL half and Second the SRL half.
	  if (FirstIsSrl) {
	    std::swap(First, Second);
	    std::swap(FirstShift, SecondShift);
	  }

	  return DAG.getNode(AArch64ISD::EXTR, DL, VT, First, Second,
	                     DAG.getConstant(SecondShift, DL, MVT::i64));
	}

	/// Try to turn an OR of two ANDs with complementary constant vector masks into
	/// an AArch64ISD::BSP node.
	///
	/// Each AND is searched for a BUILD_VECTOR operand; a match requires every
	/// lane of one mask to be a constant equal to the element-width complement of
	/// the same lane in the other. Only constant vectors are handled here since
	/// the general, variable case can be handled in TableGen.
	///
	/// \return the BSP node, or an empty SDValue if no such pair of masks exists.
	static SDValue tryCombineToBSL(SDNode *N,
	                               TargetLowering::DAGCombinerInfo &DCI) {
	  EVT VT = N->getValueType(0);
	  if (!VT.isVector())
	    return SDValue();

	  SDValue AndA = N->getOperand(0);
	  if (AndA.getOpcode() != ISD::AND)
	    return SDValue();

	  SDValue AndB = N->getOperand(1);
	  if (AndB.getOpcode() != ISD::AND)
	    return SDValue();

	  const unsigned EltBits = VT.getScalarSizeInBits();
	  // Special-case 64-bit elements: shifting 1ULL by 64 would be undefined.
	  const uint64_t EltMask = EltBits == 64 ? -1ULL : ((1ULL << EltBits) - 1);

	  // True when every lane of A is a constant equal to ~B restricted to the
	  // element width.
	  auto AreComplementaryMasks = [&](BuildVectorSDNode *A,
	                                   BuildVectorSDNode *B) -> bool {
	    for (unsigned Lane = 0; Lane < VT.getVectorNumElements(); ++Lane) {
	      auto *LaneA = dyn_cast<ConstantSDNode>(A->getOperand(Lane));
	      auto *LaneB = dyn_cast<ConstantSDNode>(B->getOperand(Lane));
	      if (!LaneA || !LaneB)
	        return false;
	      if (LaneA->getZExtValue() != (EltMask & ~LaneB->getZExtValue()))
	        return false;
	    }
	    return true;
	  };

	  // Try each operand of each AND as the mask, second operand first.
	  for (int i = 1; i >= 0; --i) {
	    for (int j = 1; j >= 0; --j) {
	      auto *MaskA = dyn_cast<BuildVectorSDNode>(AndA->getOperand(i));
	      auto *MaskB = dyn_cast<BuildVectorSDNode>(AndB->getOperand(j));
	      if (!MaskA || !MaskB)
	        continue;

	      if (AreComplementaryMasks(MaskA, MaskB)) {
	        SelectionDAG &DAG = DCI.DAG;
	        SDLoc DL(N);
	        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(MaskA, 0),
	                           AndA->getOperand(1 - i), AndB->getOperand(1 - j));
	      }
	    }
	  }

	  return SDValue();
	}

	/// Target DAG combine for ISD::OR.
	///
	/// When the result type is legal, first attempts to form an EXTR from
	/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and, failing that, a BSP from
	/// two ANDs with complementary constant masks. \p Subtarget is not consulted.
	///
	/// \return the replacement node, or an empty SDValue if neither fold applies.
	static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
	                                const AArch64Subtarget *Subtarget) {
	  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
	  if (!TLI.isTypeLegal(N->getValueType(0)))
	    return SDValue();

	  SDValue Combined = tryCombineToEXTR(N, DCI);
	  if (!Combined)
	    Combined = tryCombineToBSL(N, DCI);
	  return Combined;
	}

	/// Return true if \p N is an AArch64ISD::DUP or ISD::SPLAT_VECTOR whose first
	/// operand is a constant equal to the all-ones value of the element type of
	/// \p MemVT.
	///
	/// Only i8, i16 and i32 element types are recognised; any other element type
	/// yields false.
	static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
	  EVT EltVT = MemVT.getVectorElementType();
	  if (!EltVT.isSimple())
	    return false;

	  uint64_t AllOnesForElt = 0ull;
	  switch (EltVT.getSimpleVT().SimpleTy) {
	  case MVT::i8:
	    AllOnesForElt = 0xffull;
	    break;
	  case MVT::i16:
	    AllOnesForElt = 0xffffull;
	    break;
	  case MVT::i32:
	    AllOnesForElt = 0xffffffffull;
	    break;
	  default:
	    return false;
	  }

	  const unsigned Opc = N->getOpcode();
	  if (Opc != AArch64ISD::DUP && Opc != ISD::SPLAT_VECTOR)
	    return false;

	  auto *SplatC = dyn_cast<ConstantSDNode>(N->getOperand(0));
	  return SplatC &&
	         SplatC->getAPIntValue().getLimitedValue() == AllOnesForElt;
	}

	/// DAG combine for an ISD::AND that produces a scalable (SVE) vector.
	///
	/// Nothing is done while DCI reports that operation legalization has not yet
	/// run. Afterwards two folds are attempted:
	///  * (and (uunpk{hi,lo} X), (dup C)) with constant C: the unpack is returned
	///    unchanged when C is the all-ones mask for X's element type; otherwise
	///    the AND is pushed below the unpack, with C truncated to X's element
	///    width.
	///  * (and Load, Mask) where Load is one of the listed SVE *_MERGE_ZERO load
	///    nodes with a single use: the load is returned unchanged when Mask is a
	///    constant splat of all-ones for the load's memory element type, because
	///    these loads perform an implicit zero-extend.
	///
	/// \return the replacement value, or an empty SDValue if nothing was folded.
	static SDValue performSVEAndCombine(SDNode *N,
	                                    TargetLowering::DAGCombinerInfo &DCI) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Src = N->getOperand(0);
	  unsigned Opc = Src->getOpcode();

	  // Zero/any extend of an unsigned unpack
	  if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
	    SDValue UnpkOp = Src->getOperand(0);
	    SDValue Dup = N->getOperand(1);

	    if (Dup.getOpcode() != AArch64ISD::DUP)
	      return SDValue();

	    // The splatted scalar is not guaranteed to be a constant. dyn_cast returns
	    // null for anything else, so give up instead of dereferencing it.
	    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
	    if (!C)
	      return SDValue();

	    SDLoc DL(N);
	    uint64_t ExtVal = C->getZExtValue();

	    // If the mask is fully covered by the unpack, we don't need to push
	    // a new AND onto the operand
	    EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
	    if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
	        (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
	        (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
	      return Src;

	    // Truncate to prevent a DUP with an over wide constant
	    APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());

	    // Otherwise, make sure we propagate the AND to the operand
	    // of the unpack
	    Dup = DAG.getNode(AArch64ISD::DUP, DL,
	                      UnpkOp->getValueType(0),
	                      DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));

	    SDValue And = DAG.getNode(ISD::AND, DL,
	                              UnpkOp->getValueType(0), UnpkOp, Dup);

	    return DAG.getNode(Opc, DL, N->getValueType(0), And);
	  }

	  SDValue Mask = N->getOperand(1);

	  // Returning Src in place of the AND is only done when the AND is its sole
	  // user.
	  if (!Src.hasOneUse())
	    return SDValue();

	  EVT MemVT;

	  // SVE load instructions perform an implicit zero-extend, which makes them
	  // perfect candidates for combining.
	  switch (Opc) {
	  case AArch64ISD::LD1_MERGE_ZERO:
	  case AArch64ISD::LDNF1_MERGE_ZERO:
	  case AArch64ISD::LDFF1_MERGE_ZERO:
	    // Contiguous loads: the memory type is read from operand 3.
	    MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
	    break;
	  case AArch64ISD::GLD1_MERGE_ZERO:
	  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
	  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
	  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
	  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
	  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
	  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
	  case AArch64ISD::GLDFF1_MERGE_ZERO:
	  case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
	  case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
	  case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
	  case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
	  case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
	  case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
	  case AArch64ISD::GLDNT1_MERGE_ZERO:
	    // Gather loads: the memory type is read from operand 4.
	    MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
	    break;
	  default:
	    return SDValue();
	  }

	  if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
	    return Src;

	  return SDValue();
	}

	/// Target DAG combine for a vector ISD::AND of a legal type.
	///
	/// Scalable vectors are forwarded to performSVEAndCombine. For fixed vectors
	/// whose second operand is a BUILD_VECTOR, tries to replace the AND with a BIC
	/// immediate: AND does not accept an immediate, and doing this here rather
	/// than with a (and x, (mvni imm)) isel pattern matters because some
	/// immediates may be lowered to the preferred (and x, (movi imm)) form even
	/// though an mvni representation also exists.
	///
	/// \return the BICi node, or an empty SDValue if no fold applies.
	static SDValue performANDCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI) {
	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);
	  if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  if (VT.isScalableVector())
	    return performSVEAndCombine(N, DCI);

	  auto *MaskBV = dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
	  if (!MaskBV)
	    return SDValue();

	  APInt DefBits(VT.getSizeInBits(), 0);
	  APInt UndefBits(VT.getSizeInBits(), 0);
	  if (!resolveBuildVector(MaskBV, DefBits, UndefBits))
	    return SDValue();

	  SDValue LHS = N->getOperand(0);

	  // Try the 32-bit and then the 16-bit modified-immediate form of BIC for an
	  // already inverted mask.
	  auto TryBICImm = [&](APInt &InvertedBits) -> SDValue {
	    if (SDValue Res = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
	                                         InvertedBits, &LHS))
	      return Res;
	    return tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
	                              InvertedBits, &LHS);
	  };

	  DefBits = ~DefBits;
	  if (SDValue Res = TryBICImm(DefBits))
	    return Res;

	  UndefBits = ~UndefBits;
	  return TryBICImm(UndefBits);
	}

	/// Target DAG combine for a scalar i32/i64 ISD::SRL.
	///
	/// Canonicalises (srl (bswap x), HalfWidth) to (rotr (bswap x), HalfWidth)
	/// when the high half of x is known to be zero, i.e. a shift of 16 for i32 and
	/// of 32 for i64.
	///
	/// \return the ROTR node, or an empty SDValue if the pattern does not match.
	static SDValue performSRLCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI) {
	  EVT VT = N->getValueType(0);
	  if (VT != MVT::i32 && VT != MVT::i64)
	    return SDValue();

	  SDValue Swapped = N->getOperand(0);
	  if (Swapped.getOpcode() != ISD::BSWAP)
	    return SDValue();

	  SDValue Amt = N->getOperand(1);
	  auto *AmtC = dyn_cast<ConstantSDNode>(Amt);
	  if (!AmtC)
	    return SDValue();

	  // The shift has to move exactly half of the register.
	  const unsigned Width = VT == MVT::i32 ? 32 : 64;
	  const unsigned HalfWidth = Width / 2;
	  if (AmtC->getZExtValue() != HalfWidth)
	    return SDValue();

	  // With the high half of the bswap input known zero, the bits a rotate would
	  // wrap around are zero, so the rotate gives the same result as the shift.
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue SwapInput = Swapped.getOperand(0);
	  if (!DAG.MaskedValueIsZero(SwapInput,
	                             APInt::getHighBitsSet(Width, HalfWidth)))
	    return SDValue();

	  return DAG.getNode(ISD::ROTR, SDLoc(N), VT, Swapped, Amt);
	}

	/// Target DAG combine for ISD::CONCAT_VECTORS.
	///
	/// Applies, in order:
	///  1. (at any stage) A concat of two truncates from v2i64 or v4i32, whose
	///     result elements are a quarter of the source element width, becomes a
	///     truncate of an even-lane shuffle of the bitcast sources.
	///  2. (after operation legalization) A concat of two [us]rhadd nodes working
	///     on the low and high extract_subvectors of the same two sources becomes
	///     a single full-width [us]rhadd.
	///  3. A concat of a value with itself producing a two-element vector becomes
	///     a DUPLANE64.
	///  4. A bitcast on the right-hand operand is hoisted above the concat.
	///
	/// Fold 4 rebuilds the concat from the first two operands only, so it is
	/// restricted to two-operand concats.
	///
	/// \return the replacement node, or an empty SDValue if no fold applies.
	static SDValue performConcatVectorsCombine(SDNode *N,
	                                           TargetLowering::DAGCombinerInfo &DCI,
	                                           SelectionDAG &DAG) {
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);
	  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
	  unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();

	  // Optimize concat_vectors of truncated vectors, where the intermediate
	  // type is illegal, to avoid said illegality, e.g.,
	  //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
	  //                          (v2i16 (truncate (v2i64)))))
	  // ->
	  //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
	  //                                    (v4i32 (bitcast (v2i64))),
	  //                                    <0, 2, 4, 6>)))
	  // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
	  // on both input and result type, so we might generate worse code.
	  // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
	  if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
	      N1Opc == ISD::TRUNCATE) {
	    SDValue N00 = N0->getOperand(0);
	    SDValue N10 = N1->getOperand(0);
	    EVT N00VT = N00.getValueType();

	    if (N00VT == N10.getValueType() &&
	        (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
	        N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
	      MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
	      // Select the even lanes of the two bitcast sources.
	      SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
	      for (size_t i = 0; i < Mask.size(); ++i)
	        Mask[i] = i * 2;
	      return DAG.getNode(ISD::TRUNCATE, dl, VT,
	                         DAG.getVectorShuffle(
	                             MidVT, dl,
	                             DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
	                             DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
	    }
	  }

	  // Wait 'til after everything is legalized to try this. That way we have
	  // legal vector types and such.
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // Optimise concat_vectors of two [us]rhadds that use extracted subvectors
	  // from the same original vectors. Combine these into a single [us]rhadd that
	  // operates on the two original vectors. Example:
	  //  (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
	  //                                        extract_subvector (v16i8 OpB,
	  //                                        <0>))),
	  //                         (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
	  //                                        extract_subvector (v16i8 OpB,
	  //                                        <8>)))))
	  // ->
	  //  (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
	  if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
	      (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD)) {
	    SDValue N00 = N0->getOperand(0);
	    SDValue N01 = N0->getOperand(1);
	    SDValue N10 = N1->getOperand(0);
	    SDValue N11 = N1->getOperand(1);

	    EVT N00VT = N00.getValueType();
	    EVT N10VT = N10.getValueType();

	    if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	        N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	        N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	        N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
	      SDValue N00Source = N00->getOperand(0);
	      SDValue N01Source = N01->getOperand(0);
	      SDValue N10Source = N10->getOperand(0);
	      SDValue N11Source = N11->getOperand(0);

	      if (N00Source == N10Source && N01Source == N11Source &&
	          N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
	        assert(N0.getValueType() == N1.getValueType());

	        uint64_t N00Index = N00.getConstantOperandVal(1);
	        uint64_t N01Index = N01.getConstantOperandVal(1);
	        uint64_t N10Index = N10.getConstantOperandVal(1);
	        uint64_t N11Index = N11.getConstantOperandVal(1);

	        // The first rhadd must use the low halves (index 0) and the second
	        // the high halves (index == number of elements in one half).
	        if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
	            N10Index == N00VT.getVectorNumElements())
	          return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
	      }
	    }
	  }

	  // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
	  // splat. The indexed instructions are going to be expecting a DUPLANE64, so
	  // canonicalise to that.
	  if (N0 == N1 && VT.getVectorNumElements() == 2) {
	    assert(VT.getScalarSizeInBits() == 64);
	    return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
	                       DAG.getConstant(0, dl, MVT::i64));
	  }

	  // Canonicalise concat_vectors so that the right-hand vector has as few
	  // bit-casts as possible before its real operation. The primary matching
	  // destination for these operations will be the narrowing "2" instructions,
	  // which depend on the operation being performed on this right-hand vector.
	  // For example,
	  //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
	  // becomes
	  //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))

	  // The rewrite below only re-concatenates operands 0 and 1, so it would drop
	  // any further operands; restrict it to two-operand concats.
	  if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
	    return SDValue();
	  SDValue RHS = N1->getOperand(0);
	  MVT RHSTy = RHS.getValueType().getSimpleVT();
	  // If the RHS is not a vector, this is not the pattern we're looking for.
	  if (!RHSTy.isVector())
	    return SDValue();

	  LLVM_DEBUG(
	      dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");

	  MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
	                                  RHSTy.getVectorNumElements() * 2);
	  return DAG.getNode(ISD::BITCAST, dl, VT,
	                     DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
	                                 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
	                                 RHS));
	}

	/// Transform a scalar fixed-point conversion of a value taken from a lane
	/// extract into a lane extract of the equivalent vector conversion. E.g., from
	/// foo1 to foo2:
	///   double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
	///   double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
	///
	/// The second form interacts better with instruction selection and the
	/// register allocator to avoid cross-class register copies that aren't
	/// coalescable due to a lane reference.
	///
	/// \p N is the intrinsic node: operand 0 is the intrinsic ID, operand 1 the
	/// value being converted and operand 2 the shift.
	///
	/// \return the extract of the vector conversion, or an empty SDValue when the
	/// operand is not a lane extract from a 128-bit v4i32 or v2i64 vector, or when
	/// operation legalization has not run yet.
	static SDValue tryCombineFixedPointConvert(SDNode *N,
	                                           TargetLowering::DAGCombinerInfo &DCI,
	                                           SelectionDAG &DAG) {
	  // Wait until after everything is legalized to try this. That way we have
	  // legal vector types and such.
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // Check the operand and see if it originates from a lane extract.
	  SDValue Op1 = N->getOperand(1);
	  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	    return SDValue();

	  SDValue IID = N->getOperand(0);
	  SDValue Shift = N->getOperand(2);
	  SDValue Vec = Op1.getOperand(0);
	  SDValue Lane = Op1.getOperand(1);
	  EVT ResTy = N->getValueType(0);
	  EVT VecResTy;
	  SDLoc DL(N);

	  // The vector width should be 128 bits by the time we get here, even
	  // if it started as 64 bits (the extract_vector handling will have
	  // done so). Bail out instead of asserting if it is not, so that an
	  // unexpected input merely skips the optimisation in every build mode.
	  if (Vec.getValueSizeInBits() != 128)
	    return SDValue();

	  if (Vec.getValueType() == MVT::v4i32)
	    VecResTy = MVT::v4f32;
	  else if (Vec.getValueType() == MVT::v2i64)
	    VecResTy = MVT::v2f64;
	  else
	    return SDValue(); // No vector form of the conversion for other types.

	  SDValue Convert =
	      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
	}

	// AArch64 high-vector "long" operations are formed by performing the non-high
	// version on an extract_subvector of each operand which gets the high half:
	//
	//  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
	//
	// Some operands have no explicit extract_high but can be given one for free,
	// for example:
	//
	//  (dupv64 scalar) --> (extract_high (dup128 scalar))
	//
	// This routine performs that conversion for DUP nodes and for the immediate
	// DUP-like nodes (MOVI/MVNI variants), once outer routines have determined
	// that everything else is in order. Returns an empty SDValue when N is not one
	// of the handled opcodes or is not a 64-bit vector.
	static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
	  const unsigned Opc = N.getOpcode();

	  bool IsDupLike = false;
	  switch (Opc) {
	  case AArch64ISD::DUP:
	  case AArch64ISD::DUPLANE8:
	  case AArch64ISD::DUPLANE16:
	  case AArch64ISD::DUPLANE32:
	  case AArch64ISD::DUPLANE64:
	  case AArch64ISD::MOVI:
	  case AArch64ISD::MOVIshift:
	  case AArch64ISD::MOVIedit:
	  case AArch64ISD::MOVImsl:
	  case AArch64ISD::MVNIshift:
	  case AArch64ISD::MVNImsl:
	    IsDupLike = true;
	    break;
	  default:
	    // FMOV could be supported, but isn't very useful, as it would only occur
	    // if you passed a bitcast' floating point immediate to an eligible long
	    // integer op (addl, smull, ...).
	    break;
	  }
	  if (!IsDupLike)
	    return SDValue();

	  MVT NarrowTy = N.getSimpleValueType();
	  if (!NarrowTy.is64BitVector())
	    return SDValue();

	  // Re-create the same node with twice as many lanes, then take its high
	  // half.
	  const unsigned NarrowElts = NarrowTy.getVectorNumElements();
	  MVT WideTy = MVT::getVectorVT(NarrowTy.getVectorElementType(),
	                                NarrowElts * 2);

	  SDLoc dl(N);
	  SDValue WideDup = DAG.getNode(Opc, dl, WideTy, N->ops());
	  SDValue HighIdx = DAG.getConstant(NarrowElts, dl, MVT::i64);
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, WideDup, HighIdx);
	}

	/// Return true if \p N, after looking through at most one BITCAST, is an
	/// EXTRACT_SUBVECTOR whose index equals half the element count of its
	/// fixed-length source vector, i.e. it extracts the high half.
	static bool isEssentiallyExtractHighSubvector(SDValue N) {
	  if (N.getOpcode() == ISD::BITCAST)
	    N = N.getOperand(0);
	  if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
	    return false;

	  // A scalable source has no fixed element count to halve, so there is no
	  // constant "high half" index to compare against.
	  EVT SrcVT = N.getOperand(0).getValueType();
	  if (SrcVT.isScalableVector())
	    return false;

	  return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
	         SrcVT.getVectorNumElements() / 2;
	}

	/// Helper structure to keep track of ISD::SET_CC operands.
	///
	/// The operand pointers are filled in by isSetCC with the addresses of the
	/// matched node's own operands; they are not owned by this structure.
	struct GenericSetCCInfo {
	// First compared value (operand 0 of the SETCC).
	const SDValue *Opnd0;
	// Second compared value (operand 1 of the SETCC).
	const SDValue *Opnd1;
	// Condition code of the comparison (operand 2 of the SETCC).
	ISD::CondCode CC;
	};

	/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
	///
	/// Filled in by isSetCC from an AArch64ISD::CSEL node.
	struct AArch64SetCCInfo {
	// Address of operand 3 of the matched CSEL; not owned by this structure.
	const SDValue *Cmp;
	// AArch64 condition code. isSetCC inverts it when the CSEL's true value
	// (operand 0) is not the constant 1.
	AArch64CC::CondCode CC;
	};

	/// Helper structure to keep track of SetCC information.
	///
	/// A plain union: exactly one of the two members is meaningful at a time, and
	/// nothing in the union itself records which one.
	union SetCCInfo {
	// Description of a generic ISD::SETCC node.
	GenericSetCCInfo Generic;
	// Description of a comparison already lowered to an AArch64 CSEL.
	AArch64SetCCInfo AArch64;
	};

	/// Helper structure that makes a SetCCInfo readable by recording which union
	/// member is in use. When IsAArch64 is true, Info holds an AArch64SetCCInfo
	/// (Info.AArch64); otherwise it holds a GenericSetCCInfo (Info.Generic).
	struct SetCCInfoAndKind {
	// The SetCC description itself.
	SetCCInfo Info;
	// Discriminator selecting the meaningful member of Info.
	bool IsAArch64;
	};

	/// Check whether or not \p Op is a SET_CC operation, either a generic one or
	/// an AArch64 lowered one (a CSEL selecting between the constants 1 and 0).
	/// \p SetCCInfo is filled accordingly.
	/// \post SetCCInfo is meaningful only when this function returns true.
	/// \return True when Op is a kind of SET_CC operation.
	static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
	  const unsigned Opc = Op.getOpcode();

	  // A generic setcc is straightforward: record its operands and condition.
	  if (Opc == ISD::SETCC) {
	    GenericSetCCInfo &Generic = SetCCInfo.Info.Generic;
	    Generic.Opnd0 = &Op.getOperand(0);
	    Generic.Opnd1 = &Op.getOperand(1);
	    Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	    SetCCInfo.IsAArch64 = false;
	    return true;
	  }

	  // Otherwise, check if this is a matching csel instruction.
	  // In other words:
	  // - csel 1, 0, cc
	  // - csel 0, 1, !cc
	  if (Opc != AArch64ISD::CSEL)
	    return false;

	  // Set the information about the operands.
	  // TODO: we want the operands of the Cmp not the csel
	  AArch64SetCCInfo &Lowered = SetCCInfo.Info.AArch64;
	  Lowered.Cmp = &Op.getOperand(3);
	  SetCCInfo.IsAArch64 = true;
	  Lowered.CC = static_cast<AArch64CC::CondCode>(
	      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

	  // Both selected values must be constants ...
	  auto *TrueC = dyn_cast<ConstantSDNode>(Op.getOperand(0));
	  auto *FalseC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!TrueC || !FalseC)
	    return false;

	  // ... and one must be 1 while the other is 0. If the 1 is not on the true
	  // side, look at the pair the other way round and invert the condition.
	  if (!TrueC->isOne()) {
	    std::swap(TrueC, FalseC);
	    Lowered.CC = AArch64CC::getInvertedCondCode(Lowered.CC);
	  }
	  return TrueC->isOne() && FalseC->isNullValue();
	}

	// Returns true if Op is a setcc (as recognised by isSetCC) or a ZERO_EXTEND
	// of one. Info describes the setcc that was found.
	static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
	  if (isSetCC(Op, Info))
	    return true;

	  // Not a setcc itself: look through a single zero-extend.
	  if (Op.getOpcode() != ISD::ZERO_EXTEND)
	    return false;
	  return isSetCC(Op->getOperand(0), Info);
	}

	// The folding we want to perform is:
	// (add x, [zext] (setcc cc ...) )
	//   -->
	// (csel x, (add x, 1), !cc ...)
	//
	// The latter will get matched to a CSINC instruction.
	//
	// Op must be an ISD::ADD. Either operand may be the (possibly zero-extended)
	// setcc. Only comparisons of i32 or i64 values are handled. Returns the CSEL
	// node, or an empty SDValue when the pattern does not match.
	static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
	  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
	  SDValue LHS = Op->getOperand(0);
	  SDValue RHS = Op->getOperand(1);
	  SetCCInfoAndKind InfoAndKind;

	  // Arrange for LHS to be the SET_CC operand, leaving the other addend "x" in
	  // RHS. If neither operand is a SET_CC, give up.
	  if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
	    std::swap(LHS, RHS);
	    if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
	      return SDValue();
	  }

	  // FIXME: This could be generalized to work for FP comparisons.
	  EVT CmpVT = InfoAndKind.IsAArch64
	                  ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
	                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
	  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
	    return SDValue();

	  SDValue CCVal;
	  SDValue Cmp;
	  SDLoc dl(Op);
	  if (InfoAndKind.IsAArch64) {
	    // Already lowered: reuse the existing compare with the inverted condition.
	    CCVal = DAG.getConstant(
	        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
	        MVT::i32);
	    Cmp = *InfoAndKind.Info.AArch64.Cmp;
	  } else {
	    // Opnd0/Opnd1 are pointers to the setcc's operands; dereference them to
	    // pass the operand values, and build the compare for the inverted
	    // condition. CCVal is filled in through getAArch64Cmp's output parameter.
	    Cmp = getAArch64Cmp(
	        *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
	        ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
	        dl);
	  }

	  // csel x, (add x, 1), !cc
	  EVT VT = Op->getValueType(0);
	  SDValue Incremented =
	      DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
	  return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, Incremented, CCVal, Cmp);
	}

	// The basic add/sub long vector instructions have variants with "2" on the end
	// which act on the high-half of their inputs. They are normally matched by
	// patterns like:
	//
	// (add (zeroext (extract_high LHS)),
	//      (zeroext (extract_high RHS)))
	// -> uaddl2 vD, vN, vM
	//
	// If one of the extended operands is something like a duplicate rather than an
	// explicit extract, the instruction can still be used profitably; this
	// function rewrites the DAG into the form those patterns expect. Results that
	// are not 128-bit vectors are instead offered to performSetccAddFolding when
	// the node is an ADD.
	static SDValue performAddSubLongCombine(SDNode *N,
	                                        TargetLowering::DAGCombinerInfo &DCI,
	                                        SelectionDAG &DAG) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  MVT VT = N->getSimpleValueType(0);
	  if (!VT.is128BitVector())
	    return N->getOpcode() == ISD::ADD ? performSetccAddFolding(N, DAG)
	                                      : SDValue();

	  // Both operands must be extended, and in the same way.
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  const unsigned ExtOpc = LHS.getOpcode();
	  const bool IsExtend =
	      ExtOpc == ISD::ZERO_EXTEND || ExtOpc == ISD::SIGN_EXTEND;
	  if (!IsExtend || RHS.getOpcode() != ExtOpc)
	    return SDValue();

	  // It's not worth doing if at least one of the inputs isn't already an
	  // extract, but we don't know which it'll be so we have to try both. The
	  // side that is NOT the extract is the one to rewrite.
	  SDValue *ToRewrite = nullptr;
	  if (isEssentiallyExtractHighSubvector(LHS.getOperand(0)))
	    ToRewrite = &RHS;
	  else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0)))
	    ToRewrite = &LHS;

	  SDLoc DL(N);
	  if (ToRewrite) {
	    SDValue High = tryExtendDUPToExtractHigh(ToRewrite->getOperand(0), DAG);
	    if (!High.getNode())
	      return SDValue();
	    *ToRewrite = DAG.getNode(ExtOpc, DL, VT, High);
	  }

	  return DAG.getNode(N->getOpcode(), DL, VT, LHS, RHS);
	}

	// Massage DAGs which we can use the high-half "long" operations on into
	// something isel will recognize better. E.g.
	//
	// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
	//   (aarch64_neon_umull (extract_high (v2i64 vec)))
	//                       (extract_high (v2i64 (dup128 scalar)))))
	//
	// N is the intrinsic node (operand 0 is the intrinsic ID, operands 1 and 2
	// the two 64-bit vector inputs). IID is not consulted here.
	static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       SelectionDAG &DAG) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  assert(LHS.getValueType().is64BitVector() &&
	         RHS.getValueType().is64BitVector() &&
	         "unexpected shape for long operation");

	  // Either node could be a DUP, but it's not worth doing both of them (you'd
	  // just as well use the non-high version) so look for a corresponding extract
	  // operation on the other "wing".
	  SDValue *DupSide = nullptr;
	  if (isEssentiallyExtractHighSubvector(LHS))
	    DupSide = &RHS;
	  else if (isEssentiallyExtractHighSubvector(RHS))
	    DupSide = &LHS;

	  if (DupSide) {
	    *DupSide = tryExtendDUPToExtractHigh(*DupSide, DAG);
	    if (!DupSide->getNode())
	      return SDValue();
	  }

	  SDValue IntrinsicID = N->getOperand(0);
	  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
	                     IntrinsicID, LHS, RHS);
	}

	/// Try to replace a NEON shift-by-register intrinsic whose shift operand is a
	/// constant (a scalar constant or a constant splat of element width) with the
	/// corresponding immediate-shift node.
	///
	/// srshl/urshl are turned into right shifts by immediate and therefore need a
	/// negative amount in [-ElemBits, -1]; the remaining intrinsics become left
	/// shifts by immediate and need an amount in [0, ElemBits).
	///
	/// \return the immediate-shift node, or an empty SDValue if the shift amount
	/// is not a suitable constant.
	static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
	  const unsigned ElemBits =
	      N->getSimpleValueType(0).getScalarType().getSizeInBits();

	  SDValue AmtOp = N->getOperand(2);
	  int64_t ShiftAmount;
	  if (auto *BVN = dyn_cast<BuildVectorSDNode>(AmtOp)) {
	    APInt SplatValue, SplatUndef;
	    unsigned SplatBitSize;
	    bool HasAnyUndefs;
	    const bool IsSplat = BVN->isConstantSplat(
	        SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, ElemBits);
	    if (!IsSplat || SplatBitSize != ElemBits)
	      return SDValue();

	    ShiftAmount = SplatValue.getSExtValue();
	  } else if (auto *CVN = dyn_cast<ConstantSDNode>(AmtOp)) {
	    ShiftAmount = CVN->getSExtValue();
	  } else {
	    return SDValue();
	  }

	  unsigned Opcode;
	  bool IsRightShift;
	  switch (IID) {
	  default:
	    llvm_unreachable("Unknown shift intrinsic");
	  case Intrinsic::aarch64_neon_sqshl:
	    Opcode = AArch64ISD::SQSHL_I;
	    IsRightShift = false;
	    break;
	  case Intrinsic::aarch64_neon_uqshl:
	    Opcode = AArch64ISD::UQSHL_I;
	    IsRightShift = false;
	    break;
	  case Intrinsic::aarch64_neon_srshl:
	    Opcode = AArch64ISD::SRSHR_I;
	    IsRightShift = true;
	    break;
	  case Intrinsic::aarch64_neon_urshl:
	    Opcode = AArch64ISD::URSHR_I;
	    IsRightShift = true;
	    break;
	  case Intrinsic::aarch64_neon_sqshlu:
	    Opcode = AArch64ISD::SQSHLU_I;
	    IsRightShift = false;
	    break;
	  case Intrinsic::aarch64_neon_sshl:
	  case Intrinsic::aarch64_neon_ushl:
	    // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
	    // left shift for positive shift amounts. Below, we only replace the current
	    // node with VSHL, if this condition is met.
	    Opcode = AArch64ISD::VSHL;
	    IsRightShift = false;
	    break;
	  }

	  // Validate the amount for the chosen direction and compute the immediate.
	  int64_t Imm;
	  if (IsRightShift) {
	    if (ShiftAmount > -1 || ShiftAmount < -(int)ElemBits)
	      return SDValue();
	    Imm = -ShiftAmount;
	  } else {
	    if (ShiftAmount < 0 || ShiftAmount >= ElemBits)
	      return SDValue();
	    Imm = ShiftAmount;
	  }

	  SDLoc dl(N);
	  return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
	                     DAG.getConstant(Imm, dl, MVT::i32));
	}

	// The CRC32[BH] instructions ignore the high bits of their data operand. Since
	// the intrinsics must be legal and take an i32, this means there's almost
	// certainly going to be a zext in the DAG which we can eliminate.
	//
	// If the data operand is (and X, Mask), the intrinsic is rebuilt on X
	// directly; otherwise an empty SDValue is returned.
	static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
	  SDValue Data = N->getOperand(2);
	  if (Data.getOpcode() != ISD::AND)
	    return SDValue();

	  auto *MaskC = dyn_cast<ConstantSDNode>(Data.getOperand(1));
	  const bool IsRedundantMask = MaskC && MaskC->getZExtValue() == Mask;
	  if (!IsRedundantMask)
	    return SDValue();

	  SDValue Unmasked = Data.getOperand(0);
	  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
	                     N->getOperand(0), N->getOperand(1), Unmasked);
	}

	/// Rewrite an across-lanes intrinsic as node \p Opc applied to the vector
	/// operand (operand 1 of \p N), producing a vector of the same type, followed
	/// by an extract of lane 0 with N's result type.
	static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
	                                           SelectionDAG &DAG) {
	  SDLoc dl(N);
	  SDValue Vec = N->getOperand(1);
	  SDValue Reduced = DAG.getNode(Opc, dl, Vec.getSimpleValueType(), Vec);
	  SDValue Lane0 = DAG.getConstant(0, dl, MVT::i64);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Reduced,
	                     Lane0);
	}

	/// Lower an SVE integer reduction intrinsic (operand 1 = predicate, operand 2
	/// = data) to node \p Opc followed by an extract of lane 0.
	///
	/// The reduction node is given a fixed-length vector type of
	/// AArch64::NeonBitsPerVector bits with N's result type as element type.
	///
	/// \return the extracted scalar, or an empty SDValue if the data is not an
	/// integer vector, the result is not i8/i16/i32/i64, or the data type is not
	/// legal.
	static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc,
	                                    SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);
	  SDValue Pred = N->getOperand(1);
	  SDValue Data = N->getOperand(2);
	  EVT DataVT = Data.getValueType();

	  const bool IsIntData = DataVT.getVectorElementType().isScalarInteger();
	  const bool IsSupportedResult = VT == MVT::i8 || VT == MVT::i16 ||
	                                 VT == MVT::i32 || VT == MVT::i64;
	  if (!IsIntData || !IsSupportedResult)
	    return SDValue();

	  if (!DAG.getTargetLoweringInfo().isTypeLegal(DataVT))
	    return SDValue();

	  SDLoc dl(N);
	  EVT ReduceVT = EVT::getVectorVT(
	      *DAG.getContext(), VT, AArch64::NeonBitsPerVector / VT.getSizeInBits());
	  SDValue Reduce = DAG.getNode(Opc, dl, ReduceVT, Pred, Data);
	  SDValue Lane0 = DAG.getConstant(0, dl, MVT::i64);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Reduce, Lane0);
	}

	/// Lower the SVE index intrinsic to an AArch64ISD::INDEX_VECTOR node built
	/// from operands 1 and 2 of \p N. When operand 1 is i8 or i16, both operands
	/// are any-extended to i32 first.
	static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
	  SDLoc DL(N);
	  SDValue Base = N->getOperand(1);
	  SDValue Step = N->getOperand(2);

	  const EVT ScalarTy = Base.getValueType();
	  const bool NeedsWidening = ScalarTy == MVT::i8 || ScalarTy == MVT::i16;
	  if (NeedsWidening) {
	    Base = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Base);
	    Step = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Step);
	  }

	  EVT ResultVT = N->getValueType(0);
	  return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, ResultVT, Base, Step);
	}

	/// Lower the predicated SVE dup intrinsic (operand 1 = passthru, operand 2 =
	/// predicate, operand 3 = scalar) to AArch64ISD::DUP_MERGE_PASSTHRU, whose
	/// operands are ordered (predicate, scalar, passthru). An i8 or i16 scalar is
	/// any-extended to i32 first.
	static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
	  SDLoc dl(N);

	  SDValue Scalar = N->getOperand(3);
	  const EVT ScalarTy = Scalar.getValueType();
	  if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
	    Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);

	  SDValue Passthru = N->getOperand(1);
	  SDValue Pred = N->getOperand(2);
	  EVT ResultVT = N->getValueType(0);
	  return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, ResultVT, Pred,
	                     Scalar, Passthru);
	}

	/// Lower the SVE ext intrinsic to an AArch64ISD::EXT on byte vectors.
	///
	/// Both vector operands are bitcast to a scalable i8 vector of the same size
	/// and the element index (operand 3) is scaled by the element size in bytes.
	/// The EXT result is bitcast back to the original type.
	///
	/// \return the lowered value, or an empty SDValue if the known minimum size of
	/// the result type differs from AArch64::SVEBitsPerBlock.
	static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);
	  assert(VT.isScalableVector() && "Expected a scalable vector.");

	  // Current lowering only supports the SVE-ACLE types.
	  const auto MinSizeInBits = VT.getSizeInBits().getKnownMinSize();
	  if (MinSizeInBits != AArch64::SVEBitsPerBlock)
	    return SDValue();

	  SDLoc dl(N);
	  LLVMContext &Ctx = *DAG.getContext();

	  unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
	  unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
	  EVT ByteVT = EVT::getVectorVT(Ctx, MVT::i8, { ByteSize, true });

	  // Convert everything to the domain of EXT (i.e bytes).
	  SDValue BytesLo = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
	  SDValue BytesHi = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
	  SDValue ByteIndex = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
	                                  DAG.getConstant(ElemSize, dl, MVT::i32));

	  SDValue Extracted =
	      DAG.getNode(AArch64ISD::EXT, dl, ByteVT, BytesLo, BytesHi, ByteIndex);
	  return DAG.getNode(ISD::BITCAST, dl, VT, Extracted);
	}

	/// Try to turn an SVE "wide" compare intrinsic whose comparator (operand 3) is
	/// a DUP or SPLAT_VECTOR of a small constant into an
	/// AArch64ISD::SETCC_MERGE_ZERO against a splat of that constant in the type
	/// of the compared vector (operand 2).
	///
	/// The constant must lie in [-16, 15] for the signed intrinsics and in
	/// [0, 127] for the unsigned ones.
	///
	/// \return the SETCC_MERGE_ZERO node using condition \p CC, or an empty
	/// SDValue if the fold does not apply.
	static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
	                                        TargetLowering::DAGCombinerInfo &DCI,
	                                        SelectionDAG &DAG) {
	  if (DCI.isBeforeLegalize())
	    return SDValue();

	  SDValue Comparator = N->getOperand(3);
	  const unsigned ComparatorOpc = Comparator.getOpcode();
	  if (ComparatorOpc != AArch64ISD::DUP && ComparatorOpc != ISD::SPLAT_VECTOR)
	    return SDValue();

	  // Classify the intrinsic by the signedness of its immediate range.
	  bool IsSignedCompare;
	  switch (getIntrinsicID(N)) {
	  default:
	    llvm_unreachable("Called with wrong intrinsic!");
	  case Intrinsic::aarch64_sve_cmpeq_wide:
	  case Intrinsic::aarch64_sve_cmpne_wide:
	  case Intrinsic::aarch64_sve_cmpge_wide:
	  case Intrinsic::aarch64_sve_cmpgt_wide:
	  case Intrinsic::aarch64_sve_cmplt_wide:
	  case Intrinsic::aarch64_sve_cmple_wide:
	    IsSignedCompare = true;
	    break;
	  case Intrinsic::aarch64_sve_cmphs_wide:
	  case Intrinsic::aarch64_sve_cmphi_wide:
	  case Intrinsic::aarch64_sve_cmplo_wide:
	  case Intrinsic::aarch64_sve_cmpls_wide:
	    IsSignedCompare = false;
	    break;
	  }

	  // Only a splat of a constant can become an immediate.
	  auto *SplatC = dyn_cast<ConstantSDNode>(Comparator.getOperand(0));
	  if (!SplatC)
	    return SDValue();

	  SDLoc DL(N);
	  SDValue Imm;
	  if (IsSignedCompare) {
	    const int64_t ImmVal = SplatC->getSExtValue();
	    if (ImmVal < -16 || ImmVal > 15)
	      return SDValue();
	    Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
	  } else {
	    const uint64_t ImmVal = SplatC->getZExtValue();
	    if (ImmVal > 127)
	      return SDValue();
	    Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
	  }

	  EVT VT = N->getValueType(0);
	  EVT CmpVT = N->getOperand(2).getValueType();
	  SDValue Pred = N->getOperand(1);
	  SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
	  return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
	                     N->getOperand(2), Splat, DAG.getCondCode(CC));
	}

	/// Build a PTEST of \p Op under governing predicate \p Pg and materialise the
	/// outcome for condition \p Cond as a 0/1 value of type \p VT.
	///
	/// The CSEL is created in the type \p VT is transformed to by the target
	/// (so that the target-specific node uses a legal type) and the result is
	/// then zero-extended or truncated to \p VT.
	static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
	                        AArch64CC::CondCode Cond) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  assert(Op.getValueType().isScalableVector() &&
	         TLI.isTypeLegal(Op.getValueType()) &&
	         "Expected legal scalable vector type!");

	  SDLoc DL(Op);

	  // Ensure target specific opcodes are using legal type.
	  EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
	  SDValue One = DAG.getConstant(1, DL, LegalVT);
	  SDValue Zero = DAG.getConstant(0, DL, LegalVT);

	  // Set condition code (CC) flags.
	  SDValue Flags = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op);

	  // Convert CC to integer based on requested condition.
	  // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
	  SDValue InvertedCC =
	      DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
	  SDValue Selected =
	      DAG.getNode(AArch64ISD::CSEL, DL, LegalVT, Zero, One, InvertedCC, Flags);
	  return DAG.getZExtOrTrunc(Selected, DL, VT);
	}

	/// Rewrites the SVE floating-point reduction intrinsic \p N (operand 1 is the
	/// predicate, operand 2 the vector to reduce) as the predicated node \p Opc
	/// followed by an extract of element 0, typed like the intrinsic's result.
	static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
	                                     SelectionDAG &DAG) {
	  SDLoc Loc(N);
	  SDValue Governing = N->getOperand(1);
	  SDValue Input = N->getOperand(2);

	  // The reduction node is typed like its vector input: SVE reductions set the
	  // whole vector register, with the first element holding the result.
	  SDValue Reduced =
	      DAG.getNode(Opc, Loc, Input.getValueType(), Governing, Input);

	  // Extract that first element as the scalar result.
	  SDValue Lane0 = DAG.getConstant(0, Loc, MVT::i64);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, N->getValueType(0), Reduced,
	                     Lane0);
	}

	/// Rewrites the ordered SVE floating-point reduction intrinsic \p N (operand 1
	/// is the predicate, operand 2 the scalar initial value, operand 3 the vector
	/// to reduce) as the predicated node \p Opc followed by an extract of
	/// element 0.
	static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
	                                            SelectionDAG &DAG) {
	  SDLoc Loc(N);
	  SDValue Governing = N->getOperand(1);
	  SDValue Start = N->getOperand(2);
	  SDValue Input = N->getOperand(3);
	  EVT VecVT = Input.getValueType();
	  SDValue Lane0 = DAG.getConstant(0, Loc, MVT::i64);

	  // Ordered reductions take their initial value from the first lane of a
	  // vector operand, so place the scalar into lane 0 of an otherwise undefined
	  // vector.
	  SDValue Seed = DAG.getNode(ISD::INSERT_VECTOR_ELT, Loc, VecVT,
	                             DAG.getUNDEF(VecVT), Start, Lane0);

	  SDValue Reduced = DAG.getNode(Opc, Loc, VecVT, Governing, Seed, Input);

	  // SVE reductions set the whole vector register with the first element
	  // containing the reduction result; extract it as the scalar result.
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, N->getValueType(0), Reduced,
	                     Lane0);
	}

	/// Dispatches on the intrinsic ID of \p N and either forwards to a dedicated
	/// combine/lowering helper or directly rebuilds the intrinsic as an ISD or
	/// AArch64ISD node. Returns an empty SDValue for intrinsics that are not
	/// handled here (and for the SVE integer compares when the compared operand is
	/// a floating-point type).
	static SDValue performIntrinsicCombine(SDNode *N,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       const AArch64Subtarget *Subtarget) {
	  SelectionDAG &DAG = DCI.DAG;
	  unsigned IID = getIntrinsicID(N);

	  // Opc(op1, op2), typed like the intrinsic's result.
	  auto Binary = [&](unsigned Opc) -> SDValue {
	    return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), N->getOperand(1),
	                       N->getOperand(2));
	  };
	  // Opc(op1, op2, op3), typed like the intrinsic's result.
	  auto Ternary = [&](unsigned Opc) -> SDValue {
	    return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), N->getOperand(1),
	                       N->getOperand(2), N->getOperand(3));
	  };
	  // SETCC_MERGE_ZERO(op1, op2, op3, CC); not applied when op2 is floating
	  // point.
	  auto IntegerCompare = [&](ISD::CondCode CC) -> SDValue {
	    if (N->getOperand(2).getValueType().isFloatingPoint())
	      return SDValue();
	    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
	                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
	                       N->getOperand(3), DAG.getCondCode(CC));
	  };
	  // PTEST of op2 governed by op1, testing the requested condition.
	  auto PredicateTest = [&](AArch64CC::CondCode Cond) -> SDValue {
	    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
	                    Cond);
	  };

	  switch (IID) {
	  default:
	    break;

	  // NEON fixed-point conversions.
	  case Intrinsic::aarch64_neon_vcvtfxs2fp:
	  case Intrinsic::aarch64_neon_vcvtfxu2fp:
	    return tryCombineFixedPointConvert(N, DCI, DAG);

	  // NEON across-lanes reductions.
	  case Intrinsic::aarch64_neon_saddv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
	  case Intrinsic::aarch64_neon_uaddv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
	  case Intrinsic::aarch64_neon_sminv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
	  case Intrinsic::aarch64_neon_uminv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
	  case Intrinsic::aarch64_neon_smaxv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
	  case Intrinsic::aarch64_neon_umaxv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);

	  // NEON floating-point min/max map onto generic ISD nodes.
	  case Intrinsic::aarch64_neon_fmax:
	    return Binary(ISD::FMAXIMUM);
	  case Intrinsic::aarch64_neon_fmin:
	    return Binary(ISD::FMINIMUM);
	  case Intrinsic::aarch64_neon_fmaxnm:
	    return Binary(ISD::FMAXNUM);
	  case Intrinsic::aarch64_neon_fminnm:
	    return Binary(ISD::FMINNUM);

	  // NEON long multiplies.
	  case Intrinsic::aarch64_neon_smull:
	  case Intrinsic::aarch64_neon_umull:
	  case Intrinsic::aarch64_neon_pmull:
	  case Intrinsic::aarch64_neon_sqdmull:
	    return tryCombineLongOpWithDup(IID, N, DCI, DAG);

	  // NEON shifts.
	  case Intrinsic::aarch64_neon_sqshl:
	  case Intrinsic::aarch64_neon_uqshl:
	  case Intrinsic::aarch64_neon_sqshlu:
	  case Intrinsic::aarch64_neon_srshl:
	  case Intrinsic::aarch64_neon_urshl:
	  case Intrinsic::aarch64_neon_sshl:
	  case Intrinsic::aarch64_neon_ushl:
	    return tryCombineShiftImm(IID, N, DAG);

	  // CRC32 on bytes and half-words; the first argument is the data mask.
	  case Intrinsic::aarch64_crc32b:
	  case Intrinsic::aarch64_crc32cb:
	    return tryCombineCRC32(0xff, N, DAG);
	  case Intrinsic::aarch64_crc32h:
	  case Intrinsic::aarch64_crc32ch:
	    return tryCombineCRC32(0xffff, N, DAG);

	  // SVE integer reductions.
	  case Intrinsic::aarch64_sve_smaxv:
	    return LowerSVEIntReduction(N, AArch64ISD::SMAXV_PRED, DAG);
	  case Intrinsic::aarch64_sve_umaxv:
	    return LowerSVEIntReduction(N, AArch64ISD::UMAXV_PRED, DAG);
	  case Intrinsic::aarch64_sve_sminv:
	    return LowerSVEIntReduction(N, AArch64ISD::SMINV_PRED, DAG);
	  case Intrinsic::aarch64_sve_uminv:
	    return LowerSVEIntReduction(N, AArch64ISD::UMINV_PRED, DAG);
	  case Intrinsic::aarch64_sve_orv:
	    return LowerSVEIntReduction(N, AArch64ISD::ORV_PRED, DAG);
	  case Intrinsic::aarch64_sve_eorv:
	    return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG);
	  case Intrinsic::aarch64_sve_andv:
	    return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG);

	  // SVE vector construction and permutes.
	  case Intrinsic::aarch64_sve_index:
	    return LowerSVEIntrinsicIndex(N, DAG);
	  case Intrinsic::aarch64_sve_dup:
	    return LowerSVEIntrinsicDUP(N, DAG);
	  case Intrinsic::aarch64_sve_dup_x:
	    return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
	                       N->getOperand(1));
	  case Intrinsic::aarch64_sve_ext:
	    return LowerSVEIntrinsicEXT(N, DAG);

	  // SVE predicated min/max and shifts.
	  case Intrinsic::aarch64_sve_smin:
	    return Ternary(AArch64ISD::SMIN_MERGE_OP1);
	  case Intrinsic::aarch64_sve_umin:
	    return Ternary(AArch64ISD::UMIN_MERGE_OP1);
	  case Intrinsic::aarch64_sve_smax:
	    return Ternary(AArch64ISD::SMAX_MERGE_OP1);
	  case Intrinsic::aarch64_sve_umax:
	    return Ternary(AArch64ISD::UMAX_MERGE_OP1);
	  case Intrinsic::aarch64_sve_lsl:
	    return Ternary(AArch64ISD::SHL_MERGE_OP1);
	  case Intrinsic::aarch64_sve_lsr:
	    return Ternary(AArch64ISD::SRL_MERGE_OP1);
	  case Intrinsic::aarch64_sve_asr:
	    return Ternary(AArch64ISD::SRA_MERGE_OP1);

	  // SVE integer compares.
	  case Intrinsic::aarch64_sve_cmphs:
	    return IntegerCompare(ISD::SETUGE);
	  case Intrinsic::aarch64_sve_cmphi:
	    return IntegerCompare(ISD::SETUGT);
	  case Intrinsic::aarch64_sve_cmpge:
	    return IntegerCompare(ISD::SETGE);
	  case Intrinsic::aarch64_sve_cmpgt:
	    return IntegerCompare(ISD::SETGT);
	  case Intrinsic::aarch64_sve_cmpeq:
	    return IntegerCompare(ISD::SETEQ);
	  case Intrinsic::aarch64_sve_cmpne:
	    return IntegerCompare(ISD::SETNE);

	  // SVE floating-point reductions.
	  case Intrinsic::aarch64_sve_fadda:
	    return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
	  case Intrinsic::aarch64_sve_faddv:
	    return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
	  case Intrinsic::aarch64_sve_fmaxnmv:
	    return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
	  case Intrinsic::aarch64_sve_fmaxv:
	    return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
	  case Intrinsic::aarch64_sve_fminnmv:
	    return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
	  case Intrinsic::aarch64_sve_fminv:
	    return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);

	  case Intrinsic::aarch64_sve_sel:
	    return Ternary(ISD::VSELECT);

	  // SVE wide compares.
	  case Intrinsic::aarch64_sve_cmpeq_wide:
	    return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
	  case Intrinsic::aarch64_sve_cmpne_wide:
	    return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
	  case Intrinsic::aarch64_sve_cmpge_wide:
	    return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
	  case Intrinsic::aarch64_sve_cmpgt_wide:
	    return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
	  case Intrinsic::aarch64_sve_cmplt_wide:
	    return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
	  case Intrinsic::aarch64_sve_cmple_wide:
	    return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
	  case Intrinsic::aarch64_sve_cmphs_wide:
	    return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
	  case Intrinsic::aarch64_sve_cmphi_wide:
	    return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
	  case Intrinsic::aarch64_sve_cmplo_wide:
	    return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
	  case Intrinsic::aarch64_sve_cmpls_wide:
	    return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);

	  // SVE predicate tests.
	  case Intrinsic::aarch64_sve_ptest_any:
	    return PredicateTest(AArch64CC::ANY_ACTIVE);
	  case Intrinsic::aarch64_sve_ptest_first:
	    return PredicateTest(AArch64CC::FIRST_ACTIVE);
	  case Intrinsic::aarch64_sve_ptest_last:
	    return PredicateTest(AArch64CC::LAST_ACTIVE);
	  }
	  return SDValue();
	}

	/// DAG combine for vector extends.
	///
	/// Two independent transformations live here:
	///  1. After operation legalization has started, a zero-extend of a NEON
	///     sabd/uabd intrinsic is handed to tryCombineLongOpWithDup and, when that
	///     succeeds, re-wrapped in a ZERO_EXTEND.
	///  2. Before operation legalization, an extend from a 64-bit (known minimum
	///     size) simple vector type to an illegal simple vector type is performed
	///     in two steps: first to double the element width, then split into two
	///     halves that are extended separately and concatenated.
	///
	/// Returns an empty SDValue when neither applies.
	static SDValue performExtendCombine(SDNode *N,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    SelectionDAG &DAG) {
	  // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
	  // we can convert that DUP into another extract_high (of a bigger DUP), which
	  // helps the backend to decide that an sabdl2 would be useful, saving a real
	  // extract_high operation.
	  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
	      N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
	    SDNode *ABDNode = N->getOperand(0).getNode();
	    unsigned IID = getIntrinsicID(ABDNode);
	    if (IID == Intrinsic::aarch64_neon_sabd ||
	        IID == Intrinsic::aarch64_neon_uabd) {
	      SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
	      if (!NewABD.getNode())
	        return SDValue();

	      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
	                         NewABD);
	    }
	  }

	  // This is effectively a custom type legalization for AArch64.
	  //
	  // Type legalization will split an extend of a small, legal, type to a larger
	  // illegal type by first splitting the destination type, often creating
	  // illegal source types, which then get legalized in isel-confusing ways,
	  // leading to really terrible codegen. E.g.,
	  //   %result = v8i32 sext v8i8 %value
	  // becomes
	  //   %losrc = extract_subreg %value, ...
	  //   %hisrc = extract_subreg %value, ...
	  //   %lo = v4i32 sext v4i8 %losrc
	  //   %hi = v4i32 sext v4i8 %hisrc
	  // Things go rapidly downhill from there.
	  //
	  // For AArch64, the [sz]ext vector instructions can only go up one element
	  // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
	  // takes two instructions.
	  //
	  // This implies that the most efficient way to do the extend from v8i8
	  // to two v4i32 values is to first extend the v8i8 to v8i16, then allow
	  // the normal splitting to happen for the v8i16->v8i32.

	  // This is pre-legalization to catch some cases where the default
	  // type legalization will create ill-tempered code.
	  if (!DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // We're only interested in cleaning things up for non-legal vector types
	  // here. If both the source and destination are legal, things will just
	  // work naturally without any fiddling.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT ResVT = N->getValueType(0);
	  if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
	    return SDValue();
	  // If the vector type isn't a simple VT, it's beyond the scope of what
	  // we're worried about here. Let legalization do its thing and hope for
	  // the best.
	  SDValue Src = N->getOperand(0);
	  EVT SrcVT = Src->getValueType(0);
	  if (!ResVT.isSimple() || !SrcVT.isSimple())
	    return SDValue();

	  // If the source VT is a 64-bit fixed or scalable vector, we can play games
	  // and get the better results we want.
	  if (SrcVT.getSizeInBits().getKnownMinSize() != 64)
	    return SDValue();

	  // First step: extend to a vector with the same element count but elements
	  // twice as wide.
	  unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
	  ElementCount SrcEC = SrcVT.getVectorElementCount();
	  SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), SrcEC);
	  SDLoc DL(N);
	  Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);

	  // Now split the rest of the operation into two halves, each with a 64
	  // bit source.
	  EVT LoVT, HiVT;
	  SDValue Lo, Hi;
	  LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext());

	  EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
	                               LoVT.getVectorElementCount());
	  Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
	                   DAG.getConstant(0, DL, MVT::i64));
	  Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
	                   DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64));
	  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
	  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);

	  // Now combine the parts back together so we still have a single result
	  // like the combiner expects.
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
	}

	/// Replaces the non-truncating vector store \p St by \p NumVecElts chained
	/// scalar stores of \p SplatVal at consecutive element-sized offsets, and
	/// returns the last store in that chain.
	///
	/// \p NumVecElts must be at least 1.
	static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
	                               SDValue SplatVal, unsigned NumVecElts) {
	  assert(!St.isTruncatingStore() && "cannot split truncating vector store");
	  // The first store below is emitted unconditionally, so a count of zero
	  // cannot be honoured.
	  assert(NumVecElts != 0 && "cannot split a store of zero elements");
	  unsigned OrigAlignment = St.getAlignment();
	  unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;

	  // Create scalar stores. This is at least as good as the code sequence for a
	  // split unaligned store which is a dup.s, ext.b, and two stores.
	  // Most of the time the scalar stores should be replaced by store pair
	  // instructions (stp).
	  SDLoc DL(&St);
	  SDValue BasePtr = St.getBasePtr();
	  uint64_t BaseOffset = 0;

	  // Element 0 goes to the original, unmodified address.
	  const MachinePointerInfo &PtrInfo = St.getPointerInfo();
	  SDValue NewST1 =
	      DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
	                   OrigAlignment, St.getMemOperand()->getFlags());

	  // As this is in ISel, we will not merge this add which may degrade results.
	  // Peel a constant offset off the base pointer here so that it can be folded
	  // into each element's offset below.
	  if (BasePtr->getOpcode() == ISD::ADD &&
	      isa<ConstantSDNode>(BasePtr->getOperand(1))) {
	    BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
	    BasePtr = BasePtr->getOperand(0);
	  }

	  // Remaining elements. A bounded loop is used so that an (invalid) count of
	  // zero cannot wrap around the way a pre-decrementing loop would.
	  for (unsigned Idx = 1; Idx < NumVecElts; ++Idx) {
	    unsigned Offset = Idx * EltOffset;
	    unsigned Alignment = MinAlign(OrigAlignment, Offset);
	    SDValue OffsetPtr =
	        DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
	                    DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
	    NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
	                          PtrInfo.getWithOffset(Offset), Alignment,
	                          St.getMemOperand()->getFlags());
	  }
	  return NewST1;
	}

	// Maps the scalable vector type ContentTy onto the integer SVE vector type
	// with the same element count that is used as its container (nxv2i64,
	// nxv4i32, nxv8i16 or nxv16i8). ContentTy must be a simple type; any type
	// not listed below is a programming error.
	static MVT getSVEContainerType(EVT ContentTy) {
	  assert(ContentTy.isSimple() && "No SVE containers for extended types");

	  const MVT Content = ContentTy.getSimpleVT();
	  switch (Content.SimpleTy) {
	  // Two elements per 128-bit block -> 64-bit containers.
	  case MVT::nxv2i8:
	  case MVT::nxv2i16:
	  case MVT::nxv2i32:
	  case MVT::nxv2i64:
	  case MVT::nxv2f32:
	  case MVT::nxv2f64:
	    return MVT::nxv2i64;

	  // Four elements per 128-bit block -> 32-bit containers.
	  case MVT::nxv4i8:
	  case MVT::nxv4i16:
	  case MVT::nxv4i32:
	  case MVT::nxv4f32:
	    return MVT::nxv4i32;

	  // Eight elements per 128-bit block -> 16-bit containers.
	  case MVT::nxv8i8:
	  case MVT::nxv8i16:
	  case MVT::nxv8f16:
	  case MVT::nxv8bf16:
	    return MVT::nxv8i16;

	  // Sixteen byte elements already fill the container.
	  case MVT::nxv16i8:
	    return MVT::nxv16i8;

	  default:
	    llvm_unreachable("No known SVE container for this MVT type");
	  }
	}

	/// Rebuilds the SVE load intrinsic \p N (operand 0 chain, operand 2 predicate,
	/// operand 3 base address) as the target node \p Opc. Integer results are
	/// loaded in their SVE container type and truncated back to the requested
	/// type. Returns the merged {value, chain} pair, or an empty SDValue when the
	/// result's known minimum size exceeds one SVE block.
	static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
	  EVT VT = N->getValueType(0);
	  if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
	    return SDValue();

	  SDLoc Loc(N);

	  // Only integer types are widened to a container; everything else is loaded
	  // in its own type.
	  const bool IsInteger = VT.isInteger();
	  EVT ContainerVT = VT;
	  if (IsInteger)
	    ContainerVT = getSVEContainerType(VT);

	  SDValue Chain = N->getOperand(0);
	  SDValue Pg = N->getOperand(2);
	  SDValue Base = N->getOperand(3);
	  SDValue Ops[] = {Chain, Pg, Base, DAG.getValueType(VT)};

	  SDValue Load =
	      DAG.getNode(Opc, Loc, DAG.getVTList(ContainerVT, MVT::Other), Ops);
	  SDValue LoadChain = SDValue(Load.getNode(), 1);

	  // Narrow the container back down when it differs from the requested type.
	  if (IsInteger && VT != ContainerVT)
	    Load = DAG.getNode(ISD::TRUNCATE, Loc, VT, Load.getValue(0));

	  return DAG.getMergeValues({Load, LoadChain}, Loc);
	}

	/// Rebuilds the memory intrinsic \p N (operand 2 is the mask, operand 3 the
	/// base address) as an unindexed, non-extending masked load with a zero
	/// pass-through. Floating-point results are loaded as the equivalent integer
	/// type and bitcast back. Returns an empty SDValue for nxv8bf16 when the
	/// subtarget lacks BF16.
	static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
	  SDLoc Loc(N);
	  EVT VT = N->getValueType(0);
	  EVT PtrTy = N->getOperand(3).getValueType();

	  const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
	  if (VT == MVT::nxv8bf16 && !ST.hasBF16())
	    return SDValue();

	  const bool IsFP = VT.isFloatingPoint();
	  EVT LoadVT = IsFP ? VT.changeTypeToInteger() : VT;

	  auto *MemNode = cast<MemIntrinsicSDNode>(N);
	  SDValue Zero = DAG.getConstant(0, Loc, LoadVT);
	  SDValue NoOffset = DAG.getUNDEF(PtrTy);
	  SDValue Base = MemNode->getOperand(3);
	  SDValue Mask = MemNode->getOperand(2);
	  SDValue Loaded = DAG.getMaskedLoad(
	      LoadVT, Loc, MemNode->getChain(), Base, NoOffset, Mask, Zero,
	      MemNode->getMemoryVT(), MemNode->getMemOperand(), ISD::UNINDEXED,
	      ISD::NON_EXTLOAD, false);

	  if (!IsFP)
	    return Loaded;

	  // Hand back the value in its floating-point type together with the load's
	  // chain result.
	  SDValue Results[] = {DAG.getNode(ISD::BITCAST, Loc, VT, Loaded),
	                       Loaded.getValue(1)};
	  return DAG.getMergeValues(Results, Loc);
	}

	/// Rebuilds the replicating SVE load intrinsic \p N (operand 0 chain,
	/// operand 2 and operand 3 forwarded unchanged) as the target node \p Opcode,
	/// which must be LD1RQ_MERGE_ZERO or LD1RO_MERGE_ZERO. Floating-point results
	/// are loaded as the equivalent integer type and bitcast back. Returns the
	/// merged {value, chain} pair, or an empty SDValue for nxv8bf16 when the
	/// subtarget lacks BF16.
	template <unsigned Opcode>
	static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
	  static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
	                    Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
	                "Unsupported opcode.");
	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);
	  if (VT == MVT::nxv8bf16 &&
	      !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
	    return SDValue();

	  // The target node is built on the integer type of the same shape.
	  EVT LoadVT = VT;
	  if (VT.isFloatingPoint())
	    LoadVT = VT.changeTypeToInteger();

	  SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
	  SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
	  SDValue LoadChain = SDValue(Load.getNode(), 1);

	  if (VT.isFloatingPoint())
	    Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));

	  return DAG.getMergeValues({Load, LoadChain}, DL);
	}

	/// Rebuilds the SVE store intrinsic \p N (operand 2 data, operand 3 predicate,
	/// operand 4 base address) as an AArch64ISD::ST1_PRED node. The data is first
	/// brought into its SVE container type: bitcast for floating-point data,
	/// any-extended otherwise. Returns an empty SDValue for nxv8bf16 data when the
	/// subtarget lacks BF16.
	static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
	  SDLoc Loc(N);
	  SDValue Data = N->getOperand(2);
	  EVT DataVT = Data.getValueType();
	  EVT ContainerVT = getSVEContainerType(DataVT);
	  SDValue StoredVT = DAG.getValueType(DataVT);

	  const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
	  if (DataVT == MVT::nxv8bf16 && !ST.hasBF16())
	    return SDValue();

	  SDValue Widened;
	  if (DataVT.isFloatingPoint()) {
	    // Floating-point data is reinterpreted as the container type, and the
	    // node is told that the container type is what gets stored.
	    StoredVT = DAG.getValueType(ContainerVT);
	    Widened = DAG.getNode(ISD::BITCAST, Loc, ContainerVT, Data);
	  } else {
	    Widened = DAG.getNode(ISD::ANY_EXTEND, Loc, ContainerVT, Data);
	  }

	  SDValue Chain = N->getOperand(0);
	  SDValue Base = N->getOperand(4);
	  SDValue Pg = N->getOperand(3);
	  SDValue Ops[] = {Chain, Widened, Base, Pg, StoredVT};

	  return DAG.getNode(AArch64ISD::ST1_PRED, Loc, N->getValueType(0), Ops);
	}

	/// Rebuilds the memory intrinsic \p N (operand 2 data, operand 3 mask,
	/// operand 4 base address) as an unindexed masked store. Floating-point data
	/// is bitcast to the equivalent integer type first. Returns an empty SDValue
	/// for nxv8bf16 data when the subtarget lacks BF16.
	static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
	  SDLoc Loc(N);

	  SDValue Value = N->getOperand(2);
	  EVT ValueVT = Value.getValueType();
	  EVT PtrTy = N->getOperand(4).getValueType();

	  const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
	  if (ValueVT == MVT::nxv8bf16 && !ST.hasBF16())
	    return SDValue();

	  if (ValueVT.isFloatingPoint())
	    Value =
	        DAG.getNode(ISD::BITCAST, Loc, ValueVT.changeTypeToInteger(), Value);

	  auto *MemNode = cast<MemIntrinsicSDNode>(N);
	  SDValue Base = MemNode->getOperand(4);
	  SDValue NoOffset = DAG.getUNDEF(PtrTy);
	  SDValue Mask = MemNode->getOperand(3);
	  return DAG.getMaskedStore(MemNode->getChain(), Loc, Value, Base, NoOffset,
	                            Mask, MemNode->getMemoryVT(),
	                            MemNode->getMemOperand(), ISD::UNINDEXED, false,
	                            false);
	}

	/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
	/// load store optimizer pass will merge them to store pair stores. This should
	/// be better than a movi to create the vector zero followed by a vector store
	/// if the zero constant is not re-used, since one instruction and one register
	/// live range will be removed.
	///
	/// For example, the final generated code should be:
	///
	///   stp xzr, xzr, [x0]
	///
	/// instead of:
	///
	///   movi v0.2d, #0
	///   str q0, [x0]
	///
	/// Returns the last scalar store of the replacement chain, or an empty SDValue
	/// when the store is left alone.
	static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
	  SDValue StVal = St.getValue();
	  EVT VT = StVal.getValueType();

	  // Avoid scalarizing zero splat stores for scalable vectors.
	  if (VT.isScalableVector())
	    return SDValue();

	  // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
	  // 2, 3 or 4 i32 elements.
	  int NumVecElts = VT.getVectorNumElements();
	  if (!(((NumVecElts == 2 || NumVecElts == 3) &&
	         VT.getVectorElementType().getSizeInBits() == 64) ||
	        ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
	         VT.getVectorElementType().getSizeInBits() == 32)))
	    return SDValue();

	  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
	    return SDValue();

	  // If the zero constant has more than one use then the vector store could be
	  // better since the constant mov will be amortized and stp q instructions
	  // should be able to be formed.
	  if (!StVal.hasOneUse())
	    return SDValue();

	  // If the store is truncating then it's going down to i16 or smaller, which
	  // means it can be implemented in a single store anyway.
	  if (St.isTruncatingStore())
	    return SDValue();

	  // If the immediate offset of the address operand is too large for the stp
	  // instruction, then bail out.
	  if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
	    int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
	    if (Offset < -512 || Offset > 504)
	      return SDValue();
	  }

	  // Every element of the BUILD_VECTOR must be an integer or floating-point
	  // zero.
	  for (int I = 0; I < NumVecElts; ++I) {
	    SDValue EltVal = StVal.getOperand(I);
	    if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
	      return SDValue();
	  }

	  // Use a CopyFromReg WZR/XZR here to prevent
	  // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
	  SDLoc DL(&St);
	  unsigned ZeroReg;
	  EVT ZeroVT;
	  if (VT.getVectorElementType().getSizeInBits() == 32) {
	    ZeroReg = AArch64::WZR;
	    ZeroVT = MVT::i32;
	  } else {
	    ZeroReg = AArch64::XZR;
	    ZeroVT = MVT::i64;
	  }
	  SDValue SplatVal =
	      DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
	  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
	}

	/// Replaces a vector store whose value is a splat, built as a chain of
	/// INSERT_VECTOR_ELT nodes all inserting the same scalar, by scalar stores of
	/// that scalar. The load store optimizer pass will merge them to store pair
	/// stores. This has better performance than a splat of the scalar followed by
	/// a split vector store. Even if the stores are not merged it is four stores
	/// vs a dup, followed by an ext.b and two stores.
	///
	/// Returns an empty SDValue when the stored value does not match.
	static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
	  SDValue Cursor = St.getValue();
	  EVT VT = Cursor.getValueType();

	  // Don't replace floating point stores, they possibly won't be transformed to
	  // stp because of the store pair suppress pass.
	  if (VT.isFloatingPoint())
	    return SDValue();

	  // We can express a splat as store pair(s) for 2 or 4 elements.
	  const unsigned NumVecElts = VT.getVectorNumElements();
	  if (NumVecElts != 2 && NumVecElts != 4)
	    return SDValue();

	  // If the store is truncating then it's going down to i16 or smaller, which
	  // means it can be implemented in a single store anyway.
	  if (St.isTruncatingStore())
	    return SDValue();

	  // Walk NumVecElts links down the insert chain, ticking off each lane that
	  // gets written, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32. One bit per
	  // lane is set up front and cleared once that lane has been seen.
	  std::bitset<4> PendingLanes((1 << NumVecElts) - 1);
	  SDValue SplatVal;
	  for (unsigned Link = 0; Link != NumVecElts; ++Link) {
	    if (Cursor.getOpcode() != ISD::INSERT_VECTOR_ELT)
	      return SDValue();

	    // Every link must insert the very same value.
	    SDValue Inserted = Cursor.getOperand(1);
	    if (Link == 0)
	      SplatVal = Inserted;
	    else if (Inserted != SplatVal)
	      return SDValue();

	    // The lane has to be a constant that lies inside the vector.
	    auto *LaneC = dyn_cast<ConstantSDNode>(Cursor.getOperand(2));
	    if (!LaneC)
	      return SDValue();
	    const uint64_t Lane = LaneC->getZExtValue();
	    if (Lane >= NumVecElts)
	      return SDValue();
	    PendingLanes.reset(Lane);

	    Cursor = Cursor.getOperand(0);
	  }

	  // A lane that is still pending was never inserted to, so this is no splat.
	  if (PendingLanes.any())
	    return SDValue();

	  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
	}

	/// Store combine that breaks up fixed-length vector stores.
	///
	/// In order, it tries to: replace a store of a zero vector by scalar zero
	/// stores; then, only on subtargets where misaligned 128-bit stores are slow
	/// and outside of minsize functions, replace a 128-bit under-aligned store of
	/// a splat by scalar stores, or otherwise split it into two half-vector
	/// stores. Volatile and indexed stores are never touched.
	///
	/// Returns the replacement store, or an empty SDValue when nothing was done.
	static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
	                           SelectionDAG &DAG,
	                           const AArch64Subtarget *Subtarget) {

	  StoreSDNode *S = cast<StoreSDNode>(N);
	  if (S->isVolatile() || S->isIndexed())
	    return SDValue();

	  SDValue StVal = S->getValue();
	  EVT VT = StVal.getValueType();

	  if (!VT.isFixedLengthVector())
	    return SDValue();

	  // If we get a splat of zeros, convert this vector store to a store of
	  // scalars. They will be merged into store pairs of xzr thereby removing one
	  // instruction and one register.
	  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
	    return ReplacedZeroSplat;

	  // FIXME: The logic for deciding if an unaligned store should be split should
	  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
	  // a call to that function here.

	  if (!Subtarget->isMisaligned128StoreSlow())
	    return SDValue();

	  // Don't split at -Oz.
	  if (DAG.getMachineFunction().getFunction().hasMinSize())
	    return SDValue();

	  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
	  // those up regresses performance on micro-benchmarks and olden/bh.
	  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
	    return SDValue();

	  // Split unaligned 16B stores. They are terrible for performance.
	  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
	  // extensions can use this to mark that it does not want splitting to happen
	  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
	  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
	  if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
	      S->getAlignment() <= 2)
	    return SDValue();

	  // If we get a splat of a scalar convert this vector store to a store of
	  // scalars. They will be merged into store pairs thereby removing two
	  // instructions.
	  if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
	    return ReplacedSplat;

	  SDLoc DL(S);

	  // Split VT into two.
	  EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
	  unsigned NumElts = HalfVT.getVectorNumElements();
	  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
	                                   DAG.getConstant(0, DL, MVT::i64));
	  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
	                                   DAG.getConstant(NumElts, DL, MVT::i64));
	  SDValue BasePtr = S->getBasePtr();
	  SDValue NewST1 =
	      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
	                   S->getAlignment(), S->getMemOperand()->getFlags());
	  // VT is known to be 128 bits wide here, so the upper half starts 8 bytes
	  // past the base address.
	  // NOTE(review): the second store reuses the original pointer info rather
	  // than an 8-byte-offset variant -- confirm this is intended.
	  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
	                                  DAG.getConstant(8, DL, MVT::i64));
	  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
	                      S->getPointerInfo(), S->getAlignment(),
	                      S->getMemOperand()->getFlags());
	}

	/// Target-specific DAG combine function for post-increment LD1 (lane) and
	/// post-increment LD1R.
	///
	/// \p N is the node consuming a scalar load: operand 1 when \p IsLaneOp is
	/// true (with operand 0 the vector and operand 2 the lane), operand 0
	/// otherwise. If the load's address is also incremented by an ADD, the load,
	/// \p N and that ADD are replaced by a single LD1LANEpost / LD1DUPpost node.
	///
	/// The function always returns an empty SDValue; when the combine fires the
	/// replacements are installed through DCI.CombineTo.
	static SDValue performPostLD1Combine(SDNode *N,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     bool IsLaneOp) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);

	  if (VT.isScalableVector())
	    return SDValue();

	  unsigned LoadIdx = IsLaneOp ? 1 : 0;
	  SDNode *LD = N->getOperand(LoadIdx).getNode();
	  // If it is not LOAD, can not do such combine.
	  if (LD->getOpcode() != ISD::LOAD)
	    return SDValue();

	  // The vector lane must be a constant in the LD1LANE opcode.
	  SDValue Lane;
	  if (IsLaneOp) {
	    Lane = N->getOperand(2);
	    auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
	    if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
	      return SDValue();
	  }

	  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
	  EVT MemVT = LoadSDN->getMemoryVT();
	  // Check if memory operand is the same type as the vector element.
	  if (MemVT != VT.getVectorElementType())
	    return SDValue();

	  // Check if there are other uses. If so, do not combine as it will introduce
	  // an extra load.
	  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
	       ++UI) {
	    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
	      continue;
	    if (*UI != N)
	      return SDValue();
	  }

	  SDValue Addr = LD->getOperand(1);
	  SDValue Vector = N->getOperand(0);
	  // Search for a use of the address operand that is an increment.
	  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
	                            UE = Addr.getNode()->use_end();
	       UI != UE; ++UI) {
	    SDNode *User = *UI;
	    if (User->getOpcode() != ISD::ADD ||
	        UI.getUse().getResNo() != Addr.getResNo())
	      continue;

	    // If the increment is a constant, it must match the memory ref size.
	    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
	    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
	      // Compare at full 64-bit width: narrowing the constant first would let
	      // an increment such as 0x100000004 pass for a 4-byte element.
	      uint64_t IncVal = CInc->getZExtValue();
	      uint64_t NumBytes = VT.getScalarSizeInBits() / 8;
	      if (IncVal != NumBytes)
	        continue;
	      // A matching constant increment is represented by XZR on the new node.
	      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
	    }

	    // To avoid cycle construction make sure that neither the load nor the add
	    // are predecessors to each other or the Vector.
	    SmallPtrSet<const SDNode *, 32> Visited;
	    SmallVector<const SDNode *, 16> Worklist;
	    Visited.insert(Addr.getNode());
	    Worklist.push_back(User);
	    Worklist.push_back(LD);
	    Worklist.push_back(Vector.getNode());
	    if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
	        SDNode::hasPredecessorHelper(User, Visited, Worklist))
	      continue;

	    SmallVector<SDValue, 8> Ops;
	    Ops.push_back(LD->getOperand(0)); // Chain
	    if (IsLaneOp) {
	      Ops.push_back(Vector); // The vector to be inserted
	      Ops.push_back(Lane);   // The lane to be inserted in the vector
	    }
	    Ops.push_back(Addr);
	    Ops.push_back(Inc);

	    // Results: the vector, the written-back address and the chain.
	    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
	    SDVTList SDTys = DAG.getVTList(Tys);
	    unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
	    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
	                                           MemVT,
	                                           LoadSDN->getMemOperand());

	    // Update the uses.
	    SDValue NewResults[] = {
	        SDValue(LD, 0),            // The result of load
	        SDValue(UpdN.getNode(), 2) // Chain
	    };
	    DCI.CombineTo(LD, NewResults);
	    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));    // Dup/Inserted Result
	    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register

	    break;
	  }
	  return SDValue();
	}

	/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
	/// address translation: only the low 56 of its 64 bits are treated as
	/// demanded. Returns true, after committing the change through \p DCI, when
	/// a simplification was made.
	static bool performTBISimplification(SDValue Addr,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     SelectionDAG &DAG) {
	  const APInt LowBits = APInt::getLowBitsSet(64, 56);
	  KnownBits Known;
	  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	                                        !DCI.isBeforeLegalizeOps());

	  if (!DAG.getTargetLoweringInfo().SimplifyDemandedBits(Addr, LowBits, Known,
	                                                        TLO))
	    return false;

	  DCI.CommitTargetLoweringOpt(TLO);
	  return true;
	}

	/// Combine for store nodes: first tries splitStores and, failing that,
	/// simplifies the address operand (operand 2) on subtargets that ignore the
	/// top address byte. Returns the replacement store, \p N itself when only
	/// its address was simplified, or an empty SDValue when nothing changed.
	static SDValue performSTORECombine(SDNode *N,
	                                   TargetLowering::DAGCombinerInfo &DCI,
	                                   SelectionDAG &DAG,
	                                   const AArch64Subtarget *Subtarget) {
	  SDValue Split = splitStores(N, DCI, DAG, Subtarget);
	  if (Split)
	    return Split;

	  // The address simplification is only attempted when the subtarget supports
	  // ignoring the top byte.
	  const bool AddrSimplified =
	      Subtarget->supportsAddressTopByteIgnored() &&
	      performTBISimplification(N->getOperand(2), DCI, DAG);
	  return AddrSimplified ? SDValue(N, 0) : SDValue();
	}


	/// Target-specific DAG combine function for NEON load/store intrinsics
	/// to merge base address updates.
	///
	/// \p N is the NEON load/store intrinsic node: operand 0 is the chain,
	/// operand 1 the intrinsic ID and the last operand the address. If the address
	/// also feeds an ISD::ADD that is independent of \p N, and the increment is
	/// either not a constant or a constant equal to the number of bytes accessed,
	/// both nodes are replaced by one post-indexed AArch64ISD node that produces
	/// the loaded vectors (if any), the written-back address and the chain.
	///
	/// \returns always an empty SDValue; the replacements are performed through
	/// DCI.CombineTo.
	static SDValue performNEONPostLDSTCombine(SDNode *N,
	                                          TargetLowering::DAGCombinerInfo &DCI,
	                                          SelectionDAG &DAG) {
	  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
	    return SDValue();

	  unsigned AddrOpIdx = N->getNumOperands() - 1;
	  SDValue Addr = N->getOperand(AddrOpIdx);

	  // Search for a use of the address operand that is an increment.
	  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
	                            UE = Addr.getNode()->use_end();
	       UI != UE; ++UI) {
	    // Dereferencing the use iterator yields the using node.
	    SDNode *User = *UI;
	    if (User->getOpcode() != ISD::ADD ||
	        UI.getUse().getResNo() != Addr.getResNo())
	      continue;

	    // Check that the add is independent of the load/store. Otherwise, folding
	    // it would create a cycle.
	    SmallPtrSet<const SDNode *, 32> Visited;
	    SmallVector<const SDNode *, 16> Worklist;
	    Visited.insert(Addr.getNode());
	    Worklist.push_back(N);
	    Worklist.push_back(User);
	    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
	        SDNode::hasPredecessorHelper(User, Visited, Worklist))
	      continue;

	    // Find the new opcode for the updating load/store.
	    bool IsStore = false;
	    bool IsLaneOp = false;
	    bool IsDupOp = false;
	    unsigned NewOpc = 0;
	    unsigned NumVecs = 0;
	    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	    switch (IntNo) {
	    default: llvm_unreachable("unexpected intrinsic for Neon base update");
	    case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
	      NumVecs = 2; break;
	    case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
	      NumVecs = 3; break;
	    case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
	      NumVecs = 4; break;
	    case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
	      NumVecs = 2; IsStore = true; break;
	    case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
	      NumVecs = 3; IsStore = true; break;
	    case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
	      NumVecs = 4; IsStore = true; break;
	    case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
	      NumVecs = 2; break;
	    case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
	      NumVecs = 3; break;
	    case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
	      NumVecs = 4; break;
	    case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
	      NumVecs = 2; IsStore = true; break;
	    case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
	      NumVecs = 3; IsStore = true; break;
	    case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
	      NumVecs = 4; IsStore = true; break;
	    case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
	      NumVecs = 2; IsDupOp = true; break;
	    case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
	      NumVecs = 3; IsDupOp = true; break;
	    case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
	      NumVecs = 4; IsDupOp = true; break;
	    case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
	      NumVecs = 2; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
	      NumVecs = 3; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
	      NumVecs = 4; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
	      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
	      NumVecs = 3; IsStore = true; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
	      NumVecs = 4; IsStore = true; IsLaneOp = true; break;
	    }

	    // Stores carry the vector type on their first data operand, loads on
	    // their first result.
	    EVT VecTy;
	    if (IsStore)
	      VecTy = N->getOperand(2).getValueType();
	    else
	      VecTy = N->getValueType(0);

	    // If the increment is a constant, it must match the memory ref size.
	    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
	    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
	      uint32_t IncVal = CInc->getZExtValue();
	      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
	      // Lane and dup forms access one element per vector, not a whole vector.
	      if (IsLaneOp || IsDupOp)
	        NumBytes /= VecTy.getVectorNumElements();
	      if (IncVal != NumBytes)
	        continue;
	      // A matching constant increment is represented by the XZR register.
	      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
	    }
	    SmallVector<SDValue, 8> Ops;
	    Ops.push_back(N->getOperand(0)); // Incoming chain
	    // Load lane and store have vector list as input.
	    if (IsLaneOp || IsStore)
	      for (unsigned i = 2; i < AddrOpIdx; ++i)
	        Ops.push_back(N->getOperand(i));
	    Ops.push_back(Addr); // Base register
	    Ops.push_back(Inc);

	    // Return Types.
	    EVT Tys[6];
	    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
	    unsigned n;
	    for (n = 0; n < NumResultVecs; ++n)
	      Tys[n] = VecTy;
	    Tys[n++] = MVT::i64; // Type of write back register
	    Tys[n] = MVT::Other; // Type of the chain
	    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

	    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
	    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
	                                           MemInt->getMemoryVT(),
	                                           MemInt->getMemOperand());

	    // Update the uses: the vector results keep their indices, the chain of the
	    // new node sits after the write-back result.
	    std::vector<SDValue> NewResults;
	    for (unsigned i = 0; i < NumResultVecs; ++i) {
	      NewResults.push_back(SDValue(UpdN.getNode(), i));
	    }
	    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
	    DCI.CombineTo(N, NewResults);
	    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

	    break;
	  }
	  return SDValue();
	}

	// Checks to see if the value is the prescribed width and returns information
	// about its extension mode.
	//
	// V       - the value to inspect.
	// width   - the expected width in bits; only 8 and 16 are recognised for
	//           loads and AssertSext/AssertZext nodes.
	// ExtType - always overwritten: reset to NON_EXTLOAD on entry, then set to the
	//           load's extension type, SEXTLOAD or ZEXTLOAD when V is a matching
	//           load, AssertSext or AssertZext respectively.
	//
	// Returns true for a matching load/assert node, or for a constant whose
	// signed value lies strictly between -2^(width-1) and 2^(width-1). Any other
	// node yields false.
	static
	bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
	  ExtType = ISD::NON_EXTLOAD;
	  switch (V.getNode()->getOpcode()) {
	  default:
	    return false;
	  case ISD::LOAD: {
	    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
	    if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) ||
	        (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
	      ExtType = LoadNode->getExtensionType();
	      return true;
	    }
	    return false;
	  }
	  case ISD::AssertSext: {
	    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
	    if ((TypeNode->getVT() == MVT::i8 && width == 8) ||
	        (TypeNode->getVT() == MVT::i16 && width == 16)) {
	      ExtType = ISD::SEXTLOAD;
	      return true;
	    }
	    return false;
	  }
	  case ISD::AssertZext: {
	    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
	    if ((TypeNode->getVT() == MVT::i8 && width == 8) ||
	        (TypeNode->getVT() == MVT::i16 && width == 16)) {
	      ExtType = ISD::ZEXTLOAD;
	      return true;
	    }
	    return false;
	  }
	  case ISD::Constant:
	  case ISD::TargetConstant: {
	    // Equivalent to |Val| < 2^(width-1), but written as a two-sided range
	    // check: std::abs(INT64_MIN) is undefined and would typically come back
	    // negative, making the most negative constant look "narrow".
	    const int64_t Val = cast<ConstantSDNode>(V.getNode())->getSExtValue();
	    const int64_t Limit = 1LL << (width - 1);
	    return Val > -Limit && Val < Limit;
	  }
	  }
	}

	// This function does a whole lot of voodoo to determine if the tests are
	// equivalent without and with a mask. Essentially what happens is that given a
	// DAG resembling:
	//
	//  +-------------+ +-------------+ +-------------+ +-------------+
	//  |    Input    | | AddConstant | | CompConstant| |     CC      |
	//  +-------------+ +-------------+ +-------------+ +-------------+
	//           |           |           |               |
	//           V           V           |    +----------+
	//          +-------------+  +----+  |    |
	//          |     ADD     |  |0xff|  |    |
	//          +-------------+  +----+  |    |
	//                  |           |    |    |
	//                  V           V    |    |
	//                 +-------------+   |    |
	//                 |     AND     |   |    |
	//                 +-------------+   |    |
	//                        |          |    |
	//                        +-----+    |    |
	//                              |    |    |
	//                              V    V    V
	//                             +-------------+
	//                             |     CMP     |
	//                             +-------------+
	//
	// The AND node may be safely removed for some combinations of inputs. In
	// particular we need to take into account the extension type of the Input,
	// the exact values of AddConstant, CompConstant, and CC, along with the nominal
	// width of the input (this can work for any width inputs, the above graph is
	// specific to 8 bits).
	//
	// The specific equations were worked out by generating output tables for each
	// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
	// problem was simplified by working with 4 bit inputs, which means we only
	// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
	// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
	// patterns present in both extensions (0,7). For every distinct set of
	// AddConstant and CompConstant bit patterns we can consider the masked and
	// unmasked versions to be equivalent if the result of this function is true for
	// all 16 distinct bit patterns of the current extension type of Input (w0).
	//
	// sub w8, w0, w1
	// and w10, w8, #0x0f
	// cmp w8, w2
	// cset w9, AArch64CC
	// cmp w10, w2
	// cset w11, AArch64CC
	// cmp w9, w11
	// cset w0, eq
	// ret
	//
	// Since the above function shows when the outputs are equivalent it defines
	// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
	// would be expensive to run during compiles. The equations below were written
	// in a test harness that confirmed they gave equivalent outputs to the above
	// function for all inputs, so they can be used to determine if the removal is
	// legal instead.
	//
	// isEquivalentMaskless() is the code for testing if the AND can be removed,
	// factored out of the DAG recognition as the DAG can take several forms.

	// Decides whether a compare of a masked (ADD + AND) value can be replaced by
	// the same compare of the unmasked ADD result.
	//
	// CC           - the AArch64CC condition code consuming the compare.
	// width        - nominal width of the input in bits.
	// ExtType      - extension type of the input; SEXTLOAD inputs are handled by
	//                displacing AddConstant by half the integer range.
	// AddConstant  - the constant added to the input.
	// CompConstant - the constant the result is compared against.
	//
	// Returns true when the masked and unmasked compares are equivalent for the
	// given condition code, false otherwise (including AArch64CC::Invalid).
	static bool isEquivalentMaskless(unsigned CC, unsigned width,
	                                 ISD::LoadExtType ExtType, int AddConstant,
	                                 int CompConstant) {
	  // By being careful about our equations and only writing them in terms of
	  // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
	  // make them generally applicable to all bit widths.
	  // Note that, despite its name, MaxUInt holds 2^width, i.e. one past the
	  // largest value representable in `width` bits.
	  int MaxUInt = (1 << width);

	  // For the purposes of these comparisons sign extending the type is
	  // equivalent to zero extending the add and displacing it by half the integer
	  // width. Provided we are careful and make sure our equations are valid over
	  // the whole range we can just adjust the input and avoid writing equations
	  // for sign extended inputs.
	  if (ExtType == ISD::SEXTLOAD)
	    AddConstant -= (1 << (width-1));

	  switch(CC) {
	  case AArch64CC::LE:
	  case AArch64CC::GT:
	    if ((AddConstant == 0) ||
	        (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
	        (AddConstant >= 0 && CompConstant < 0) ||
	        (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
	      return true;
	    break;
	  case AArch64CC::LT:
	  case AArch64CC::GE:
	    if ((AddConstant == 0) ||
	        (AddConstant >= 0 && CompConstant <= 0) ||
	        (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
	      return true;
	    break;
	  case AArch64CC::HI:
	  case AArch64CC::LS:
	    if ((AddConstant >= 0 && CompConstant < 0) ||
	        (AddConstant <= 0 && CompConstant >= -1 &&
	         CompConstant < AddConstant + MaxUInt))
	      return true;
	    break;
	  case AArch64CC::PL:
	  case AArch64CC::MI:
	    if ((AddConstant == 0) ||
	        (AddConstant > 0 && CompConstant <= 0) ||
	        (AddConstant < 0 && CompConstant <= AddConstant))
	      return true;
	    break;
	  case AArch64CC::LO:
	  case AArch64CC::HS:
	    if ((AddConstant >= 0 && CompConstant <= 0) ||
	        (AddConstant <= 0 && CompConstant >= 0 &&
	         CompConstant <= AddConstant + MaxUInt))
	      return true;
	    break;
	  case AArch64CC::EQ:
	  case AArch64CC::NE:
	    if ((AddConstant > 0 && CompConstant < 0) ||
	        (AddConstant < 0 && CompConstant >= 0 &&
	         CompConstant < AddConstant + MaxUInt) ||
	        (AddConstant >= 0 && CompConstant >= 0 &&
	         CompConstant >= AddConstant) ||
	        (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
	      return true;
	    break;
	  // These conditions are accepted unconditionally.
	  case AArch64CC::VS:
	  case AArch64CC::VC:
	  case AArch64CC::AL:
	  case AArch64CC::NV:
	    return true;
	  case AArch64CC::Invalid:
	    break;
	  }

	  return false;
	}

	// Removes a superfluous 8- or 16-bit AND mask that feeds the SUBS used by a
	// conditional node.
	//
	// N        - the node consuming the condition.
	// CCIndex  - index of N's condition-code operand (a ConstantSDNode).
	// CmpIndex - index of N's compare operand; only AArch64ISD::SUBS is handled.
	//
	// The pattern matched is (SUBS (AND (ADD x, C1), 0xff|0xffff), C2) where all
	// inputs pass checkValueWidth() and isEquivalentMaskless() approves the
	// combination. In that case every use of the SUBS is redirected to a new
	// SUBS of the unmasked ADD and SDValue(N, 0) is returned. Otherwise an empty
	// SDValue is returned and nothing is changed.
	static
	SDValue performCONDCombine(SDNode *N,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           SelectionDAG &DAG, unsigned CCIndex,
	                           unsigned CmpIndex) {
	  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
	  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
	  unsigned CondOpcode = SubsNode->getOpcode();

	  if (CondOpcode != AArch64ISD::SUBS)
	    return SDValue();

	  // There is a SUBS feeding this condition. Is it fed by a mask we can
	  // use?

	  SDNode *AndNode = SubsNode->getOperand(0).getNode();
	  unsigned MaskBits = 0;

	  if (AndNode->getOpcode() != ISD::AND)
	    return SDValue();

	  // Only the 0xff and 0xffff masks are recognised.
	  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
	    uint32_t CNV = CN->getZExtValue();
	    if (CNV == 255)
	      MaskBits = 8;
	    else if (CNV == 65535)
	      MaskBits = 16;
	  }

	  if (!MaskBits)
	    return SDValue();

	  SDValue AddValue = AndNode->getOperand(0);

	  if (AddValue.getOpcode() != ISD::ADD)
	    return SDValue();

	  // The basic dag structure is correct, grab the inputs and validate them.

	  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
	  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
	  SDValue SubsInputValue = SubsNode->getOperand(1);

	  // The mask is present and the provenance of all the values is a smaller type,
	  // lets see if the mask is superfluous.

	  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
	      !isa<ConstantSDNode>(SubsInputValue.getNode()))
	    return SDValue();

	  ISD::LoadExtType ExtType;

	  // checkValueWidth() overwrites ExtType on every call, so the order matters:
	  // the value that survives is the one reported for AddInputValue1, the
	  // non-constant input of the ADD.
	  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
	      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
	      !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
	    return SDValue();

	  if(!isEquivalentMaskless(CC, MaskBits, ExtType,
	                cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
	                cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
	    return SDValue();

	  // The AND is not necessary, remove it.

	  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
	                               SubsNode->getValueType(1));
	  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };

	  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
	  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());

	  return SDValue(N, 0);
	}

	// Optimize compare with zero and branch.
	//
	// N is the conditional branch node with operands (Chain, Dest, CC, Cmp). When
	// CC is EQ or NE and Cmp is an ADDS/SUBS against zero whose value result is
	// unused and whose flag result has exactly one use, the branch is replaced by
	// AArch64ISD::CBZ / AArch64ISD::CBNZ on the non-zero operand.
	//
	// Always returns an empty SDValue; a successful fold is performed through
	// DCI.CombineTo.
	static SDValue performBRCONDCombine(SDNode *N,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    SelectionDAG &DAG) {
	  MachineFunction &MF = DAG.getMachineFunction();
	  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
	  // will not be produced, as they are conditional branch instructions that do
	  // not set flags.
	  if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
	    return SDValue();

	  // First try to drop a redundant mask in front of the compare.
	  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
	    N = NV.getNode();
	  SDValue Chain = N->getOperand(0);
	  SDValue Dest = N->getOperand(1);
	  SDValue CCVal = N->getOperand(2);
	  SDValue Cmp = N->getOperand(3);

	  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
	  unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
	  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
	    return SDValue();

	  unsigned CmpOpc = Cmp.getOpcode();
	  if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
	    return SDValue();

	  // Only attempt folding if there is only one use of the flag and no use of the
	  // value.
	  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
	    return SDValue();

	  SDValue LHS = Cmp.getOperand(0);
	  SDValue RHS = Cmp.getOperand(1);

	  assert(LHS.getValueType() == RHS.getValueType() &&
	         "Expected the value type to be the same for both operands!");
	  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
	    return SDValue();

	  // Canonicalise so that the zero, if any, is on the right-hand side.
	  if (isNullConstant(LHS))
	    std::swap(LHS, RHS);

	  if (!isNullConstant(RHS))
	    return SDValue();

	  // Shifted operands are left alone.
	  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
	      LHS.getOpcode() == ISD::SRL)
	    return SDValue();

	  // Fold the compare into the branch instruction.
	  SDValue BR;
	  if (CC == AArch64CC::EQ)
	    BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
	  else
	    BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);

	  // Do not add new nodes to DAG combiner worklist.
	  DCI.CombineTo(N, BR, false);

	  return SDValue();
	}

	// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
	// as well as whether the test should be inverted. This code is required to
	// catch these cases (as opposed to standard dag combines) because
	// AArch64ISD::TBZ is matched during legalization.
	//
	// Op     - the value whose bit is tested.
	// Bit    - in/out: index of the tested bit, updated when looking through
	//          shifts.
	// Invert - in/out: toggled when looking through an XOR that flips the bit.
	//
	// Returns the (possibly deeper) operand to test; Op itself when nothing can
	// be looked through.
	static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
	                                 SelectionDAG &DAG) {
	  // Values with other users are left untouched.
	  if (!Op->hasOneUse())
	    return Op;

	  // Continue the search on the first operand of Op.
	  auto LookThrough = [&]() {
	    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
	  };

	  const unsigned Opc = Op->getOpcode();

	  // We don't handle undef/constant-fold cases below, as they should have
	  // already been taken care of (e.g. and of 0, test of undefined shifted bits,
	  // etc.)

	  // (tbz (trunc x), b) -> (tbz x, b)
	  // This case is just here to enable more of the below cases to be caught.
	  if (Opc == ISD::TRUNCATE && Bit < Op->getValueType(0).getSizeInBits())
	    return LookThrough();

	  // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
	  if (Opc == ISD::ANY_EXTEND && Bit < Op->getOperand(0).getValueSizeInBits())
	    return LookThrough();

	  // Everything below is a binary node with a constant right-hand side.
	  if (Op->getNumOperands() != 2)
	    return Op;

	  auto *ImmNode = dyn_cast<ConstantSDNode>(Op->getOperand(1));
	  if (!ImmNode)
	    return Op;

	  // Bail out on opcodes we do not look through before reading the constant.
	  switch (Opc) {
	  case ISD::AND:
	  case ISD::SHL:
	  case ISD::SRA:
	  case ISD::SRL:
	  case ISD::XOR:
	    break;
	  default:
	    return Op;
	  }

	  const uint64_t Imm = ImmNode->getZExtValue();

	  // (tbz (and x, m), b) -> (tbz x, b)
	  if (Opc == ISD::AND) {
	    if ((Imm >> Bit) & 1)
	      return LookThrough();
	    return Op;
	  }

	  // (tbz (shl x, c), b) -> (tbz x, b-c)
	  if (Opc == ISD::SHL) {
	    if (Imm <= Bit && (Bit - Imm) < Op->getValueType(0).getSizeInBits()) {
	      Bit = Bit - Imm;
	      return LookThrough();
	    }
	    return Op;
	  }

	  // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
	  if (Opc == ISD::SRA) {
	    Bit = Bit + Imm;
	    if (Bit >= Op->getValueType(0).getSizeInBits())
	      Bit = Op->getValueType(0).getSizeInBits() - 1;
	    return LookThrough();
	  }

	  // (tbz (srl x, c), b) -> (tbz x, b+c)
	  if (Opc == ISD::SRL) {
	    if ((Bit + Imm) < Op->getValueType(0).getSizeInBits()) {
	      Bit = Bit + Imm;
	      return LookThrough();
	    }
	    return Op;
	  }

	  // Only XOR is left.
	  // (tbz (xor x, -1), b) -> (tbnz x, b)
	  if ((Imm >> Bit) & 1)
	    Invert = !Invert;
	  return LookThrough();
	}

	// Optimize test single bit zero/non-zero and branch.
	//
	// N is a TBZ/TBNZ node with operands (Chain, TestSrc, Bit, Dest). The tested
	// value is simplified with getTestBitOperand(); when that finds a different
	// source, a new node testing that source is returned, with TBZ and TBNZ
	// swapped if the simplification inverted the tested bit. Returns an empty
	// SDValue when the source is unchanged.
	static SDValue performTBZCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 SelectionDAG &DAG) {
	  SDValue TestSrc = N->getOperand(1);
	  unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
	  bool Invert = false;

	  SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
	  if (NewTestSrc == TestSrc)
	    return SDValue();

	  unsigned NewOpc = N->getOpcode();
	  if (Invert) {
	    if (NewOpc != AArch64ISD::TBZ) {
	      assert(NewOpc == AArch64ISD::TBNZ);
	      NewOpc = AArch64ISD::TBZ;
	    } else {
	      NewOpc = AArch64ISD::TBNZ;
	    }
	  }

	  SDLoc DL(N);
	  SDValue Chain = N->getOperand(0);
	  SDValue Dest = N->getOperand(3);
	  SDValue BitNo = DAG.getConstant(Bit, DL, MVT::i64);
	  return DAG.getNode(NewOpc, DL, MVT::Other, Chain, NewTestSrc, BitNo, Dest);
	}

	// vselect (v1i1 setcc) ->
	// vselect (v1iXX setcc) (XX is the size of the compared operand type)
	// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
	// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
	// such VSELECT.
	//
	// Returns the rebuilt VSELECT, or an empty SDValue when the condition is not
	// a single-element i1 SETCC or the result and compared operands differ in
	// size.
	static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
	  SDValue N0 = N->getOperand(0);
	  EVT CCVT = N0.getValueType();

	  if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
	      CCVT.getVectorElementType() != MVT::i1)
	    return SDValue();

	  EVT ResVT = N->getValueType(0);
	  EVT CmpVT = N0.getOperand(0).getValueType();
	  // Only combine when the result type is of the same size as the compared
	  // operands.
	  if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
	    return SDValue();

	  SDValue IfTrue = N->getOperand(1);
	  SDValue IfFalse = N->getOperand(2);
	  // Re-emit the compare with an integer result as wide as its operands.
	  SDValue SetCC =
	      DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
	                   N0.getOperand(0), N0.getOperand(1),
	                   cast<CondCodeSDNode>(N0.getOperand(2))->get());
	  return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
	                     IfTrue, IfFalse);
	}

	/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
	/// the compare-mask instructions rather than going via NZCV, even if LHS and
	/// RHS are really scalar. This replaces any scalar setcc in the above pattern
	/// with a vector one followed by a DUP shuffle on the result.
	///
	/// \returns the rewritten select, or an empty SDValue when the condition is
	/// not a SETCC, compares i1 values, the result is not a vector, or no vector
	/// compare type of matching size exists.
	static SDValue performSelectCombine(SDNode *N,
	                                    TargetLowering::DAGCombinerInfo &DCI) {
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue N0 = N->getOperand(0);
	  EVT ResVT = N->getValueType(0);

	  if (N0.getOpcode() != ISD::SETCC)
	    return SDValue();

	  // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
	  // scalar SetCCResultType. We also don't expect vectors, because we assume
	  // that selects fed by vector SETCCs are canonicalized to VSELECT.
	  assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
	         "Scalar-SETCC feeding SELECT has unexpected result type!");

	  EVT SrcVT = N0.getOperand(0).getValueType();

	  // Don't try to do this optimization when the setcc itself has i1 operands.
	  // There are no legal vectors of i1, so this would be pointless.
	  if (SrcVT == MVT::i1)
	    return SDValue();

	  // If NumMaskElts == 0, the comparison is larger than select result. The
	  // largest real NEON comparison is 64-bits per lane, which means the result is
	  // at most 32-bits and an illegal vector. Just bail out for now.
	  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
	  if (!ResVT.isVector() || NumMaskElts == 0)
	    return SDValue();

	  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
	  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();

	  // Also bail out if the vector CCVT isn't the same size as ResVT.
	  // This can happen if the SETCC operand size doesn't divide the ResVT size
	  // (e.g., f64 vs v3f32).
	  if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
	    return SDValue();

	  // Make sure we didn't create illegal types, if we're not supposed to.
	  assert(DCI.isBeforeLegalize() ||
	         DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));

	  // First perform a vector comparison, where lane 0 is the one we're interested
	  // in.
	  SDLoc DL(N0);
	  SDValue LHS =
	      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
	  SDValue RHS =
	      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
	  SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));

	  // Now duplicate the comparison mask we want across all other lanes.
	  SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
	  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
	  Mask = DAG.getNode(ISD::BITCAST, DL,
	                     ResVT.changeVectorElementTypeToInteger(), Mask);

	  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
	}

	/// Fold away an NVCAST whose result type equals its operand type.
	///
	/// \returns the cast's operand when the cast is a no-op, otherwise an empty
	/// SDValue.
	static SDValue performNVCASTCombine(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  const bool IsNoOpCast = N->getValueType(0) == Src.getValueType();
	  return IsNoOpCast ? Src : SDValue();
	}

	// If all users of the globaladdr are of the form (globaladdr + constant), find
	// the smallest constant, fold it into the globaladdr's offset and rewrite the
	// globaladdr as (globaladdr + constant) - constant.
	//
	// Returns the replacement (SUB (globaladdr + MinOffset), MinOffset), or an
	// empty SDValue when the global is not referenced with MO_NO_FLAG, some user
	// is not an ADD with a constant operand, or the folded offset is not legal.
	static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
	                                           const AArch64Subtarget *Subtarget,
	                                           const TargetMachine &TM) {
	  auto *GN = cast<GlobalAddressSDNode>(N);
	  if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
	      AArch64II::MO_NO_FLAG)
	    return SDValue();

	  // Smallest constant added to the address by any user. The loop variable is
	  // deliberately not called N so that it does not shadow the parameter.
	  uint64_t MinOffset = -1ull;
	  for (SDNode *User : GN->uses()) {
	    if (User->getOpcode() != ISD::ADD)
	      return SDValue();
	    auto *C = dyn_cast<ConstantSDNode>(User->getOperand(0));
	    if (!C)
	      C = dyn_cast<ConstantSDNode>(User->getOperand(1));
	    if (!C)
	      return SDValue();
	    MinOffset = std::min(MinOffset, C->getZExtValue());
	  }
	  uint64_t Offset = MinOffset + GN->getOffset();

	  // Require that the new offset is larger than the existing one. Otherwise, we
	  // can end up oscillating between two possible DAGs, for example,
	  // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
	  if (Offset <= uint64_t(GN->getOffset()))
	    return SDValue();

	  // Check whether folding this offset is legal. It must not go out of bounds of
	  // the referenced object to avoid violating the code model, and must be
	  // smaller than 2^21 because this is the largest offset expressible in all
	  // object formats.
	  //
	  // This check also prevents us from folding negative offsets, which will end
	  // up being treated in the same way as large positive ones. They could also
	  // cause code model violations, and aren't really common enough to matter.
	  if (Offset >= (uint64_t(1) << 21))
	    return SDValue();

	  const GlobalValue *GV = GN->getGlobal();
	  Type *T = GV->getValueType();
	  if (!T->isSized() ||
	      Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
	    return SDValue();

	  SDLoc DL(GN);
	  SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
	  return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
	                     DAG.getConstant(MinOffset, DL, MVT::i64));
	}

	// Turns the vector of indices into a vector of byte offsets by scaling Offset
	// by (BitWidth / 8). The scaling is emitted as a left shift of the nxv2i64
	// vector by a splat of log2(BitWidth / 8).
	static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
	                                          SDLoc DL, unsigned BitWidth) {
	  assert(Offset.getValueType().isScalableVector() &&
	         "This method is only for scalable vectors of offsets");

	  const unsigned BytesPerElement = BitWidth / 8;
	  SDValue ShiftAmount =
	      DAG.getConstant(Log2_32(BytesPerElement), DL, MVT::i64);
	  SDValue ShiftAmountSplat =
	      DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, ShiftAmount);

	  return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, ShiftAmountSplat);
	}

	/// Check if the value of \p OffsetInBytes can be used as an immediate for
	/// the gather load/prefetch and scatter store instructions with vector base and
	/// immediate offset addressing mode:
	///
	/// [<Zn>.[S|D]{, #<imm>}]
	///
	/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
	///
	/// \returns true iff \p OffsetInBytes is a multiple of \p ScalarSizeInBytes
	/// and the multiplier is at most 31.
	inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
	                                                  unsigned ScalarSizeInBytes) {
	  const unsigned Remainder = OffsetInBytes % ScalarSizeInBytes;
	  const unsigned Multiplier = OffsetInBytes / ScalarSizeInBytes;

	  // Valid only for exact multiples whose multiplier is within range.
	  return Remainder == 0 && Multiplier <= 31;
	}

	/// Check if the value of \p Offset represents a valid immediate for the SVE
	/// gather load/prefetch and scatter store instructions with vector base and
	/// immediate offset addressing mode:
	///
	/// [<Zn>.[S|D]{, #<imm>}]
	///
	/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
	///
	/// \returns false when \p Offset is not a constant node; otherwise the result
	/// of the integer overload applied to the constant's zero-extended value.
	static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
	                                           unsigned ScalarSizeInBytes) {
	  auto *ConstOffset = dyn_cast<ConstantSDNode>(Offset.getNode());
	  if (!ConstOffset)
	    return false;

	  return isValidImmForSVEVecImmAddrMode(ConstOffset->getZExtValue(),
	                                        ScalarSizeInBytes);
	}

	// Lowers an SVE scatter-store intrinsic node to the AArch64ISD node given by
	// Opcode.
	//
	// N                 - the intrinsic node; the code below reads operand 0 as
	//                     the chain, operand 2 as the data to store, operand 3
	//                     as the governing predicate and operands 4/5 as base
	//                     and offset.
	// Opcode            - the requested AArch64ISD scatter-store opcode; it may be
	//                     replaced below by a different addressing-mode variant.
	// OnlyPackedOffsets - when false, an nxv2i32 offset vector is any-extended to
	//                     nxv2i64.
	//
	// Returns the new store node (result type MVT::Other), or an empty SDValue
	// when the data does not fit into an SVE register, is an unsupported FP type,
	// or the base/offset types are not legal.
	static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
	unsigned Opcode,
	bool OnlyPackedOffsets = true) {
	const SDValue Src = N->getOperand(2);
	const EVT SrcVT = Src->getValueType(0);
	assert(SrcVT.isScalableVector() &&
	"Scatter stores are only possible for SVE vectors");

	SDLoc DL(N);
	MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();

	// Make sure that source data will fit into an SVE register
	if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
	return SDValue();

	// For FPs, ACLE only supports _packed_ single and double precision types.
	if (SrcElVT.isFloatingPoint())
	if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
	return SDValue();

	// Depending on the addressing mode, this is either a pointer or a vector of
	// pointers (that fits into one register)
	SDValue Base = N->getOperand(4);
	// Depending on the addressing mode, this is either a single offset or a
	// vector of offsets (that fits into one register)
	SDValue Offset = N->getOperand(5);

	// For "scalar + vector of indices", just scale the indices. This only
	// applies to non-temporal scatters because there's no instruction that takes
	// indices.
	if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
	Offset =
	getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
	Opcode = AArch64ISD::SSTNT1_PRED;
	}

	// In the case of non-temporal scatter stores there's only one SVE instruction
	// per data-size: "scalar + vector", i.e.
	// * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
	// Since we do have intrinsics that allow the arguments to be in a different
	// order, we may need to swap them to match the spec.
	if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
	std::swap(Base, Offset);

	// SST1_IMM requires that the offset is an immediate that is:
	// * a multiple of #SizeInBytes,
	// * in the range [0, 31 x #SizeInBytes],
	// where #SizeInBytes is the size in bytes of the stored items. For
	// immediates outside that range and non-immediate scalar offsets use SST1 or
	// SST1_UXTW instead.
	if (Opcode == AArch64ISD::SST1_IMM_PRED) {
	if (!isValidImmForSVEVecImmAddrMode(Offset,
	SrcVT.getScalarSizeInBits() / 8)) {
	// A vector base of nxv4i32 selects the UXTW variant, anything else the
	// plain one.
	if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
	Opcode = AArch64ISD::SST1_UXTW_PRED;
	else
	Opcode = AArch64ISD::SST1_PRED;

	// The fallback forms take the operands in the opposite order.
	std::swap(Base, Offset);
	}
	}

	auto &TLI = DAG.getTargetLoweringInfo();
	if (!TLI.isTypeLegal(Base.getValueType()))
	return SDValue();

	// Some scatter store variants allow unpacked offsets, but only as nxv2i32
	// vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
	// nxv2i64. Legalize accordingly.
	if (!OnlyPackedOffsets &&
	Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
	Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);

	if (!TLI.isTypeLegal(Offset.getValueType()))
	return SDValue();

	// Source value type that is representable in hardware
	EVT HwSrcVt = getSVEContainerType(SrcVT);

	// Keep the original type of the input data to store - this is needed to be
	// able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
	// FP values we want the integer equivalent, so just use HwSrcVt.
	SDValue InputVT = DAG.getValueType(SrcVT);
	if (SrcVT.isFloatingPoint())
	InputVT = DAG.getValueType(HwSrcVt);

	SDVTList VTs = DAG.getVTList(MVT::Other);
	SDValue SrcNew;

	// Bring the data into its container type: FP data is bitcast, integer data
	// is any-extended.
	if (Src.getValueType().isFloatingPoint())
	SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
	else
	SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);

	SDValue Ops[] = {N->getOperand(0), // Chain
	SrcNew,
	N->getOperand(3), // Pg
	Base,
	Offset,
	InputVT};

	return DAG.getNode(Opcode, DL, VTs, Ops);
	}

	/// Rewrites an SVE gather-load intrinsic node \p N into the target node
	/// \p Opcode.
	///
	/// Operands read from \p N: 0 is the chain, 2 the governing predicate, 3 the
	/// base and 4 the offset. \p Opcode may be adjusted on the way (see the
	/// comments below) before the node is built.
	///
	/// \param OnlyPackedOffsets when false, an nxv2i32 offset vector is
	///        any-extended to nxv2i64 before use.
	/// \returns a merge of {loaded value converted back to the original result
	///          type, output chain}, or an empty SDValue when the result does
	///          not fit in an SVE register or the base type is not legal.
	static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
	                                        unsigned Opcode,
	                                        bool OnlyPackedOffsets = true) {
	  const EVT RetVT = N->getValueType(0);
	  assert(RetVT.isScalableVector() &&
	         "Gather loads are only possible for SVE vectors");

	  SDLoc DL(N);

	  // Make sure that the loaded data will fit into an SVE register
	  if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
	    return SDValue();

	  // Depending on the addressing mode, this is either a pointer or a vector of
	  // pointers (that fits into one register)
	  SDValue Base = N->getOperand(3);
	  // Depending on the addressing mode, this is either a single offset or a
	  // vector of offsets (that fits into one register)
	  SDValue Offset = N->getOperand(4);

	  // For "scalar + vector of indices", just scale the indices. This only
	  // applies to non-temporal gathers because there's no instruction that takes
	  // indices.
	  if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
	    Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
	                                        RetVT.getScalarSizeInBits());
	    Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
	  }

	  // In the case of non-temporal gather loads there's only one SVE instruction
	  // per data-size, taking a vector base and a scalar offset, i.e.
	  //    * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
	  // Since we do have intrinsics that allow the arguments to be in a different
	  // order, we may need to swap them to match the spec.
	  if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
	      Offset.getValueType().isVector())
	    std::swap(Base, Offset);

	  // GLD{FF}1_IMM requires that the offset is an immediate that is:
	  //    * a multiple of #SizeInBytes,
	  //    * in the range [0, 31 x #SizeInBytes],
	  // where #SizeInBytes is the size in bytes of the loaded items. For
	  // immediates outside that range and non-immediate scalar offsets use
	  // GLD{FF}1_MERGE_ZERO or GLD{FF}1_UXTW_MERGE_ZERO instead.
	  if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
	      Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
	    if (!isValidImmForSVEVecImmAddrMode(Offset,
	                                        RetVT.getScalarSizeInBits() / 8)) {
	      // A vector of 32-bit bases selects the UXTW form; everything else
	      // falls back to the plain form.
	      if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
	        Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
	                     ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
	                     : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
	      else
	        Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
	                     ? AArch64ISD::GLD1_MERGE_ZERO
	                     : AArch64ISD::GLDFF1_MERGE_ZERO;

	      // The replacement opcodes take the operands in the opposite order.
	      std::swap(Base, Offset);
	    }
	  }

	  auto &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isTypeLegal(Base.getValueType()))
	    return SDValue();

	  // Some gather load variants allow unpacked offsets, but only as nxv2i32
	  // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
	  // nxv2i64. Legalize accordingly.
	  if (!OnlyPackedOffsets &&
	      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
	    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);

	  // Return value type that is representable in hardware
	  EVT HwRetVt = getSVEContainerType(RetVT);

	  // Keep the original output value type around - this is needed to be able to
	  // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
	  // values we want the integer equivalent, so just use HwRetVT.
	  SDValue OutVT = DAG.getValueType(RetVT);
	  if (RetVT.isFloatingPoint())
	    OutVT = DAG.getValueType(HwRetVt);

	  SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
	  SDValue Ops[] = {N->getOperand(0), // Chain
	                   N->getOperand(2), // Pg
	                   Base, Offset, OutVT};

	  SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
	  // Grab the chain result now: Load is re-pointed at a conversion node below.
	  SDValue LoadChain = SDValue(Load.getNode(), 1);

	  if (RetVT.isInteger() && (RetVT != HwRetVt))
	    Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));

	  // If the original return value was FP, bitcast accordingly. Doing it here
	  // means that we can avoid adding TableGen patterns for FPs.
	  if (RetVT.isFloatingPoint())
	    Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));

	  return DAG.getMergeValues({Load, LoadChain}, DL);
	}

	/// Folds a SIGN_EXTEND_INREG node \p N into its source operand.
	///
	/// Two patterns are handled, both only once DCI reports that operation
	/// legalization has started:
	///  * source is UUNPKHI/UUNPKLO: the sign extension is pushed onto the
	///    unpack's operand and the unpack becomes SUNPKHI/SUNPKLO;
	///  * source is one of the SVE load nodes listed in the switch below: the
	///    load is replaced by its sign-extending counterpart, provided the
	///    extension source type equals the load's memory type and the load has
	///    a single use.
	///
	/// \returns the replacement value, SDValue(N, 0) when the nodes were already
	///          replaced through \p DCI, or an empty SDValue when nothing was
	///          done.
	static SDValue
	performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
	                              SelectionDAG &DAG) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDLoc DL(N);
	  SDValue Src = N->getOperand(0);
	  unsigned Opc = Src->getOpcode();

	  // Sign extend of an unsigned unpack -> signed unpack
	  if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {

	    unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
	                                               : AArch64ISD::SUNPKLO;

	    // Push the sign extend to the operand of the unpack
	    // This is necessary where, for example, the operand of the unpack
	    // is another unpack:
	    // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
	    // ->
	    // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
	    // ->
	    // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
	    SDValue ExtOp = Src->getOperand(0);
	    auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
	    EVT EltTy = VT.getVectorElementType();
	    // EltTy is only read by the assert; silence unused warnings otherwise.
	    (void)EltTy;

	    assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
	           "Sign extending from an invalid type");

	    // The unpack's operand has twice as many lanes as its result, so the
	    // "extend from" type needs twice the element count as well.
	    EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
	                                 VT.getVectorElementType(),
	                                 VT.getVectorElementCount() * 2);

	    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
	                              ExtOp, DAG.getValueType(ExtVT));

	    return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
	  }

	  // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
	  // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
	  unsigned NewOpc;
	  // Index of the operand carrying the memory VT: 4 for the gather loads,
	  // 3 for the contiguous loads.
	  unsigned MemVTOpNum = 4;
	  switch (Opc) {
	  case AArch64ISD::LD1_MERGE_ZERO:
	    NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
	    MemVTOpNum = 3;
	    break;
	  case AArch64ISD::LDNF1_MERGE_ZERO:
	    NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
	    MemVTOpNum = 3;
	    break;
	  case AArch64ISD::LDFF1_MERGE_ZERO:
	    NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
	    MemVTOpNum = 3;
	    break;
	  case AArch64ISD::GLD1_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
	    break;
	  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
	    break;
	  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
	    break;
	  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
	    break;
	  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
	    break;
	  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
	    break;
	  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
	    break;
	  case AArch64ISD::GLDFF1_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
	    break;
	  case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
	    break;
	  case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
	    break;
	  case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
	    break;
	  case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
	    break;
	  case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
	    break;
	  case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
	    break;
	  case AArch64ISD::GLDNT1_MERGE_ZERO:
	    NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
	    break;
	  default:
	    return SDValue();
	  }

	  EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
	  EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();

	  // Only fold when the extension starts from exactly the type that was
	  // loaded, and nobody else uses the load's value.
	  if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
	    return SDValue();

	  EVT DstVT = N->getValueType(0);
	  SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);

	  // The sign-extending load takes the same operands as the original load.
	  SmallVector<SDValue, 5> Ops(Src->op_begin(), Src->op_end());

	  SDValue ExtLoad = DAG.getNode(NewOpc, DL, VTs, Ops);
	  DCI.CombineTo(N, ExtLoad);
	  DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));

	  // Return N so it doesn't get rechecked
	  return SDValue(N, 0);
	}

	/// Widens the offset vector of an SVE gather-prefetch node when it is an
	/// unpacked 32-bit scalable vector.
	///
	/// Operand 4 of \p N is taken as the offset. If its type is nxv2i32 it is
	/// any-extended to nxv2i64 and a new node with the same opcode and otherwise
	/// identical operands is returned. Any other offset type needs no work and
	/// yields an empty SDValue.
	static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
	  const unsigned OffsetIdx = 4;
	  const SDValue NarrowOffset = N->getOperand(OffsetIdx);

	  // Nothing to do unless the offsets are the unpacked nxv2i32 form.
	  if (NarrowOffset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
	    return SDValue();

	  SDLoc DL(N);
	  const SDValue WideOffset =
	      DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, NarrowOffset);

	  // Copy every operand, then substitute the 64-bit-lane offset vector.
	  SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
	  NewOps[OffsetIdx] = WideOffset;

	  return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), NewOps);
	}

	/// Rewrites an `aarch64_sve_prf<T>_gather_scalar_offset` intrinsic node whose
	/// scalar offset cannot be used as the immediate of the vector-plus-immediate
	/// gather prefetch form.
	///
	/// When `isValidImmForSVEVecImmAddrMode` accepts operand 4 for
	/// \p ScalarSizeInBytes, nothing is done and an empty SDValue is returned.
	/// Otherwise operands 3 and 4 trade places and the intrinsic ID (operand 1)
	/// becomes `aarch64_sve_prfb_gather_uxtw_index`.
	static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
	                                               unsigned ScalarSizeInBytes) {
	  const unsigned VecPos = 3;
	  const unsigned ImmPos = 4;

	  // A valid immediate means the node can stay as it is.
	  const bool ImmIsUsable =
	      isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes);
	  if (ImmIsUsable)
	    return SDValue();

	  SDLoc DL(N);
	  SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());

	  // Exchange the two address operands...
	  std::swap(NewOps[ImmPos], NewOps[VecPos]);
	  // ...and retarget the node at `aarch64_sve_prfb_gather_uxtw_index`.
	  NewOps[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
	                              MVT::i64);

	  return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), NewOps);
	}

	/// Target hook for DAG combining: dispatches on the opcode of \p N (and, for
	/// chained/void intrinsics, on the intrinsic ID held in operand 1) to the
	/// matching perform*Combine helper.
	///
	/// \returns whatever the selected helper returns, or an empty SDValue when no
	///          case applies.
	SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
	                                                 DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  switch (N->getOpcode()) {
	  default:
	    LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
	    break;
	  case ISD::ADD:
	  case ISD::SUB:
	    return performAddSubLongCombine(N, DCI, DAG);
	  case ISD::XOR:
	    return performXorCombine(N, DAG, DCI, Subtarget);
	  case ISD::MUL:
	    return performMulCombine(N, DAG, DCI, Subtarget);
	  case ISD::SINT_TO_FP:
	  case ISD::UINT_TO_FP:
	    return performIntToFpCombine(N, DAG, Subtarget);
	  case ISD::FP_TO_SINT:
	  case ISD::FP_TO_UINT:
	    return performFpToIntCombine(N, DAG, DCI, Subtarget);
	  case ISD::FDIV:
	    return performFDivCombine(N, DAG, DCI, Subtarget);
	  case ISD::OR:
	    return performORCombine(N, DCI, Subtarget);
	  case ISD::AND:
	    return performANDCombine(N, DCI);
	  case ISD::SRL:
	    return performSRLCombine(N, DCI);
	  case ISD::INTRINSIC_WO_CHAIN:
	    return performIntrinsicCombine(N, DCI, Subtarget);
	  case ISD::ANY_EXTEND:
	  case ISD::ZERO_EXTEND:
	  case ISD::SIGN_EXTEND:
	    return performExtendCombine(N, DCI, DAG);
	  case ISD::SIGN_EXTEND_INREG:
	    return performSignExtendInRegCombine(N, DCI, DAG);
	  case ISD::CONCAT_VECTORS:
	    return performConcatVectorsCombine(N, DCI, DAG);
	  case ISD::SELECT:
	    return performSelectCombine(N, DCI);
	  case ISD::VSELECT:
	    return performVSelectCombine(N, DCI.DAG);
	  case ISD::LOAD:
	    if (performTBISimplification(N->getOperand(1), DCI, DAG))
	      return SDValue(N, 0);
	    break;
	  case ISD::STORE:
	    return performSTORECombine(N, DCI, DAG, Subtarget);
	  case AArch64ISD::BRCOND:
	    return performBRCONDCombine(N, DCI, DAG);
	  case AArch64ISD::TBNZ:
	  case AArch64ISD::TBZ:
	    return performTBZCombine(N, DCI, DAG);
	  case AArch64ISD::CSEL:
	    return performCONDCombine(N, DCI, DAG, 2, 3);
	  case AArch64ISD::DUP:
	    return performPostLD1Combine(N, DCI, false);
	  case AArch64ISD::NVCAST:
	    return performNVCASTCombine(N);
	  case ISD::INSERT_VECTOR_ELT:
	    return performPostLD1Combine(N, DCI, true);
	  case ISD::INTRINSIC_VOID:
	  case ISD::INTRINSIC_W_CHAIN:
	    // Operand 1 holds the intrinsic ID for these node kinds.
	    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
	    case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
	      return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
	    case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
	      return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
	    case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
	      return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
	    case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
	      return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
	    case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
	    case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
	    case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
	    case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
	    case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
	    case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
	    case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
	    case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
	      return legalizeSVEGatherPrefetchOffsVec(N, DAG);
	    case Intrinsic::aarch64_neon_ld2:
	    case Intrinsic::aarch64_neon_ld3:
	    case Intrinsic::aarch64_neon_ld4:
	    case Intrinsic::aarch64_neon_ld1x2:
	    case Intrinsic::aarch64_neon_ld1x3:
	    case Intrinsic::aarch64_neon_ld1x4:
	    case Intrinsic::aarch64_neon_ld2lane:
	    case Intrinsic::aarch64_neon_ld3lane:
	    case Intrinsic::aarch64_neon_ld4lane:
	    case Intrinsic::aarch64_neon_ld2r:
	    case Intrinsic::aarch64_neon_ld3r:
	    case Intrinsic::aarch64_neon_ld4r:
	    case Intrinsic::aarch64_neon_st2:
	    case Intrinsic::aarch64_neon_st3:
	    case Intrinsic::aarch64_neon_st4:
	    case Intrinsic::aarch64_neon_st1x2:
	    case Intrinsic::aarch64_neon_st1x3:
	    case Intrinsic::aarch64_neon_st1x4:
	    case Intrinsic::aarch64_neon_st2lane:
	    case Intrinsic::aarch64_neon_st3lane:
	    case Intrinsic::aarch64_neon_st4lane:
	      return performNEONPostLDSTCombine(N, DCI, DAG);
	    case Intrinsic::aarch64_sve_ldnt1:
	      return performLDNT1Combine(N, DAG);
	    case Intrinsic::aarch64_sve_ld1rq:
	      return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
	    case Intrinsic::aarch64_sve_ld1ro:
	      return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
	    case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
	      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
	    case Intrinsic::aarch64_sve_ldnt1_gather:
	      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
	    case Intrinsic::aarch64_sve_ldnt1_gather_index:
	      return performGatherLoadCombine(N, DAG,
	                                      AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
	    case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
	      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
	    case Intrinsic::aarch64_sve_ld1:
	      return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
	    case Intrinsic::aarch64_sve_ldnf1:
	      return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
	    case Intrinsic::aarch64_sve_ldff1:
	      return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
	    case Intrinsic::aarch64_sve_st1:
	      return performST1Combine(N, DAG);
	    case Intrinsic::aarch64_sve_stnt1:
	      return performSTNT1Combine(N, DAG);
	    case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
	      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
	    case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
	      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
	    case Intrinsic::aarch64_sve_stnt1_scatter:
	      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
	    case Intrinsic::aarch64_sve_stnt1_scatter_index:
	      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
	    case Intrinsic::aarch64_sve_ld1_gather:
	      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
	    case Intrinsic::aarch64_sve_ld1_gather_index:
	      return performGatherLoadCombine(N, DAG,
	                                      AArch64ISD::GLD1_SCALED_MERGE_ZERO);
	    case Intrinsic::aarch64_sve_ld1_gather_sxtw:
	      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
	                                      /*OnlyPackedOffsets=*/false);
	    case Intrinsic::aarch64_sve_ld1_gather_uxtw:
	      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
	                                      /*OnlyPackedOffsets=*/false);
	    case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
	      return performGatherLoadCombine(N, DAG,
	                                      AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
	                                      /*OnlyPackedOffsets=*/false);
	    case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
	      return performGatherLoadCombine(N, DAG,
	                                      AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
	                                      /*OnlyPackedOffsets=*/false);
	    case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
	      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
	    case Intrinsic::aarch64_sve_ldff1_gather:
	      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
	    case Intrinsic::aarch64_sve_ldff1_gather_index:
	      return performGatherLoadCombine(N, DAG,
	                                      AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
	    case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
	      return performGatherLoadCombine(N, DAG,
	                                      AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
	                                      /*OnlyPackedOffsets=*/false);
	    case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
	      return performGatherLoadCombine(N, DAG,
	                                      AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
	                                      /*OnlyPackedOffsets=*/false);
	    case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
	      return performGatherLoadCombine(N, DAG,
	                                      AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
	                                      /*OnlyPackedOffsets=*/false);
	    case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
	      return performGatherLoadCombine(N, DAG,
	                                      AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
	                                      /*OnlyPackedOffsets=*/false);
	    case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
	      return performGatherLoadCombine(N, DAG,
	                                      AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
	    case Intrinsic::aarch64_sve_st1_scatter:
	      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
	    case Intrinsic::aarch64_sve_st1_scatter_index:
	      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
	    case Intrinsic::aarch64_sve_st1_scatter_sxtw:
	      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
	                                        /*OnlyPackedOffsets=*/false);
	    case Intrinsic::aarch64_sve_st1_scatter_uxtw:
	      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
	                                        /*OnlyPackedOffsets=*/false);
	    case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
	      return performScatterStoreCombine(N, DAG,
	                                        AArch64ISD::SST1_SXTW_SCALED_PRED,
	                                        /*OnlyPackedOffsets=*/false);
	    case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
	      return performScatterStoreCombine(N, DAG,
	                                        AArch64ISD::SST1_UXTW_SCALED_PRED,
	                                        /*OnlyPackedOffsets=*/false);
	    case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
	      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
	    case Intrinsic::aarch64_sve_tuple_get: {
	      // Extract sub-vector number IdxConst from the tuple held in operand 2.
	      SDLoc DL(N);
	      SDValue Chain = N->getOperand(0);
	      SDValue Src1 = N->getOperand(2);
	      SDValue Idx = N->getOperand(3);

	      uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
	      EVT ResVT = N->getValueType(0);
	      uint64_t NumLanes = ResVT.getVectorElementCount().Min;
	      SDValue Val =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1,
	                      DAG.getConstant(IdxConst * NumLanes, DL, MVT::i32));
	      return DAG.getMergeValues({Val, Chain}, DL);
	    }
	    case Intrinsic::aarch64_sve_tuple_set: {
	      // Rebuild the tuple from its sub-vectors, substituting Vec at IdxConst.
	      SDLoc DL(N);
	      SDValue Chain = N->getOperand(0);
	      SDValue Tuple = N->getOperand(2);
	      SDValue Idx = N->getOperand(3);
	      SDValue Vec = N->getOperand(4);

	      EVT TupleVT = Tuple.getValueType();
	      uint64_t TupleLanes = TupleVT.getVectorElementCount().Min;

	      uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
	      uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min;

	      if ((TupleLanes % NumLanes) != 0)
	        report_fatal_error("invalid tuple vector!");

	      uint64_t NumVecs = TupleLanes / NumLanes;

	      SmallVector<SDValue, 4> Opnds;
	      for (unsigned I = 0; I < NumVecs; ++I) {
	        if (I == IdxConst)
	          Opnds.push_back(Vec);
	        else {
	          Opnds.push_back(
	              DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Vec.getValueType(), Tuple,
	                          DAG.getConstant(I * NumLanes, DL, MVT::i32)));
	        }
	      }
	      SDValue Concat =
	          DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds);
	      return DAG.getMergeValues({Concat, Chain}, DL);
	    }
	    case Intrinsic::aarch64_sve_tuple_create2:
	    case Intrinsic::aarch64_sve_tuple_create3:
	    case Intrinsic::aarch64_sve_tuple_create4: {
	      // Concatenate operands 2..end into one wide vector.
	      SDLoc DL(N);
	      SDValue Chain = N->getOperand(0);

	      SmallVector<SDValue, 4> Opnds;
	      for (unsigned I = 2; I < N->getNumOperands(); ++I)
	        Opnds.push_back(N->getOperand(I));

	      EVT VT = Opnds[0].getValueType();
	      EVT EltVT = VT.getVectorElementType();
	      EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
	                                    VT.getVectorElementCount() *
	                                        (N->getNumOperands() - 2));
	      SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
	      return DAG.getMergeValues({Concat, Chain}, DL);
	    }
	    case Intrinsic::aarch64_sve_ld2:
	    case Intrinsic::aarch64_sve_ld3:
	    case Intrinsic::aarch64_sve_ld4: {
	      SDLoc DL(N);
	      SDValue Chain = N->getOperand(0);
	      SDValue Mask = N->getOperand(2);
	      SDValue BasePtr = N->getOperand(3);
	      SDValue LoadOps[] = {Chain, Mask, BasePtr};
	      unsigned IntrinsicID =
	          cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	      SDValue Result =
	          LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
	      // NOTE(review): the incoming chain is forwarded here, not a chain
	      // produced by the lowered load - confirm this is intended.
	      return DAG.getMergeValues({Result, Chain}, DL);
	    }
	    default:
	      break;
	    }
	    break;
	  case ISD::GlobalAddress:
	    return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
	  }
	  return SDValue();
	}

	// Check if the return value is used as only a return value, as otherwise
	// we can't perform a tail-call. In particular, we need to check for
	// target ISD nodes that are returns and any other "odd" constructs
	// that the generic analysis code won't necessarily catch.
	//
	// Returns true only when N has a single result with exactly one use, that
	// use is a glue-free CopyToReg or an FP_EXTEND, and every user of that node
	// is an AArch64ISD::RET_FLAG (with at least one such user). On success Chain
	// is updated: to the CopyToReg's input chain in the CopyToReg case, and left
	// at its incoming value in the FP_EXTEND case.
	bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
	                                               SDValue &Chain) const {
	  if (N->getNumValues() != 1)
	    return false;
	  if (!N->hasNUsesOfValue(1, 0))
	    return false;

	  SDValue TCChain = Chain;
	  // There is exactly one use (checked above); look at that user.
	  SDNode *Copy = *N->use_begin();
	  if (Copy->getOpcode() == ISD::CopyToReg) {
	    // If the copy has a glue operand, we conservatively assume it isn't safe to
	    // perform a tail call.
	    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
	        MVT::Glue)
	      return false;
	    TCChain = Copy->getOperand(0);
	  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
	    return false;

	  bool HasRet = false;
	  for (SDNode *Node : Copy->uses()) {
	    if (Node->getOpcode() != AArch64ISD::RET_FLAG)
	      return false;
	    HasRet = true;
	  }

	  if (!HasRet)
	    return false;

	  Chain = TCChain;
	  return true;
	}

	// Return whether an instruction can potentially be optimized to a tail
	// call. This will cause the optimizers to attempt to move, or duplicate,
	// return instructions to help enable tail call optimizations for this
	// instruction. The answer is simply the call's own tail-call marker.
	bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
	  const bool MarkedAsTail = CI->isTailCall();
	  return MarkedAsTail;
	}

	// Splits an ADD/SUB address computation into the parts of an indexed
	// addressing mode.
	//
	// Succeeds only when Op is an ADD or SUB whose second operand is a constant
	// that, once negated for SUB, fits in a signed 9-bit immediate. On success
	// Base is operand 0, Offset is operand 1 and IsInc tells whether Op was an
	// ADD. Base is also written when Op is an ADD/SUB that fails the later
	// checks. AM and DAG are not touched here; callers set AM.
	bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
	                                                   SDValue &Offset,
	                                                   ISD::MemIndexedMode &AM,
	                                                   bool &IsInc,
	                                                   SelectionDAG &DAG) const {
	  const unsigned Opc = Op->getOpcode();
	  const bool IsAdd = (Opc == ISD::ADD);
	  if (!IsAdd && Opc != ISD::SUB)
	    return false;

	  Base = Op->getOperand(0);

	  auto *Imm = dyn_cast<ConstantSDNode>(Op->getOperand(1));
	  if (!Imm)
	    return false;

	  // All of the indexed addressing mode instructions take a signed
	  // 9 bit immediate offset. The negation for SUB is carried out in unsigned
	  // arithmetic.
	  int64_t Delta = Imm->getSExtValue();
	  if (!IsAdd)
	    Delta = -(uint64_t)Delta;
	  if (!isInt<9>(Delta))
	    return false;

	  IsInc = IsAdd;
	  Offset = Op->getOperand(1);
	  return true;
	}

	// Decides whether the load or store N can use a pre-indexed addressing mode.
	//
	// N must be a LoadSDNode or StoreSDNode; its base pointer is handed to
	// getIndexedAddressParts, which fills in Base and Offset. On success AM is
	// set to PRE_INC or PRE_DEC depending on the direction reported back.
	bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
	                                                      SDValue &Offset,
	                                                      ISD::MemIndexedMode &AM,
	                                                      SelectionDAG &DAG) const {
	  SDValue Ptr;
	  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
	    Ptr = LD->getBasePtr();
	  else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
	    Ptr = ST->getBasePtr();
	  else
	    return false;

	  bool IsInc;
	  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
	    return false;
	  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
	  return true;
	}

	bool AArch64TargetLowering::getPostIndexedAddressParts(
	SDNode N, SDNode Op, SDValue &Base, SDValue &Offset,
	ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
	EVT VT;
	SDValue Ptr;
	if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	VT = LD->getMemoryVT();
	Ptr = LD->getBasePtr();
	} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
	VT = ST->getMemoryVT();
	Ptr = ST->getBasePtr();
	} else
	return false;

	bool IsInc;
	if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
	return false;
	// Post-indexing updates the base, so it's not a valid transform
	// if that's not the same as the load's pointer.
	if (Ptr != Base)
	return false;
	AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
	return true;
	}

	/// Custom result replacement for a BITCAST from f16 or bf16 to i16.
	///
	/// Any other type combination is left alone (nothing is appended to
	/// \p Results). Otherwise the half-precision operand is inserted into the
	/// `hsub` sub-register of an undefined f32 value, that value is bitcast to
	/// i32 and truncated to i16; the truncate is appended to \p Results.
	static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
	                                  SelectionDAG &DAG) {
	  SDLoc DL(N);
	  SDValue Op = N->getOperand(0);

	  if (N->getValueType(0) != MVT::i16 ||
	      (Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16))
	    return;

	  Op = SDValue(
	      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
	                         DAG.getUNDEF(MVT::i32), Op,
	                         DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
	      0);
	  Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
	  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
	}

	/// Custom result replacement for an across-vector reduction node.
	///
	/// Operand 0 of \p N is split into two halves, the halves are combined
	/// lane-wise with \p InterOp, and \p AcrossOp is then applied to the combined
	/// half. Both new nodes use the low half of the split of \p N's result type.
	/// The final value is appended to \p Results.
	static void ReplaceReductionResults(SDNode *N,
	                                    SmallVectorImpl<SDValue> &Results,
	                                    SelectionDAG &DAG, unsigned InterOp,
	                                    unsigned AcrossOp) {
	  SDLoc DL(N);

	  // Only the low part of the split result type is needed.
	  const EVT HalfVT = DAG.GetSplitDestVTs(N->getValueType(0)).first;
	  const auto Halves = DAG.SplitVectorOperand(N, 0);

	  SDValue Combined =
	      DAG.getNode(InterOp, DL, HalfVT, Halves.first, Halves.second);
	  Results.push_back(DAG.getNode(AcrossOp, DL, HalfVT, Combined));
	}

	/// Splits the i128 value \p N into two i64 values.
	///
	/// \returns {low, high}: the low part is a truncate of \p N, the high part a
	///          truncate of \p N logically shifted right by 64.
	static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
	  SDLoc DL(N);
	  SDValue LowHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);

	  SDValue ShiftAmt = DAG.getConstant(64, DL, MVT::i64);
	  SDValue Shifted = DAG.getNode(ISD::SRL, DL, MVT::i128, N, ShiftAmt);
	  SDValue HighHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, Shifted);

	  return {LowHalf, HighHalf};
	}

	/// Custom result replacement for EXTRACT_SUBVECTOR on scalable integer
	/// vectors.
	///
	/// Only the "extract one half" case is handled: the result must have half
	/// the minimum element count of the input and the constant index must select
	/// either the low half (0) or the high half (result element count). The half
	/// is produced with UUNPKLO/UUNPKHI into a vector with widened elements and
	/// then truncated back to the result type. In every other case nothing is
	/// appended to \p Results.
	void AArch64TargetLowering::ReplaceExtractSubVectorResults(
	    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
	  SDValue In = N->getOperand(0);
	  EVT InVT = In.getValueType();

	  // Common code will handle these just fine.
	  if (!InVT.isScalableVector() || !InVT.isInteger())
	    return;

	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);

	  // The following checks bail if this is not a halving operation.

	  ElementCount ResEC = VT.getVectorElementCount();

	  if (InVT.getVectorElementCount().Min != (ResEC.Min * 2))
	    return;

	  auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!CIndex)
	    return;

	  unsigned Index = CIndex->getZExtValue();
	  if ((Index != 0) && (Index != ResEC.Min))
	    return;

	  // Index 0 selects the low half, anything else (== ResEC.Min) the high half.
	  unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
	  EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());

	  SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
	  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
	}

	// Create an even/odd pair of X registers holding integer value V.
	//
	// V is split into a low i64 (any-extend-or-truncate of V) and a high i64
	// (V shifted right by 64 as an i128, then any-extend-or-truncated). The two
	// halves are swapped for big-endian data layouts and then packed into a
	// REG_SEQUENCE of class XSeqPairsClass using sub-registers sube64 and subo64.
	static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
	  SDLoc DL(V.getNode());

	  SDValue EvenHalf = DAG.getAnyExtOrTrunc(V, DL, MVT::i64);
	  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i128, V,
	                                  DAG.getConstant(64, DL, MVT::i64));
	  SDValue OddHalf = DAG.getAnyExtOrTrunc(UpperBits, DL, MVT::i64);

	  if (DAG.getDataLayout().isBigEndian())
	    std::swap(EvenHalf, OddHalf);

	  SDValue PairClass =
	      DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, DL, MVT::i32);
	  SDValue EvenIdx = DAG.getTargetConstant(AArch64::sube64, DL, MVT::i32);
	  SDValue OddIdx = DAG.getTargetConstant(AArch64::subo64, DL, MVT::i32);

	  const SDValue SeqOps[] = {PairClass, EvenHalf, EvenIdx, OddHalf, OddIdx};
	  MachineSDNode *Seq =
	      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, SeqOps);
	  return SDValue(Seq, 0);
	}

	/// Custom result replacement for a 128-bit ATOMIC_CMP_SWAP node.
	///
	/// Operands read from \p N: 0 = chain, 1 = pointer, 2 = compare value,
	/// 3 = store value. Two results are appended to \p Results: the i128 value
	/// (a BUILD_PAIR of two i64 halves) and the output chain.
	///
	/// With LSE, a CASP* machine node chosen by memory ordering is emitted, with
	/// the i128 inputs wrapped as GPR pairs. Without LSE, the CMP_SWAP_128 node
	/// is used with the inputs split into i64 halves.
	static void ReplaceCMP_SWAP_128Results(SDNode *N,
	                                       SmallVectorImpl<SDValue> &Results,
	                                       SelectionDAG &DAG,
	                                       const AArch64Subtarget *Subtarget) {
	  assert(N->getValueType(0) == MVT::i128 &&
	         "AtomicCmpSwap on types less than 128 should be legal");

	  SDLoc DL(N);

	  if (!Subtarget->hasLSE()) {
	    auto Desired = splitInt128(N->getOperand(2), DAG);
	    auto New = splitInt128(N->getOperand(3), DAG);
	    SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
	                     New.first,        New.second,    N->getOperand(0)};
	    SDNode *CmpSwap = DAG.getMachineNode(
	        AArch64::CMP_SWAP_128, DL,
	        DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);

	    MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
	    DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});

	    // Results 0 and 1 are the two i64 halves; result 3 is the chain.
	    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
	                                  SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
	    Results.push_back(SDValue(CmpSwap, 3));
	    return;
	  }

	  // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
	  // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
	  SDValue Ops[] = {
	      createGPRPairNode(DAG, N->getOperand(2)), // Compare value
	      createGPRPairNode(DAG, N->getOperand(3)), // Store value
	      N->getOperand(1),                         // Ptr
	      N->getOperand(0),                         // Chain in
	  };

	  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();

	  // Pick the CASP variant matching the requested ordering.
	  unsigned Opcode;
	  switch (MemOp->getOrdering()) {
	  case AtomicOrdering::Monotonic:
	    Opcode = AArch64::CASPX;
	    break;
	  case AtomicOrdering::Acquire:
	    Opcode = AArch64::CASPAX;
	    break;
	  case AtomicOrdering::Release:
	    Opcode = AArch64::CASPLX;
	    break;
	  case AtomicOrdering::AcquireRelease:
	  case AtomicOrdering::SequentiallyConsistent:
	    Opcode = AArch64::CASPALX;
	    break;
	  default:
	    llvm_unreachable("Unexpected ordering!");
	  }

	  MachineSDNode *CmpSwap = DAG.getMachineNode(
	      Opcode, DL, DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
	  DAG.setNodeMemRefs(CmpSwap, {MemOp});

	  // The sub-register holding the low half depends on endianness.
	  unsigned LoSubReg = AArch64::sube64;
	  unsigned HiSubReg = AArch64::subo64;
	  if (DAG.getDataLayout().isBigEndian())
	    std::swap(LoSubReg, HiSubReg);

	  SDValue PairResult(CmpSwap, 0);
	  SDValue Lo = DAG.getTargetExtractSubreg(LoSubReg, DL, MVT::i64, PairResult);
	  SDValue Hi = DAG.getTargetExtractSubreg(HiSubReg, DL, MVT::i64, PairResult);
	  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi));
	  Results.push_back(SDValue(CmpSwap, 1)); // Chain out
	}

	/// Target hook producing replacement values for the results of node \p N.
	///
	/// Dispatches on the opcode of \p N and appends the replacement values to
	/// \p Results. Some cases deliberately append nothing (see the comments in
	/// those cases). An opcode with no case here hits llvm_unreachable.
	void AArch64TargetLowering::ReplaceNodeResults(
	    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
	  switch (N->getOpcode()) {
	  default:
	    llvm_unreachable("Don't know how to custom expand this");
	  case ISD::BITCAST:
	    ReplaceBITCASTResults(N, Results, DAG);
	    return;
	  case ISD::VECREDUCE_ADD:
	  case ISD::VECREDUCE_SMAX:
	  case ISD::VECREDUCE_SMIN:
	  case ISD::VECREDUCE_UMAX:
	  case ISD::VECREDUCE_UMIN:
	    Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
	    return;

	  case ISD::CTPOP:
	    Results.push_back(LowerCTPOP(SDValue(N, 0), DAG));
	    return;
	  case AArch64ISD::SADDV:
	    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
	    return;
	  case AArch64ISD::UADDV:
	    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
	    return;
	  case AArch64ISD::SMINV:
	    ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
	    return;
	  case AArch64ISD::UMINV:
	    ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
	    return;
	  case AArch64ISD::SMAXV:
	    ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
	    return;
	  case AArch64ISD::UMAXV:
	    ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
	    return;
	  case ISD::FP_TO_UINT:
	  case ISD::FP_TO_SINT:
	    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
	    // Let normal code take care of it by not adding anything to Results.
	    return;
	  case ISD::ATOMIC_CMP_SWAP:
	    ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
	    return;
	  case ISD::LOAD: {
	    assert(SDValue(N, 0).getValueType() == MVT::i128 &&
	           "unexpected load's value type");
	    LoadSDNode *LoadNode = cast<LoadSDNode>(N);
	    if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
	      // Non-volatile loads are optimized later in AArch64's load/store
	      // optimizer.
	      return;
	    }

	    // Volatile i128 load: emit an LDP yielding two i64 halves plus a chain.
	    SDValue Result = DAG.getMemIntrinsicNode(
	        AArch64ISD::LDP, SDLoc(N),
	        DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
	        {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
	        LoadNode->getMemOperand());

	    SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
	                               Result.getValue(0), Result.getValue(1));
	    Results.append({Pair, Result.getValue(2) /* Chain */});
	    return;
	  }
	  case ISD::EXTRACT_SUBVECTOR:
	    ReplaceExtractSubVectorResults(N, Results, DAG);
	    return;
	  case ISD::INTRINSIC_WO_CHAIN: {
	    EVT VT = N->getValueType(0);
	    assert((VT == MVT::i8 || VT == MVT::i16) &&
	           "custom lowering for unexpected type");

	    // Operand 0 holds the intrinsic ID. Each handled intrinsic is computed
	    // as an i32 node and truncated back to the original narrow type.
	    ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
	    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
	    switch (IntID) {
	    default:
	      return;
	    case Intrinsic::aarch64_sve_clasta_n: {
	      SDLoc DL(N);
	      auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
	      auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
	                           N->getOperand(1), Op2, N->getOperand(3));
	      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
	      return;
	    }
	    case Intrinsic::aarch64_sve_clastb_n: {
	      SDLoc DL(N);
	      auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
	      auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
	                           N->getOperand(1), Op2, N->getOperand(3));
	      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
	      return;
	    }
	    case Intrinsic::aarch64_sve_lasta: {
	      SDLoc DL(N);
	      auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
	                           N->getOperand(1), N->getOperand(2));
	      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
	      return;
	    }
	    case Intrinsic::aarch64_sve_lastb: {
	      SDLoc DL(N);
	      auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
	                           N->getOperand(1), N->getOperand(2));
	      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
	      return;
	    }
	    }
	  }
	  }
	}

	/// Answer the useLoadStackGuardNode() hook: Android and Fuchsia defer to the
	/// generic TargetLowering implementation, every other subtarget returns true.
	bool AArch64TargetLowering::useLoadStackGuardNode() const {
	  if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
	    return TargetLowering::useLoadStackGuardNode();
	  return true;
	}

	/// Returns the minimum number of FDIVs sharing one divisor at which they are
	/// rewritten as FMULs by the reciprocal; the value is the constant 3.
	unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
	// Combine multiple FDIVs with the same divisor into multiple FMULs by the
	// reciprocal if there are three or more FDIVs.
	return 3;
	}

	/// Choose the type-legalization action for vector type \p VT.
	///
	/// \returns TypeWidenVector for the single-element vectors listed below,
	///          otherwise whatever TargetLoweringBase prefers.
	TargetLoweringBase::LegalizeTypeAction
	AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
	  // During type legalization, we prefer to widen v1i8, v1i16, v1i32 and v1f32
	  // (e.g. to v8i8, v4i16, v2i32) instead of promoting them.
	  if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
	      VT == MVT::v1f32)
	    return TypeWidenVector;

	  return TargetLoweringBase::getPreferredVectorAction(VT);
	}

	// Only 128-bit atomic stores are expanded in IR. Stores narrower than 128
	// bits are already atomic; wider ones are doomed anyway, so they are left to
	// the default libcall and the OS gets the blame when things go wrong.
	bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
	  Type *StoredTy = SI->getValueOperand()->getType();
	  const unsigned Bits = StoredTy->getPrimitiveSizeInBits();
	  if (Bits != 128)
	    return false;
	  return true;
	}

	// Only 128-bit atomic loads receive the LL/SC expansion. Loads narrower than
	// 128 bits are already atomic; wider ones are doomed anyway, so they are left
	// to the default libcall and the OS gets the blame when things go wrong.
	TargetLowering::AtomicExpansionKind
	AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
	  const unsigned Bits = LI->getType()->getPrimitiveSizeInBits();
	  if (Bits != 128)
	    return AtomicExpansionKind::None;
	  return AtomicExpansionKind::LLSC;
	}

	// Select the IR-level expansion for an atomicrmw. For the real atomic
	// operations, ldxr/stxr are available up to 128 bits.
	TargetLowering::AtomicExpansionKind
	AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
	  // Floating-point operations are expanded through cmpxchg.
	  if (AI->isFloatingPointOperation())
	    return AtomicExpansionKind::CmpXChg;

	  const unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
	  // Anything wider than 128 bits is not expanded here.
	  if (Bits > 128)
	    return AtomicExpansionKind::None;

	  // LL/SC is used when the operation is a nand (not supported in LSE), when
	  // it is exactly 128 bits wide, or when the subtarget has no LSE at all.
	  const bool IsNand = AI->getOperation() == AtomicRMWInst::Nand;
	  if (IsNand || Bits == 128 || !Subtarget->hasLSE())
	    return AtomicExpansionKind::LLSC;

	  return AtomicExpansionKind::None;
	}

	// Select the IR-level expansion for a cmpxchg: either leave it intact or
	// expand it to an LL/SC loop.
	TargetLowering::AtomicExpansionKind
	AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
	    AtomicCmpXchgInst *AI) const {
	  // With LSE the cmpxchg is left intact for codegen.
	  const bool HasLSE = Subtarget->hasLSE();

	  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
	  // implement cmpxchg without spilling. If the address being exchanged is
	  // also on the stack and close enough to the spill slot, the monitor can
	  // end up always cleared so the atomic operation never succeeds. At -O0 a
	  // late-expanded pseudo-inst is needed instead, so nothing is expanded here.
	  const bool IsO0 = getTargetMachine().getOptLevel() == CodeGenOpt::None;

	  return (HasLSE || IsO0) ? AtomicExpansionKind::None
	                          : AtomicExpansionKind::LLSC;
	}

	/// Emit an exclusive (load-linked) load from \p Addr and return the loaded
	/// value, typed as the pointee type of \p Addr.
	///
	/// \param Builder  IR builder used to emit the instructions.
	/// \param Addr     Pointer to load from; its pointee type selects between the
	///                 128-bit pair path and the generic path.
	/// \param Ord      Atomic ordering; acquire-or-stronger selects the
	///                 ldaxp/ldaxr intrinsics instead of ldxp/ldxr.
	Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
	                                             AtomicOrdering Ord) const {
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
	  bool IsAcquire = isAcquireOrStronger(Ord);

	  // Since i128 isn't legal and intrinsics don't get type-lowered, the
	  // ldxp/ldaxp intrinsics must return {i64, i64} and we have to recombine
	  // them into a single i128 here.
	  if (ValTy->getPrimitiveSizeInBits() == 128) {
	    Intrinsic::ID Int =
	        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
	    Function *Ldxr = Intrinsic::getDeclaration(M, Int);

	    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
	    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");

	    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
	    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
	    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
	    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
	    // Result = Lo | (Hi << 64).
	    return Builder.CreateOr(
	        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
	  }

	  // Generic path: the intrinsic is overloaded on the pointer type.
	  Type *Tys[] = { Addr->getType() };
	  Intrinsic::ID Int =
	      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
	  Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);

	  // Addr is untouched on this path, so ValTy is still its pointee type.
	  // Truncate the intrinsic result to an integer of the value's bit width,
	  // then bitcast that integer to the value type.
	  const DataLayout &DL = M->getDataLayout();
	  IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValTy));
	  Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);

	  return Builder.CreateBitCast(Trunc, ValTy);
	}

	// Emit a call to the aarch64_clrex intrinsic at the builder's current
	// insertion point.
	void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
	    IRBuilder<> &Builder) const {
	  Function *CurFn = Builder.GetInsertBlock()->getParent();
	  Function *ClrEx =
	      Intrinsic::getDeclaration(CurFn->getParent(), Intrinsic::aarch64_clrex);
	  Builder.CreateCall(ClrEx);
	}

	/// Emit an exclusive (store-conditional) store of \p Val to \p Addr and
	/// return the call to the store intrinsic.
	///
	/// \param Builder  IR builder used to emit the instructions.
	/// \param Val      Value to store; a 128-bit value takes the pair path.
	/// \param Addr     Pointer to store to.
	/// \param Ord      Atomic ordering; release-or-stronger selects the
	///                 stlxp/stlxr intrinsics instead of stxp/stxr.
	Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
	                                                   Value *Val, Value *Addr,
	                                                   AtomicOrdering Ord) const {
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  bool IsRelease = isReleaseOrStronger(Ord);

	  // Since the intrinsics must have legal type, the i128 intrinsics take two
	  // parameters: "i64, i64". We must marshal Val into the appropriate form
	  // before the call.
	  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
	    Intrinsic::ID Int =
	        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
	    Function *Stxr = Intrinsic::getDeclaration(M, Int);
	    Type *Int64Ty = Type::getInt64Ty(M->getContext());

	    // Low half is a plain truncate; high half is (Val >> 64) truncated.
	    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
	    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
	    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
	    return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
	  }

	  // Generic path: the intrinsic is overloaded on the pointer type.
	  Intrinsic::ID Int =
	      IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
	  Type *Tys[] = { Addr->getType() };
	  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);

	  // Reinterpret the value as an integer of the same bit width, then widen it
	  // to the intrinsic's first parameter type.
	  const DataLayout &DL = M->getDataLayout();
	  IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
	  Val = Builder.CreateBitCast(Val, IntValTy);

	  return Builder.CreateCall(Stxr,
	                            {Builder.CreateZExtOrBitCast(
	                                 Val, Stxr->getFunctionType()->getParamType(0)),
	                             Addr});
	}

	/// Returns true exactly when \p Ty is an array type. \p CallConv and
	/// \p isVarArg are not consulted.
	bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
	Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
	return Ty->isArrayTy();
	}

	/// Always returns false; neither the context nor the value type argument is
	/// inspected.
	bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
	EVT) const {
	return false;
	}

	// Build IR computing the address "thread pointer + Offset" (as a byte GEP)
	// and return it cast to a pointer-to-i8* in address space 0.
	static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
	  Module *Mod = IRB.GetInsertBlock()->getParent()->getParent();
	  Function *GetThreadPointer =
	      Intrinsic::getDeclaration(Mod, Intrinsic::thread_pointer);

	  Value *ThreadPointer = IRB.CreateCall(GetThreadPointer);
	  Value *Slot = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), ThreadPointer, Offset);
	  return IRB.CreatePointerCast(Slot, IRB.getInt8PtrTy()->getPointerTo(0));
	}

	// Return the IR value addressing the stack guard. Android and Fuchsia keep
	// it in a fixed TLS slot; all other targets use the generic implementation.
	Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
	  const bool IsAndroid = Subtarget->isTargetAndroid();
	  if (IsAndroid || Subtarget->isTargetFuchsia()) {
	    // Android: fixed TLS slot for the stack cookie, see TLS_SLOT_STACK_GUARD in
	    // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
	    // Fuchsia: <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET as -0x10.
	    return UseTlsOffset(IRB, IsAndroid ? 0x28 : -0x10);
	  }

	  return TargetLowering::getIRStackGuard(IRB);
	}

	// Declare the stack-protector symbols in module M. The MSVC environment uses
	// the CRT's cookie variable and check function; everything else takes the
	// generic declarations.
	void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
	  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
	    TargetLowering::insertSSPDeclarations(M);
	    return;
	  }

	  LLVMContext &Ctx = M.getContext();
	  Type *BytePtrTy = Type::getInt8PtrTy(Ctx);

	  // The MSVC CRT exposes the security cookie as a global variable.
	  M.getOrInsertGlobal("__security_cookie", BytePtrTy);

	  // The MSVC CRT validates the cookie through this function.
	  FunctionCallee CheckCookie = M.getOrInsertFunction(
	      "__security_check_cookie", Type::getVoidTy(Ctx), BytePtrTy);
	  if (auto *Fn = dyn_cast<Function>(CheckCookie.getCallee())) {
	    Fn->setCallingConv(CallingConv::Win64);
	    Fn->addAttribute(1, Attribute::AttrKind::InReg);
	  }
	}

	// Return the stack guard value for SelectionDAG. In the MSVC environment this
	// is the CRT's "__security_cookie" global looked up in M.
	Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
	  const bool IsMSVC = Subtarget->getTargetTriple().isWindowsMSVCEnvironment();
	  if (!IsMSVC)
	    return TargetLowering::getSDagStackGuard(M);

	  return M.getGlobalVariable("__security_cookie");
	}

	// Return the function that validates the stack guard. In the MSVC environment
	// this is the CRT's "__security_check_cookie" looked up in M.
	Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
	  const bool IsMSVC = Subtarget->getTargetTriple().isWindowsMSVCEnvironment();
	  if (!IsMSVC)
	    return TargetLowering::getSSPStackGuardCheck(M);

	  return M.getFunction("__security_check_cookie");
	}

	// Return the IR value addressing the SafeStack pointer. Android and Fuchsia
	// keep it in a fixed TLS slot; other targets use the generic implementation.
	Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
	  const bool IsAndroid = Subtarget->isTargetAndroid();
	  if (IsAndroid || Subtarget->isTargetFuchsia()) {
	    // Android: fixed TLS slot for the SafeStack pointer, see TLS_SLOT_SAFESTACK in
	    // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
	    // Fuchsia: <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET as -0x8.
	    return UseTlsOffset(IRB, IsAndroid ? 0x48 : -0x8);
	  }

	  return TargetLowering::getSafeStackPointerLocation(IRB);
	}

	// Returns true when operand 1 of AndI is a constant with exactly one bit set.
	bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
	    const Instruction &AndI) const {
	  // Only sink the 'and' mask to the cmp use block if it masks a single bit,
	  // since that is likely to fold the and/cmp/br into a single tbz
	  // instruction. Sinking may be beneficial in other cases too, but we would
	  // have to check that the cmp would not get folded into the br to form a cbz
	  // for those to be beneficial.
	  if (const auto *SingleBitMask = dyn_cast<ConstantInt>(AndI.getOperand(1)))
	    return SingleBitMask->getValue().isPowerOf2();

	  // A non-constant mask is never considered beneficial.
	  return false;
	}

	bool AArch64TargetLowering::
	shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
	SDValue X, ConstantSDNode XC, ConstantSDNode CC, SDValue Y,
	unsigned OldShiftOpcode, unsigned NewShiftOpcode,
	SelectionDAG &DAG) const {
	// Does baseline recommend not to perform the fold by default?
	if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
	X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
	return false;
	// Else, if this is a vector shift, prefer 'shl'.
	return X.getValueType().isScalarInteger() \|\| NewShiftOpcode == ISD::SHL;
	}

	// Returns false only for minsize functions on targets that are neither
	// Windows nor Darwin; true in every other case. N is not inspected.
	bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
	                                              SDNode *N) const {
	  const bool IsMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
	  const bool KeepShift = IsMinSize && !Subtarget->isTargetWindows() &&
	                         !Subtarget->isTargetDarwin();
	  return !KeepShift;
	}

	// Set the IsSplitCSR flag in the AArch64FunctionInfo of the function that
	// owns the Entry block.
	void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
	  MachineFunction *MF = Entry->getParent();
	  MF->getInfo<AArch64FunctionInfo>()->setIsSplitCSR(true);
	}

	/// For every register in the "callee-saved via copy" list of the function
	/// owning \p Entry, copy the register into a fresh virtual register at the
	/// start of \p Entry and copy it back before the first terminator of each
	/// block in \p Exits. Does nothing when the function has no such list.
	void AArch64TargetLowering::insertCopiesSplitCSR(
	    MachineBasicBlock *Entry,
	    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
	  if (!IStart)
	    return;

	  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
	  MachineBasicBlock::iterator MBBI = Entry->begin();
	  // The register list is zero-terminated, so walk it until a 0 entry.
	  for (const MCPhysReg *I = IStart; *I; ++I) {
	    const TargetRegisterClass *RC = nullptr;
	    if (AArch64::GPR64RegClass.contains(*I))
	      RC = &AArch64::GPR64RegClass;
	    else if (AArch64::FPR64RegClass.contains(*I))
	      RC = &AArch64::FPR64RegClass;
	    else
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

	    Register NewVR = MRI->createVirtualRegister(RC);
	    // Create copy from CSR to a virtual register.
	    // FIXME: this currently does not emit CFI pseudo-instructions, it works
	    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
	    // nounwind. If we want to generalize this later, we may need to emit
	    // CFI pseudo-instructions.
	    assert(Entry->getParent()->getFunction().hasFnAttribute(
	               Attribute::NoUnwind) &&
	           "Function should be nounwind in insertCopiesSplitCSR!");
	    Entry->addLiveIn(*I);
	    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
	        .addReg(*I);

	    // Insert the copy-back instructions right before the terminator.
	    for (auto *Exit : Exits)
	      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
	              TII->get(TargetOpcode::COPY), *I)
	          .addReg(NewVR);
	  }
	}

	// Returns true only for non-vector types in functions carrying the MinSize
	// attribute.
	bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
	  // Integer division on AArch64 is expensive. However, when aggressively
	  // optimizing for code size, a div instruction is preferred because it is
	  // usually smaller than the alternative sequence.
	  // Vector division is the exception: AArch64 has no vector integer division,
	  // so keeping the division loses even on size (it must be scalarized) while
	  // the alternative sequence can be done in vector form.
	  if (VT.isVector())
	    return false;
	  return Attr.hasFnAttribute(Attribute::MinSize);
	}

	/// Returns true when \p VT is a scalar integer type, false otherwise.
	bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
	// We want inc-of-add for scalars and sub-of-not for vectors.
	return VT.isScalarInteger();
	}

	/// Returns true when the subtarget reports hasAggressiveFMA() and \p VT is a
	/// floating-point type.
	bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
	return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
	}

	/// Return the size of va_list in bits for the data layout \p DL.
	///
	/// On Darwin and Windows this is the size of one pointer; on every other
	/// target it is three pointers plus two 32-bit fields.
	unsigned
	AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
	  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
	    return getPointerTy(DL).getSizeInBits();

	  return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
	}

	/// Computes the maximum call frame size on \p MF's frame info, then runs the
	/// generic TargetLoweringBase::finalizeLowering().
	void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
	MF.getFrameInfo().computeMaxCallFrameSize(MF);
	TargetLoweringBase::finalizeLowering(MF);
	}

	// Unlike X86, we let frame lowering assign offsets to all catch objects.
	// Consequently this hook always returns false.
	bool AArch64TargetLowering::needsFixedCatchObjects() const {
	return false;
	}

	bool AArch64TargetLowering::shouldLocalize(
	const MachineInstr &MI, const TargetTransformInfo *TTI) const {
	switch (MI.getOpcode()) {
	case TargetOpcode::G_GLOBAL_VALUE: {
	// On Darwin, TLS global vars get selected into function calls, which
	// we don't want localized, as they can get moved into the middle of a
	// another call sequence.
	const GlobalValue &GV = *MI.getOperand(1).getGlobal();
	if (GV.isThreadLocal() && Subtarget->isTargetMachO())
	return false;
	break;
	}
	// If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
	// localizable.
	case AArch64::ADRP:
	case AArch64::G_ADD_LOW:
	return true;
	default:
	break;
	}
	return TargetLoweringBase::shouldLocalize(MI, TTI);
	}

	bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
	if (isa<ScalableVectorType>(Inst.getType()))
	return true;

	for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
	if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
	return true;

	if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
	if (isa<ScalableVectorType>(AI->getAllocatedType()))
	return true;
	}

	return false;
	}

	// Map a legal fixed length vector type onto the scalable vector type with the
	// same element type that is the largest legal one.
	static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
	  assert(VT.isFixedLengthVector() &&
	         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
	         "Expected legal fixed length vector!");

	  MVT::SimpleValueType ContainerTy;
	  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
	  default:
	    llvm_unreachable("unexpected element type for SVE container");
	  case MVT::i8:
	    ContainerTy = MVT::nxv16i8;
	    break;
	  case MVT::i16:
	    ContainerTy = MVT::nxv8i16;
	    break;
	  case MVT::i32:
	    ContainerTy = MVT::nxv4i32;
	    break;
	  case MVT::i64:
	    ContainerTy = MVT::nxv2i64;
	    break;
	  case MVT::f16:
	    ContainerTy = MVT::nxv8f16;
	    break;
	  case MVT::f32:
	    ContainerTy = MVT::nxv4f32;
	    break;
	  case MVT::f64:
	    ContainerTy = MVT::nxv2f64;
	    break;
	  }
	  return EVT(ContainerTy);
	}

	// Return a PTRUE with active lanes corresponding to the extent of VT.
	//
	// VT must be a legal fixed length vector (asserted). The element count
	// selects the vlN predicate pattern and the element type selects the
	// predicate (mask) type of the returned AArch64ISD::PTRUE node.
	static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
	EVT VT) {
	assert(VT.isFixedLengthVector() &&
	DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
	"Expected legal fixed length vector!");

	// Element count -> predicate pattern; only the power-of-two counts 1..256
	// listed below are handled.
	int PgPattern;
	switch (VT.getVectorNumElements()) {
	default:
	llvm_unreachable("unexpected element count for SVE predicate");
	case 1:
	PgPattern = AArch64SVEPredPattern::vl1;
	break;
	case 2:
	PgPattern = AArch64SVEPredPattern::vl2;
	break;
	case 4:
	PgPattern = AArch64SVEPredPattern::vl4;
	break;
	case 8:
	PgPattern = AArch64SVEPredPattern::vl8;
	break;
	case 16:
	PgPattern = AArch64SVEPredPattern::vl16;
	break;
	case 32:
	PgPattern = AArch64SVEPredPattern::vl32;
	break;
	case 64:
	PgPattern = AArch64SVEPredPattern::vl64;
	break;
	case 128:
	PgPattern = AArch64SVEPredPattern::vl128;
	break;
	case 256:
	PgPattern = AArch64SVEPredPattern::vl256;
	break;
	}

	// TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
	// use AArch64SVEPredPattern::all, which can enable the use of unpredicated
	// variants of instructions when available.

	// Element type -> predicate type; integer and floating-point elements of the
	// same width share one mask type.
	MVT MaskVT;
	switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
	default:
	llvm_unreachable("unexpected element type for SVE predicate");
	case MVT::i8:
	MaskVT = MVT::nxv16i1;
	break;
	case MVT::i16:
	case MVT::f16:
	MaskVT = MVT::nxv8i1;
	break;
	case MVT::i32:
	case MVT::f32:
	MaskVT = MVT::nxv4i1;
	break;
	case MVT::i64:
	case MVT::f64:
	MaskVT = MVT::nxv2i1;
	break;
	}

	// The pattern is passed as an i64 target constant operand.
	return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
	DAG.getTargetConstant(PgPattern, DL, MVT::i64));
	}

	// Return an all-lanes PTRUE whose type is VT with its elements replaced by
	// i1. VT must be a legal scalable vector (asserted).
	static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
	                                             EVT VT) {
	  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
	         "Expected legal scalable vector!");
	  return getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1),
	                  AArch64SVEPredPattern::all);
	}

	// Dispatch to the fixed length or the scalable predicate builder depending on
	// the kind of vector type VT is.
	static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
	  return VT.isFixedLengthVector()
	             ? getPredicateForFixedLengthVector(DAG, DL, VT)
	             : getPredicateForScalableVector(DAG, DL, VT);
	}

	// Widen the fixed length vector V to the scalable type VT by inserting it at
	// index 0 of an undef VT vector.
	static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
	  assert(VT.isScalableVector() &&
	         "Expected to convert into a scalable vector!");
	  assert(V.getValueType().isFixedLengthVector() &&
	         "Expected a fixed length vector operand!");

	  SDLoc Loc(V);
	  SDValue Index0 = DAG.getConstant(0, Loc, MVT::i64);
	  SDValue Undef = DAG.getUNDEF(VT);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, Loc, VT, Undef, V, Index0);
	}

	// Narrow the scalable vector V to the fixed length type VT by extracting the
	// subvector that starts at index 0.
	static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
	  assert(VT.isFixedLengthVector() &&
	         "Expected to convert into a fixed length vector!");
	  assert(V.getValueType().isScalableVector() &&
	         "Expected a scalable vector operand!");

	  SDLoc Loc(V);
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, Loc, VT, V,
	                     DAG.getConstant(0, Loc, MVT::i64));
	}

	// Convert all fixed length vector loads larger than NEON to masked_loads.
	//
	// Op must be a load node. The result merges two values: the loaded data
	// converted back to the fixed length type, and the chain produced by the new
	// masked load.
	SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
	    SDValue Op, SelectionDAG &DAG) const {
	  auto Load = cast<LoadSDNode>(Op);

	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

	  // Load into the scalable container type, predicated to VT's lanes, with an
	  // undef pass-through for the inactive lanes.
	  auto NewLoad = DAG.getMaskedLoad(
	      ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
	      getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
	      Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
	      Load->getExtensionType());

	  auto Result = convertFromScalableVector(DAG, VT, NewLoad);
	  // Hand back the chain produced by the replacement load (result #1), not the
	  // original load's input chain; otherwise operations chained after this load
	  // would no longer be ordered after the memory access.
	  SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
	  return DAG.getMergeValues(MergedValues, DL);
	}

	// Convert all fixed length vector stores larger than NEON to masked_stores:
	// the stored value is widened to its scalable container and written under a
	// predicate covering the fixed length type's lanes.
	SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
	    SDValue Op, SelectionDAG &DAG) const {
	  auto *St = cast<StoreSDNode>(Op);
	  SDLoc DL(Op);

	  SDValue StoredVal = St->getValue();
	  EVT FixedVT = StoredVal.getValueType();
	  EVT ContainerVT = getContainerForFixedLengthVector(DAG, FixedVT);

	  SDValue WideVal = convertToScalableVector(DAG, ContainerVT, StoredVal);
	  SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, FixedVT);

	  return DAG.getMaskedStore(St->getChain(), DL, WideVal, St->getBasePtr(),
	                            St->getOffset(), Pg, St->getMemoryVT(),
	                            St->getMemOperand(), St->getAddressingMode(),
	                            St->isTruncatingStore());
	}

	// Lower a fixed length vector truncate: operand 0 is widened to its scalable
	// container, narrowed one element-size step at a time with BITCAST + UZP1,
	// and the result is converted back to the fixed length result type.
	SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
	SDValue Op, SelectionDAG &DAG) const {
	EVT VT = Op.getValueType();
	assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

	SDLoc DL(Op);
	SDValue Val = Op.getOperand(0);
	EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
	Val = convertToScalableVector(DAG, ContainerVT, Val);

	// Repeatedly truncate Val until the result is of the desired element type.
	// The switch is entered at the source container type; each case halves the
	// element width and falls through to the next until VT's element type is
	// reached.
	switch (ContainerVT.getSimpleVT().SimpleTy) {
	default:
	llvm_unreachable("unimplemented container type");
	case MVT::nxv2i64:
	// i64 -> i32 step.
	Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
	Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
	if (VT.getVectorElementType() == MVT::i32)
	break;
	LLVM_FALLTHROUGH;
	case MVT::nxv4i32:
	// i32 -> i16 step.
	Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
	Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
	if (VT.getVectorElementType() == MVT::i16)
	break;
	LLVM_FALLTHROUGH;
	case MVT::nxv8i16:
	// i16 -> i8 step; i8 is the narrowest element type handled.
	Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
	Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
	assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
	break;
	}

	return convertFromScalableVector(DAG, VT, Val);
	}

	/// Rebuild \p Op as a node with opcode \p NewOp whose first operand is a
	/// governing predicate, followed by the original operands.
	///
	/// Fixed length vectors for which useSVEForFixedLengthVectorVT() holds are
	/// widened to their scalable container first and the result is narrowed back
	/// to the original type; scalable vectors are passed through unchanged.
	SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
	                                                   SelectionDAG &DAG,
	                                                   unsigned NewOp) const {
	  EVT VT = Op.getValueType();
	  SDLoc DL(Op);
	  auto Pg = getPredicateForVector(DAG, DL, VT);

	  if (useSVEForFixedLengthVectorVT(VT)) {
	    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

	    // Create list of operands by converting existing ones to scalable types.
	    SmallVector<SDValue, 4> Operands = {Pg};
	    for (const SDValue &V : Op->op_values()) {
	      // Condition codes are forwarded as they are.
	      if (isa<CondCodeSDNode>(V)) {
	        Operands.push_back(V);
	        continue;
	      }

	      assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
	             "Only fixed length vectors are supported!");
	      Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
	    }

	    auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
	    return convertFromScalableVector(DAG, VT, ScalableRes);
	  }

	  assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");

	  SmallVector<SDValue, 4> Operands = {Pg};
	  for (const SDValue &V : Op->op_values()) {
	    assert((isa<CondCodeSDNode>(V) || V.getValueType().isScalableVector()) &&
	           "Only scalable vectors are supported!");
	    Operands.push_back(V);
	  }

	  return DAG.getNode(NewOp, DL, VT, Operands);
	}
	diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
	index 83a488afc797..3e9c8c7b6df2 100644
	--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
	+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
	@@ -1,705 +1,728 @@
	//===- AArch64RegisterInfo.cpp - AArch64 Register Information -------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the AArch64 implementation of the TargetRegisterInfo
	// class.
	//
	//===----------------------------------------------------------------------===//

	#include "AArch64RegisterInfo.h"
	#include "AArch64FrameLowering.h"
	#include "AArch64InstrInfo.h"
	#include "AArch64MachineFunctionInfo.h"
	#include "AArch64StackOffset.h"
	#include "AArch64Subtarget.h"
	#include "MCTargetDesc/AArch64AddressingModes.h"
	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/RegisterScavenging.h"
	#include "llvm/CodeGen/TargetFrameLowering.h"
	#include "llvm/IR/DiagnosticInfo.h"
	#include "llvm/IR/Function.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetOptions.h"

	using namespace llvm;

	#define GET_REGINFO_TARGET_DESC
	#include "AArch64GenRegisterInfo.inc"

	// Construct the register info for triple TT: the generated base class is
	// initialized with AArch64::LR, the triple is stored, and the LLVM-to-CodeView
	// register mapping is initialized for this object.
	AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
	: AArch64GenRegisterInfo(AArch64::LR), TT(TT) {
	AArch64_MC::initLLVMToCVRegMapping(this);
	}

	-static bool hasSVEArgsOrReturn(const MachineFunction *MF) {
	+/// Return whether the register needs a CFI entry. Not all unwinders may know
	+/// about SVE registers, so we assume the lowest common denominator, i.e. the
	+/// callee-saves required by the base ABI. For the SVE registers z8-z15 only the
	+/// lower 64-bits (d8-d15) need to be saved. The lower 64-bits subreg is
	+/// returned in \p RegToUseForCFI.
	+///
	+/// \p RegToUseForCFI is left untouched when \p Reg is a predicate (PPR)
	+/// register, and is set to \p Reg itself for any register that is neither a
	+/// PPR nor a ZPR register.
	+bool AArch64RegisterInfo::regNeedsCFI(unsigned Reg,
	+ unsigned &RegToUseForCFI) const {
	+ // Predicate registers never get a CFI entry.
	+ if (AArch64::PPRRegClass.contains(Reg))
	+ return false;
	+
	+ // For a ZPR register, report its dsub sub-register and require CFI only if
	+ // that sub-register appears in the zero-terminated base AAPCS save list.
	+ if (AArch64::ZPRRegClass.contains(Reg)) {
	+ RegToUseForCFI = getSubReg(Reg, AArch64::dsub);
	+ for (int I = 0; CSR_AArch64_AAPCS_SaveList[I]; ++I) {
	+ if (CSR_AArch64_AAPCS_SaveList[I] == RegToUseForCFI)
	+ return true;
	+ }
	+ return false;
	+ }
	+
	+ // Every other register needs CFI and is described as itself.
	+ RegToUseForCFI = Reg;
	+ return true;
	+}
	+
	+/// Return true when the IR function behind \p MF returns a scalable vector or
	+/// takes at least one argument of scalable vector type.
	+bool AArch64RegisterInfo::hasSVEArgsOrReturn(const MachineFunction *MF) {
	  const Function &F = MF->getFunction();
	  return isa<ScalableVectorType>(F.getReturnType()) ||
	         any_of(F.args(), [](const Argument &Arg) {
	           return isa<ScalableVectorType>(Arg.getType());
	         });
	}

	// Select the zero-terminated callee-saved register list for MF. The checks
	// are ordered: the first matching rule wins.
	const MCPhysReg *
	AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
	  assert(MF && "Invalid MachineFunction pointer.");

	  const Function &F = MF->getFunction();
	  const CallingConv::ID CC = F.getCallingConv();
	  const AArch64Subtarget &STI = MF->getSubtarget<AArch64Subtarget>();

	  // GHC set of callee saved regs is empty as all those regs are
	  // used for passing STG regs around
	  if (CC == CallingConv::GHC)
	    return CSR_AArch64_NoRegs_SaveList;
	  if (CC == CallingConv::AnyReg)
	    return CSR_AArch64_AllRegs_SaveList;

	  // Darwin has its own CSR_AArch64_AAPCS_SaveList, which means most CSR save
	  // lists depending on that will need to have their Darwin variant as well.
	  if (STI.isTargetDarwin())
	    return getDarwinCalleeSavedRegs(MF);

	  if (CC == CallingConv::CFGuard_Check)
	    return CSR_Win_AArch64_CFGuard_Check_SaveList;
	  if (STI.isTargetWindows())
	    return CSR_Win_AArch64_AAPCS_SaveList;
	  if (CC == CallingConv::AArch64_VectorCall)
	    return CSR_AArch64_AAVPCS_SaveList;
	  if (CC == CallingConv::AArch64_SVE_VectorCall)
	    return CSR_AArch64_SVE_AAPCS_SaveList;

	  const bool UsesSwiftError =
	      STI.getTargetLowering()->supportSwiftError() &&
	      F.getAttributes().hasAttrSomewhere(Attribute::SwiftError);
	  if (UsesSwiftError)
	    return CSR_AArch64_AAPCS_SwiftError_SaveList;

	  if (CC == CallingConv::PreserveMost)
	    return CSR_AArch64_RT_MostRegs_SaveList;

	  // This is for OSes other than Windows; Windows is a separate case further
	  // above.
	  if (CC == CallingConv::Win64)
	    return CSR_AArch64_AAPCS_X18_SaveList;

	  return hasSVEArgsOrReturn(MF) ? CSR_AArch64_SVE_AAPCS_SaveList
	                                : CSR_AArch64_AAPCS_SaveList;
	}

	// Darwin flavour of getCalleeSavedRegs(): select the zero-terminated
	// callee-saved register list for MF, reporting a fatal error for calling
	// conventions that are unsupported on Darwin.
	const MCPhysReg *
	AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const {
	  assert(MF && "Invalid MachineFunction pointer.");
	  assert(MF->getSubtarget<AArch64Subtarget>().isTargetDarwin() &&
	         "Invalid subtarget for getDarwinCalleeSavedRegs");

	  const Function &F = MF->getFunction();
	  const CallingConv::ID CC = F.getCallingConv();

	  // Conventions decided purely by the calling convention come first.
	  switch (CC) {
	  case CallingConv::CFGuard_Check:
	    report_fatal_error(
	        "Calling convention CFGuard_Check is unsupported on Darwin.");
	  case CallingConv::AArch64_VectorCall:
	    return CSR_Darwin_AArch64_AAVPCS_SaveList;
	  case CallingConv::AArch64_SVE_VectorCall:
	    report_fatal_error(
	        "Calling convention SVE_VectorCall is unsupported on Darwin.");
	  case CallingConv::CXX_FAST_TLS:
	    return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR()
	               ? CSR_Darwin_AArch64_CXX_TLS_PE_SaveList
	               : CSR_Darwin_AArch64_CXX_TLS_SaveList;
	  default:
	    break;
	  }

	  // swifterror takes precedence over PreserveMost.
	  const bool UsesSwiftError =
	      MF->getSubtarget<AArch64Subtarget>()
	          .getTargetLowering()
	          ->supportSwiftError() &&
	      F.getAttributes().hasAttrSomewhere(Attribute::SwiftError);
	  if (UsesSwiftError)
	    return CSR_Darwin_AArch64_AAPCS_SwiftError_SaveList;

	  if (CC == CallingConv::PreserveMost)
	    return CSR_Darwin_AArch64_RT_MostRegs_SaveList;
	  return CSR_Darwin_AArch64_AAPCS_SaveList;
	}

	// Return the list of callee-saved registers that are preserved through
	// copies, or nullptr unless MF uses CXX_FAST_TLS with split CSR enabled.
	const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
	    const MachineFunction *MF) const {
	  assert(MF && "Invalid MachineFunction pointer.");

	  if (MF->getFunction().getCallingConv() != CallingConv::CXX_FAST_TLS)
	    return nullptr;
	  if (!MF->getInfo<AArch64FunctionInfo>()->isSplitCSR())
	    return nullptr;

	  return CSR_Darwin_AArch64_CXX_TLS_ViaCopy_SaveList;
	}

	/// Install on \p MF's register info a callee-saved list made of the default
	/// list for \p MF followed by every GPR64common register the subtarget marks
	/// as custom callee-saved.
	void AArch64RegisterInfo::UpdateCustomCalleeSavedRegs(
	    MachineFunction &MF) const {
	  const MCPhysReg *CSRs = getCalleeSavedRegs(&MF);
	  SmallVector<MCPhysReg, 32> UpdatedCSRs;
	  // Copy the default list; it is zero-terminated, so stop at the 0 entry.
	  for (const MCPhysReg *I = CSRs; *I; ++I)
	    UpdatedCSRs.push_back(*I);

	  for (size_t i = 0; i < AArch64::GPR64commonRegClass.getNumRegs(); ++i) {
	    if (MF.getSubtarget<AArch64Subtarget>().isXRegCustomCalleeSaved(i)) {
	      UpdatedCSRs.push_back(AArch64::GPR64commonRegClass.getRegister(i));
	    }
	  }
	  // Register lists are zero-terminated.
	  UpdatedCSRs.push_back(0);
	  MF.getRegInfo().setCalleeSavedRegs(UpdatedCSRs);
	}

	// Return the sub-class of RC that supports sub-register index Idx. Two
	// GPR "all" classes are special-cased for hsub; everything else is answered
	// by the TableGen-generated implementation.
	const TargetRegisterClass *
	AArch64RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
	                                           unsigned Idx) const {
	  // Edge case for GPR/FPR register classes.
	  if (Idx == AArch64::hsub) {
	    if (RC == &AArch64::GPR32allRegClass)
	      return &AArch64::FPR32RegClass;
	    if (RC == &AArch64::GPR64allRegClass)
	      return &AArch64::FPR64RegClass;
	  }

	  // Forward to TableGen's default version.
	  return AArch64GenRegisterInfo::getSubClassWithSubReg(RC, Idx);
	}

	// Darwin flavour of getCallPreservedMask(): select the call-preserved
	// register mask for calling convention CC, reporting a fatal error for
	// conventions that are unsupported on Darwin.
	const uint32_t *
	AArch64RegisterInfo::getDarwinCallPreservedMask(const MachineFunction &MF,
	                                                CallingConv::ID CC) const {
	  assert(MF.getSubtarget<AArch64Subtarget>().isTargetDarwin() &&
	         "Invalid subtarget for getDarwinCallPreservedMask");

	  // Conventions decided purely by the calling convention come first.
	  switch (CC) {
	  case CallingConv::CXX_FAST_TLS:
	    return CSR_Darwin_AArch64_CXX_TLS_RegMask;
	  case CallingConv::AArch64_VectorCall:
	    return CSR_Darwin_AArch64_AAVPCS_RegMask;
	  case CallingConv::AArch64_SVE_VectorCall:
	    report_fatal_error(
	        "Calling convention SVE_VectorCall is unsupported on Darwin.");
	  case CallingConv::CFGuard_Check:
	    report_fatal_error(
	        "Calling convention CFGuard_Check is unsupported on Darwin.");
	  default:
	    break;
	  }

	  // swifterror takes precedence over PreserveMost.
	  const bool UsesSwiftError =
	      MF.getSubtarget<AArch64Subtarget>()
	          .getTargetLowering()
	          ->supportSwiftError() &&
	      MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError);
	  if (UsesSwiftError)
	    return CSR_Darwin_AArch64_AAPCS_SwiftError_RegMask;

	  if (CC == CallingConv::PreserveMost)
	    return CSR_Darwin_AArch64_RT_MostRegs_RegMask;
	  return CSR_Darwin_AArch64_AAPCS_RegMask;
	}

	// Select the call-preserved register mask for calling convention CC. Most
	// masks come in two variants; the "_SCS" one is used when the calling
	// function carries the ShadowCallStack attribute.
	const uint32_t *
	AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
	                                          CallingConv::ID CC) const {
	  const Function &F = MF.getFunction();
	  const bool HasSCS = F.hasFnAttribute(Attribute::ShadowCallStack);
	  const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();

	  // Pick between the shadow-call-stack and the plain variant of a mask.
	  auto Pick = [HasSCS](const uint32_t *WithSCS, const uint32_t *Plain) {
	    return HasSCS ? WithSCS : Plain;
	  };

	  // This is academic because all GHC calls are (supposed to be) tail calls
	  if (CC == CallingConv::GHC)
	    return Pick(CSR_AArch64_NoRegs_SCS_RegMask, CSR_AArch64_NoRegs_RegMask);
	  if (CC == CallingConv::AnyReg)
	    return Pick(CSR_AArch64_AllRegs_SCS_RegMask, CSR_AArch64_AllRegs_RegMask);

	  // All the following calling conventions are handled differently on Darwin.
	  if (STI.isTargetDarwin()) {
	    if (HasSCS)
	      report_fatal_error("ShadowCallStack attribute not supported on Darwin.");
	    return getDarwinCallPreservedMask(MF, CC);
	  }

	  if (CC == CallingConv::AArch64_VectorCall)
	    return Pick(CSR_AArch64_AAVPCS_SCS_RegMask, CSR_AArch64_AAVPCS_RegMask);
	  if (CC == CallingConv::AArch64_SVE_VectorCall)
	    return Pick(CSR_AArch64_SVE_AAPCS_SCS_RegMask,
	                CSR_AArch64_SVE_AAPCS_RegMask);
	  // CFGuard_Check has a single mask, regardless of ShadowCallStack.
	  if (CC == CallingConv::CFGuard_Check)
	    return CSR_Win_AArch64_CFGuard_Check_RegMask;

	  const bool UsesSwiftError =
	      STI.getTargetLowering()->supportSwiftError() &&
	      F.getAttributes().hasAttrSomewhere(Attribute::SwiftError);
	  if (UsesSwiftError)
	    return Pick(CSR_AArch64_AAPCS_SwiftError_SCS_RegMask,
	                CSR_AArch64_AAPCS_SwiftError_RegMask);

	  if (CC == CallingConv::PreserveMost)
	    return Pick(CSR_AArch64_RT_MostRegs_SCS_RegMask,
	                CSR_AArch64_RT_MostRegs_RegMask);

	  return Pick(CSR_AArch64_AAPCS_SCS_RegMask, CSR_AArch64_AAPCS_RegMask);
	}

	/// Returns the register mask used for TLS calls: the Darwin mask on Darwin,
	/// otherwise the ELF mask (the target is asserted to be ELF in that case).
	const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
	  if (!TT.isOSDarwin()) {
	    assert(TT.isOSBinFormatELF() && "Invalid target");
	    return CSR_AArch64_TLS_ELF_RegMask;
	  }

	  return CSR_Darwin_AArch64_TLS_RegMask;
	}

	/// Replaces \p *Mask with a function-local copy in which every X register
	/// the subtarget reports as custom callee-saved (and each of its
	/// sub-registers, the register itself included) is additionally marked.
	///
	/// \param MF   Function whose allocator provides storage for the new mask
	///             and whose subtarget is queried.
	/// \param Mask In/out: points at the mask to copy from; on return it points
	///             at the updated copy.
	void AArch64RegisterInfo::UpdateCustomCallPreservedMask(MachineFunction &MF,
	                                                        const uint32_t **Mask) const {
	  uint32_t *UpdatedMask = MF.allocateRegMask();
	  unsigned RegMaskSize = MachineOperand::getRegMaskSize(getNumRegs());
	  // Copy the words of the incoming mask (*Mask), not the pointer-to-pointer.
	  memcpy(UpdatedMask, *Mask, sizeof(UpdatedMask[0]) * RegMaskSize);

	  for (size_t i = 0; i < AArch64::GPR64commonRegClass.getNumRegs(); ++i) {
	    if (MF.getSubtarget<AArch64Subtarget>().isXRegCustomCalleeSaved(i)) {
	      for (MCSubRegIterator SubReg(AArch64::GPR64commonRegClass.getRegister(i),
	                                   this, true);
	           SubReg.isValid(); ++SubReg) {
	        // See TargetRegisterInfo::getCallPreservedMask for how to interpret the
	        // register mask.
	        UpdatedMask[*SubReg / 32] |= 1u << (*SubReg % 32);
	      }
	    }
	  }
	  *Mask = UpdatedMask;
	}

	/// Unconditionally returns CSR_AArch64_NoRegs_RegMask.
	const uint32_t *AArch64RegisterInfo::getNoPreservedMask() const {
	  const uint32_t *const NoRegsMask = CSR_AArch64_NoRegs_RegMask;
	  return NoRegsMask;
	}

	/// Returns the "this-return" variant of the AAPCS call-preserved mask,
	/// choosing the Darwin table on Darwin subtargets. \p CC must not be GHC.
	const uint32_t *
	AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
	                                                CallingConv::ID CC) const {
	  // The mask returned here should equal the one from getCallPreservedMask,
	  // except that it also preserves the register carrying the first i64
	  // argument (which must be the same register that carries a single i64
	  // return value).
	  //
	  // If a calling convention does not use one register for both purposes,
	  // this function should return NULL (does not currently apply).
	  assert(CC != CallingConv::GHC && "should not be GHC calling convention.");

	  const bool IsDarwin = MF.getSubtarget<AArch64Subtarget>().isTargetDarwin();
	  return IsDarwin ? CSR_Darwin_AArch64_AAPCS_ThisReturn_RegMask
	                  : CSR_AArch64_AAPCS_ThisReturn_RegMask;
	}

	/// Unconditionally returns CSR_AArch64_StackProbe_Windows_RegMask.
	const uint32_t *AArch64RegisterInfo::getWindowsStackProbePreservedMask() const {
	  const uint32_t *const ProbeMask = CSR_AArch64_StackProbe_Windows_RegMask;
	  return ProbeMask;
	}

	/// Builds the set of registers reserved for \p MF.
	///
	/// Always reserved: WSP and WZR. Conditionally reserved: W29 (function has a
	/// frame pointer or the target OS is Darwin), every register the subtarget
	/// reports via isXRegisterReserved, W19 (function has a base pointer) and
	/// W16 (function has the SpeculativeLoadHardening attribute). Each entry is
	/// added through markSuperRegs.
	BitVector
	AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
	  const AArch64FrameLowering *TFI = getFrameLowering(MF);

	  // FIXME: avoid re-calculating this every time.
	  BitVector Reserved(getNumRegs());
	  markSuperRegs(Reserved, AArch64::WSP);
	  markSuperRegs(Reserved, AArch64::WZR);

	  if (TFI->hasFP(MF) || TT.isOSDarwin())
	    markSuperRegs(Reserved, AArch64::W29);

	  for (size_t i = 0; i < AArch64::GPR32commonRegClass.getNumRegs(); ++i) {
	    if (MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(i))
	      markSuperRegs(Reserved, AArch64::GPR32commonRegClass.getRegister(i));
	  }

	  if (hasBasePointer(MF))
	    markSuperRegs(Reserved, AArch64::W19);

	  // SLH uses register W16/X16 as the taint register.
	  if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
	    markSuperRegs(Reserved, AArch64::W16);

	  assert(checkAllSuperRegsMarked(Reserved));
	  return Reserved;
	}

	bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
	MCRegister Reg) const {
	return getReservedRegs(MF)[Reg];
	}

	bool AArch64RegisterInfo::isAnyArgRegReserved(const MachineFunction &MF) const {
	return std::any_of(std::begin(*AArch64::GPR64argRegClass.MC),
	std::end(*AArch64::GPR64argRegClass.MC),
	[this, &MF](MCPhysReg r){return isReservedReg(MF, r);});
	}

	void AArch64RegisterInfo::emitReservedArgRegCallError(
	const MachineFunction &MF) const {
	const Function &F = MF.getFunction();
	F.getContext().diagnose(DiagnosticInfoUnsupported{F, "AArch64 doesn't support"
	" function calls if any of the argument registers is reserved."});
	}

	bool AArch64RegisterInfo::isAsmClobberable(const MachineFunction &MF,
	MCRegister PhysReg) const {
	return !isReservedReg(MF, PhysReg);
	}

	/// Returns true only for the zero registers WZR and XZR.
	bool AArch64RegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
	  return PhysReg == AArch64::WZR || PhysReg == AArch64::XZR;
	}

	/// Always answers with GPR64sp; neither \p MF nor \p Kind is consulted.
	const TargetRegisterClass *
	AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF,
	                                        unsigned Kind) const {
	  const TargetRegisterClass *const PointerRC = &AArch64::GPR64spRegClass;
	  return PointerRC;
	}

	/// Returns GPR64 for the CCR class and \p RC itself for every other class.
	const TargetRegisterClass *
	AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
	  // Only MSR & MRS copy NZCV.
	  return RC == &AArch64::CCRRegClass ? &AArch64::GPR64RegClass : RC;
	}

	/// The base register is always X19.
	unsigned AArch64RegisterInfo::getBaseRegister() const {
	  return AArch64::X19;
	}

	/// Decides whether \p MF should use a base pointer.
	///
	/// Returns false unless the function has variable sized objects or EH
	/// funclets. In that case it returns true when the stack needs
	/// realignment, when the subtarget has SVE and the SVE stack size is either
	/// not yet calculated or non-zero, or when the local frame size is at least
	/// 256 bytes.
	bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
	  const MachineFrameInfo &MFI = MF.getFrameInfo();

	  // In the presence of variable sized objects or funclets, if the fixed stack
	  // size is large enough that referencing from the FP won't result in things
	  // being in range relatively often, we can use a base pointer to allow access
	  // from the other direction like the SP normally works.
	  //
	  // Furthermore, if both variable sized objects are present, and the
	  // stack needs to be dynamically re-aligned, the base pointer is the only
	  // reliable way to reference the locals.
	  if (MFI.hasVarSizedObjects() || MF.hasEHFunclets()) {
	    if (needsStackRealignment(MF))
	      return true;

	    if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
	      const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	      // Frames that have variable sized objects and scalable SVE objects,
	      // should always use a basepointer.
	      if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE())
	        return true;
	    }

	    // Conservatively estimate whether the negative offset from the frame
	    // pointer will be sufficient to reach. If a function has a smallish
	    // frame, it's less likely to have lots of spills and callee saved
	    // space, so it's all more likely to be within range of the frame pointer.
	    // If it's wrong, we'll materialize the constant and still get to the
	    // object; it's just suboptimal. Negative offsets use the unscaled
	    // load/store instructions, which have a 9-bit signed immediate.
	    return MFI.getLocalFrameSize() >= 256;
	  }

	  return false;
	}

	/// Returns FP when \p MF has a frame pointer and SP otherwise.
	Register
	AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
	  const AArch64FrameLowering *FrameLowering = getFrameLowering(MF);
	  if (FrameLowering->hasFP(MF))
	    return AArch64::FP;
	  return AArch64::SP;
	}

	/// Always true; \p MF is not inspected.
	bool AArch64RegisterInfo::requiresRegisterScavenging(
	    const MachineFunction &MF) const {
	  constexpr bool AlwaysRequired = true;
	  return AlwaysRequired;
	}

	/// Always true; \p MF is not inspected.
	bool AArch64RegisterInfo::requiresVirtualBaseRegisters(
	    const MachineFunction &MF) const {
	  constexpr bool AlwaysRequired = true;
	  return AlwaysRequired;
	}

	/// Returns true when the function has a frame pointer, needs no stack
	/// realignment and has a zero SVE stack size. When the subtarget has SVE,
	/// the SVE stack size must already have been calculated (asserted).
	bool
	AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
	  // This function indicates whether the emergency spillslot should be placed
	  // close to the beginning of the stackframe (closer to FP) or the end
	  // (closer to SP).
	  //
	  // The beginning works most reliably if we have a frame pointer.
	  // In the presence of any non-constant space between FP and locals,
	  // (e.g. in case of stack realignment or a scalable SVE area), it is
	  // better to use SP or BP.
	  const AArch64FrameLowering &TFI = *getFrameLowering(MF);
	  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	  assert((!MF.getSubtarget<AArch64Subtarget>().hasSVE() ||
	          AFI->hasCalculatedStackSizeSVE()) &&
	         "Expected SVE area to be calculated by this point");
	  return TFI.hasFP(MF) && !needsStackRealignment(MF) && !AFI->getStackSizeSVE();
	}

	/// Always true; \p MF is not inspected.
	bool AArch64RegisterInfo::requiresFrameIndexScavenging(
	    const MachineFunction &MF) const {
	  constexpr bool AlwaysRequired = true;
	  return AlwaysRequired;
	}

	/// Returns true when frame-pointer elimination is disabled for \p MF and the
	/// function adjusts the stack, or when the function has variable sized
	/// objects, or when its frame address is taken.
	bool
	AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
	  const MachineFrameInfo &MFI = MF.getFrameInfo();
	  if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI.adjustsStack())
	    return true;
	  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken();
	}

	/// needsFrameBaseReg - Returns true if the instruction's frame index
	/// reference would be better served by a base register other than FP
	/// or SP. Used by LocalStackFrameAllocation to determine which frame index
	/// references it should create new base registers for.
	bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
	                                            int64_t Offset) const {
	  // Sanity-walk the operands up to the first frame index; the instruction is
	  // required to have one.
	  unsigned OpIdx = 0;
	  while (!MI->getOperand(OpIdx).isFI()) {
	    assert(OpIdx < MI->getNumOperands() &&
	           "Instr doesn't have FrameIndex operand!");
	    ++OpIdx;
	  }

	  // It's the load/store FI references that cause issues, as it can be difficult
	  // to materialize the offset if it won't fit in the literal field. Estimate
	  // based on the size of the local frame and some conservative assumptions
	  // about the rest of the stack frame (note, this is pre-regalloc, so
	  // we don't know everything for certain yet) whether this offset is likely
	  // to be out of range of the immediate. Return true if so.

	  // Virtual base registers are only generated for loads and stores, so
	  // anything else is answered with false right away.
	  if (!(MI->mayLoad() || MI->mayStore()))
	    return false;

	  // Without a virtual base register, if the function has variable sized
	  // objects, all fixed-size local references will be via the frame pointer,
	  // Approximate the offset and see if it's legal for the instruction.
	  // Note that the incoming offset is based on the SP value at function entry,
	  // so it'll be negative.
	  MachineFunction &MF = *MI->getParent()->getParent();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  const AArch64FrameLowering *TFI = getFrameLowering(MF);

	  // Estimate an offset from the frame pointer.
	  // Conservatively assume all GPR callee-saved registers get pushed.
	  // FP, LR, X19-X28, D8-D15. 64-bits each.
	  const int64_t FPOffset = Offset - 16 * 20;

	  // Estimate an offset from the stack pointer.
	  // The incoming offset is relating to the SP at the start of the function,
	  // but when we access the local it'll be relative to the SP after local
	  // allocation, so adjust our SP-relative offset by that allocation size.
	  int64_t SPOffset = Offset;
	  SPOffset += MFI.getLocalFrameSize();
	  // Assume that we'll have at least some spill slots allocated.
	  // FIXME: This is a total SWAG number. We should run some statistics
	  // and pick a real one.
	  SPOffset += 128; // 128 bytes of spill slots

	  // If there is a frame pointer, try using it.
	  // The FP is only available if there is no dynamic realignment. We
	  // don't know for sure yet whether we'll need that, so we guess based
	  // on whether there are any local variables that would trigger it.
	  if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, AArch64::FP, FPOffset))
	    return false;

	  // If we can reference via the stack pointer or base pointer, try that.
	  // FIXME: This (and the code that resolves the references) can be improved
	  // to only disallow SP relative references in the live range of
	  // the VLA(s). In practice, it's unclear how much difference that
	  // would make, but it may be worth doing.
	  if (isFrameOffsetLegal(MI, AArch64::SP, SPOffset))
	    return false;

	  // The estimated offset likely isn't legal. A virtual base register is
	  // wanted only if offset 0 is legal for this instruction; if even that is
	  // illegal, a base register would not help.
	  return isFrameOffsetLegal(MI, AArch64::SP, 0);
	}

	bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
	Register BaseReg,
	int64_t Offset) const {
	assert(MI && "Unable to get the legal offset for nil instruction.");
	StackOffset SaveOffset(Offset, MVT::i8);
	return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal;
	}

	/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx
	/// at the beginning of the basic block. The definition is a single ADDXri of
	/// the frame index and \p Offset with an LSL #0 shifter operand.
	void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
	                                                       Register BaseReg,
	                                                       int FrameIdx,
	                                                       int64_t Offset) const {
	  MachineBasicBlock::iterator InsertPt = MBB->begin();
	  // Borrow the debug location of the first instruction when the block is not
	  // empty; otherwise it stays "unknown".
	  DebugLoc DL =
	      (InsertPt != MBB->end()) ? InsertPt->getDebugLoc() : DebugLoc();

	  const MachineFunction &MF = *MBB->getParent();
	  const AArch64InstrInfo *TII =
	      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
	  const MCInstrDesc &AddDesc = TII->get(AArch64::ADDXri);

	  // Constrain BaseReg to the class ADDXri expects for operand 0.
	  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
	  MRI.constrainRegClass(BaseReg, TII->getRegClass(AddDesc, 0, this, MF));

	  const unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
	  BuildMI(*MBB, InsertPt, DL, AddDesc, BaseReg)
	      .addFrameIndex(FrameIdx)
	      .addImm(Offset)
	      .addImm(Shifter);
	}

	/// Rewrites the first frame-index operand of \p MI to use \p BaseReg plus a
	/// byte offset of \p Offset. The rewrite is asserted to succeed.
	void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
	                                            int64_t Offset) const {
	  // ARM doesn't need the general 64-bit offsets
	  StackOffset ByteOffset(Offset, MVT::i8);

	  // Locate the first frame-index operand.
	  unsigned FIOperandNum = 0;
	  for (;;) {
	    if (MI.getOperand(FIOperandNum).isFI())
	      break;
	    ++FIOperandNum;
	    assert(FIOperandNum < MI.getNumOperands() &&
	           "Instr doesn't have FrameIndex operand!");
	  }

	  const MachineFunction *MF = MI.getParent()->getParent();
	  const AArch64InstrInfo *TII =
	      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();

	  bool Done =
	      rewriteAArch64FrameIndex(MI, FIOperandNum, BaseReg, ByteOffset, TII);
	  assert(Done && "Unable to resolve frame index!");
	  (void)Done;
	}

	// Create a scratch register for the frame index elimination in an instruction.
	// This function has special handling of stack tagging loop pseudos, in which
	// case it can also change the instruction opcode (but not the operands).
	static Register
	createScratchRegisterForInstruction(MachineInstr &MI,
	                                    const AArch64InstrInfo *TII) {
	  // ST*Gloop have a reserved scratch register in operand 1. Use it, and also
	  // replace the instruction with the writeback variant because it will now
	  // satisfy the operand constraints for it.
	  switch (MI.getOpcode()) {
	  case AArch64::STGloop:
	    MI.setDesc(TII->get(AArch64::STGloop_wback));
	    return MI.getOperand(1).getReg();
	  case AArch64::STZGloop:
	    MI.setDesc(TII->get(AArch64::STZGloop_wback));
	    return MI.getOperand(1).getReg();
	  default:
	    break;
	  }

	  // Every other instruction gets a fresh GPR64 virtual register.
	  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
	  return MRI.createVirtualRegister(&AArch64::GPR64RegClass);
	}

	/// Replaces the frame-index operand \p FIOperandNum of the instruction at
	/// \p II with a concrete register (plus offset where the instruction can
	/// hold one).
	///
	/// Handled in order:
	///  - DBG_VALUE / STACKMAP / PATCHPOINT: the operand becomes the resolved
	///    frame register and the following immediate operand receives the byte
	///    offset.
	///  - LOCAL_ESCAPE: the operand becomes an immediate offset.
	///  - TAGPstack, tagged frame indices and ordinary frame indices: a frame
	///    register and offset are computed and folded into the instruction via
	///    rewriteAArch64FrameIndex; whatever cannot be folded is materialized
	///    into a scratch register that then replaces the operand.
	///
	/// \param SPAdj must be 0 (asserted).
	/// \param RS    optional scavenger; only used to assert that an emergency
	///              spill slot is never the out-of-range frame index.
	void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
	                                              int SPAdj, unsigned FIOperandNum,
	                                              RegScavenger *RS) const {
	  assert(SPAdj == 0 && "Unexpected");

	  MachineInstr &MI = *II;
	  MachineBasicBlock &MBB = *MI.getParent();
	  MachineFunction &MF = *MBB.getParent();
	  const MachineFrameInfo &MFI = MF.getFrameInfo();
	  const AArch64InstrInfo *TII =
	      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
	  const AArch64FrameLowering *TFI = getFrameLowering(MF);

	  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
	  bool Tagged =
	      MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED;
	  Register FrameReg;

	  // Special handling of dbg_value, stackmap and patchpoint instructions.
	  if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
	      MI.getOpcode() == TargetOpcode::PATCHPOINT) {
	    StackOffset Offset =
	        TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
	                                        /*PreferFP=*/true,
	                                        /*ForSimm=*/false);
	    Offset += StackOffset(MI.getOperand(FIOperandNum + 1).getImm(), MVT::i8);
	    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
	    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getBytes());
	    return;
	  }

	  if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) {
	    MachineOperand &FI = MI.getOperand(FIOperandNum);
	    int Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex);
	    FI.ChangeToImmediate(Offset);
	    return;
	  }

	  StackOffset Offset;
	  if (MI.getOpcode() == AArch64::TAGPstack) {
	    // TAGPstack must use the virtual frame register in its 3rd operand.
	    const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	    FrameReg = MI.getOperand(3).getReg();
	    Offset = {MFI.getObjectOffset(FrameIndex) +
	                  AFI->getTaggedBasePointerOffset(),
	              MVT::i8};
	  } else if (Tagged) {
	    StackOffset SPOffset = {
	        MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(), MVT::i8};
	    if (MFI.hasVarSizedObjects() ||
	        isAArch64FrameOffsetLegal(MI, SPOffset, nullptr, nullptr, nullptr) !=
	            (AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal)) {
	      // Can't update to SP + offset in place. Precalculate the tagged pointer
	      // in a scratch register.
	      Offset = TFI->resolveFrameIndexReference(
	          MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
	      Register ScratchReg =
	          MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
	      emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset,
	                      TII);
	      BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg)
	          .addReg(ScratchReg)
	          .addReg(ScratchReg)
	          .addImm(0);
	      MI.getOperand(FIOperandNum)
	          .ChangeToRegister(ScratchReg, false, false, true);
	      return;
	    }
	    FrameReg = AArch64::SP;
	    Offset = {MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(),
	              MVT::i8};
	  } else {
	    Offset = TFI->resolveFrameIndexReference(
	        MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
	  }

	  // Modify MI as necessary to handle as much of 'Offset' as possible
	  if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
	    return;

	  assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) &&
	         "Emergency spill slot is out of reach");

	  // If we get here, the immediate doesn't fit into the instruction. We folded
	  // as much as possible above. Handle the rest, providing a register that is
	  // SP+LargeImm.
	  Register ScratchReg = createScratchRegisterForInstruction(MI, TII);
	  emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
	  MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
	}

	/// Returns the register pressure limit for register class \p RC in \p MF.
	///
	/// GPR classes: 32 minus one for XZR/SP, minus one if W29/FP is unavailable
	/// (frame pointer in use or Darwin), minus the number of X registers the
	/// subtarget reserves, minus one if a base pointer is used. FPR and vector
	/// tuple classes: 32. The "_lo" FPR classes: 16. Any other class: 0.
	unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
	                                                  MachineFunction &MF) const {
	  const AArch64FrameLowering *TFI = getFrameLowering(MF);

	  switch (RC->getID()) {
	  default:
	    return 0;
	  case AArch64::GPR32RegClassID:
	  case AArch64::GPR32spRegClassID:
	  case AArch64::GPR32allRegClassID:
	  case AArch64::GPR64spRegClassID:
	  case AArch64::GPR64allRegClassID:
	  case AArch64::GPR64RegClassID:
	  case AArch64::GPR32commonRegClassID:
	  case AArch64::GPR64commonRegClassID:
	    return 32 - 1                                   // XZR/SP
	           - (TFI->hasFP(MF) || TT.isOSDarwin())    // FP
	           - MF.getSubtarget<AArch64Subtarget>().getNumXRegisterReserved()
	           - hasBasePointer(MF);                    // X19
	  case AArch64::FPR8RegClassID:
	  case AArch64::FPR16RegClassID:
	  case AArch64::FPR32RegClassID:
	  case AArch64::FPR64RegClassID:
	  case AArch64::FPR128RegClassID:
	    return 32;

	  case AArch64::DDRegClassID:
	  case AArch64::DDDRegClassID:
	  case AArch64::DDDDRegClassID:
	  case AArch64::QQRegClassID:
	  case AArch64::QQQRegClassID:
	  case AArch64::QQQQRegClassID:
	    return 32;

	  case AArch64::FPR128_loRegClassID:
	  case AArch64::FPR64_loRegClassID:
	  case AArch64::FPR16_loRegClassID:
	    return 16;
	  }
	}

	/// Returns SP when \p MF has neither EH funclets nor variable sized objects;
	/// otherwise the base register if the stack needs realignment, else the
	/// frame register.
	unsigned AArch64RegisterInfo::getLocalAddressRegister(
	    const MachineFunction &MF) const {
	  const auto &FrameInfo = MF.getFrameInfo();
	  const bool HasDynamicFrame =
	      MF.hasEHFunclets() || FrameInfo.hasVarSizedObjects();

	  if (!HasDynamicFrame)
	    return AArch64::SP;
	  if (needsStackRealignment(MF))
	    return getBaseRegister();
	  return getFrameRegister(MF);
	}
	diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
	index 22a8ba76c611..7b20f181e76d 100644
	--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
	+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
	@@ -1,129 +1,132 @@
	//==- AArch64RegisterInfo.h - AArch64 Register Information Impl --*- C++ -*-==//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the AArch64 implementation of the MRegisterInfo class.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H
	#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H

	#define GET_REGINFO_HEADER
	#include "AArch64GenRegisterInfo.inc"

	namespace llvm {

	class MachineFunction;
	class RegScavenger;
	class TargetRegisterClass;
	class Triple;

	class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
	const Triple &TT;

	public:
	AArch64RegisterInfo(const Triple &TT);

	// FIXME: This should be tablegen'd like getDwarfRegNum is
	int getSEHRegNum(unsigned i) const {
	return getEncodingValue(i);
	}

	bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const;
	bool isAnyArgRegReserved(const MachineFunction &MF) const;
	void emitReservedArgRegCallError(const MachineFunction &MF) const;

	void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const;
	void UpdateCustomCallPreservedMask(MachineFunction &MF,
	const uint32_t **Mask) const;

	+ static bool hasSVEArgsOrReturn(const MachineFunction *MF);
	+
	/// Code Generation virtual methods...
	const MCPhysReg getCalleeSavedRegs(const MachineFunction MF) const override;
	const MCPhysReg getDarwinCalleeSavedRegs(const MachineFunction MF) const;
	const MCPhysReg *
	getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
	const uint32_t *getCallPreservedMask(const MachineFunction &MF,
	CallingConv::ID) const override;
	const uint32_t *getDarwinCallPreservedMask(const MachineFunction &MF,
	CallingConv::ID) const;

	unsigned getCSRFirstUseCost() const override {
	// The cost will be compared against BlockFrequency where entry has the
	// value of 1 << 14. A value of 5 will choose to spill or split really
	// cold path instead of using a callee-saved register.
	return 5;
	}

	const TargetRegisterClass *
	getSubClassWithSubReg(const TargetRegisterClass *RC,
	unsigned Idx) const override;

	// Calls involved in thread-local variable lookup save more registers than
	// normal calls, so they need a different mask to represent this.
	const uint32_t *getTLSCallPreservedMask() const;

	// Funclets on ARM64 Windows don't preserve any registers.
	const uint32_t *getNoPreservedMask() const override;

	/// getThisReturnPreservedMask - Returns a call preserved mask specific to the
	/// case that 'returned' is on an i64 first argument if the calling convention
	/// is one that can (partially) model this attribute with a preserved mask
	/// (i.e. it is a calling convention that uses the same register for the first
	/// i64 argument and an i64 return value)
	///
	/// Should return NULL in the case that the calling convention does not have
	/// this property
	const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF,
	CallingConv::ID) const;

	/// Stack probing calls preserve different CSRs to the normal CC.
	const uint32_t *getWindowsStackProbePreservedMask() const;

	BitVector getReservedRegs(const MachineFunction &MF) const override;
	bool isAsmClobberable(const MachineFunction &MF,
	MCRegister PhysReg) const override;
	bool isConstantPhysReg(MCRegister PhysReg) const override;
	const TargetRegisterClass *
	getPointerRegClass(const MachineFunction &MF,
	unsigned Kind = 0) const override;
	const TargetRegisterClass *
	getCrossCopyRegClass(const TargetRegisterClass *RC) const override;

	bool requiresRegisterScavenging(const MachineFunction &MF) const override;
	bool useFPForScavengingIndex(const MachineFunction &MF) const override;
	bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;

	bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
	bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg,
	int64_t Offset) const override;
	void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg,
	int FrameIdx,
	int64_t Offset) const override;
	void resolveFrameIndex(MachineInstr &MI, Register BaseReg,
	int64_t Offset) const override;
	void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
	unsigned FIOperandNum,
	RegScavenger *RS = nullptr) const override;
	bool cannotEliminateFrame(const MachineFunction &MF) const;

	bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
	bool hasBasePointer(const MachineFunction &MF) const;
	unsigned getBaseRegister() const;

	// Debug information queries.
	Register getFrameRegister(const MachineFunction &MF) const override;

	unsigned getRegPressureLimit(const TargetRegisterClass *RC,
	MachineFunction &MF) const override;

	unsigned getLocalAddressRegister(const MachineFunction &MF) const;
	+ bool regNeedsCFI(unsigned Reg, unsigned &RegToUseForCFI) const;
	};

	} // end namespace llvm

	#endif
	diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
	index bd05c56009a1..54b351fda053 100644
	--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
	+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
	@@ -1,1131 +1,1134 @@
	//=- AArch64RegisterInfo.td - Describe the AArch64 Registers -*- tablegen -*-=//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	//
	//===----------------------------------------------------------------------===//


	// Base class for every AArch64 register record.
	//   enc      - 16-bit value stored in HWEncoding.
	//   n        - register name, forwarded to Register.
	//   subregs  - sub-register list stored in SubRegs (empty by default).
	//   altNames - alternative names, forwarded to Register (empty by default).
	// All records created through this class live in the "AArch64" namespace.
	class AArch64Reg<bits<16> enc, string n, list<Register> subregs = [],
	list<string> altNames = []>
	: Register<n, altNames> {
	let HWEncoding = enc;
	let Namespace = "AArch64";
	let SubRegs = subregs;
	}

	// Sub-register index definitions, all placed in the "AArch64" namespace.
	// NOTE(review): the SubRegIndex template argument is presumably the size of
	// the sub-register in bits; confirm against Target.td.
	let Namespace = "AArch64" in {
	def sub_32 : SubRegIndex<32>;

	def bsub : SubRegIndex<8>;
	def hsub : SubRegIndex<16>;
	def ssub : SubRegIndex<32>;
	def dsub : SubRegIndex<32>;
	def sube32 : SubRegIndex<32>;
	def subo32 : SubRegIndex<32>;
	def qhisub : SubRegIndex<64>;
	def qsub : SubRegIndex<64>;
	def sube64 : SubRegIndex<64>;
	def subo64 : SubRegIndex<64>;
	// SVE
	def zsub : SubRegIndex<128>;
	// Note: zsub_hi should never be used directly because it represents
	// the scalable part of the SVE vector and cannot be manipulated as a
	// subvector in the same way the lower 128bits can.
	def zsub_hi : SubRegIndex<128>;
	// Note: Code depends on these having consecutive numbers
	def dsub0 : SubRegIndex<64>;
	def dsub1 : SubRegIndex<64>;
	def dsub2 : SubRegIndex<64>;
	def dsub3 : SubRegIndex<64>;
	// Note: Code depends on these having consecutive numbers
	def qsub0 : SubRegIndex<128>;
	def qsub1 : SubRegIndex<128>;
	def qsub2 : SubRegIndex<128>;
	def qsub3 : SubRegIndex<128>;
	}

	// Alternative register-name indices (vreg and vlist1), defined in the
	// "AArch64" namespace.
	let Namespace = "AArch64" in {
	def vreg : RegAltNameIndex;
	def vlist1 : RegAltNameIndex;
	}

	//===----------------------------------------------------------------------===//
	// Registers
	//===----------------------------------------------------------------------===//
	// W registers. W0-W30 each use their own index as both hardware encoding
	// and DWARF register number. WSP and WZR share hardware encoding 31; WSP
	// has DWARF number 31 and WZR is declared as a DWARF alias of WSP.
	def W0 : AArch64Reg<0, "w0" >, DwarfRegNum<[0]>;
	def W1 : AArch64Reg<1, "w1" >, DwarfRegNum<[1]>;
	def W2 : AArch64Reg<2, "w2" >, DwarfRegNum<[2]>;
	def W3 : AArch64Reg<3, "w3" >, DwarfRegNum<[3]>;
	def W4 : AArch64Reg<4, "w4" >, DwarfRegNum<[4]>;
	def W5 : AArch64Reg<5, "w5" >, DwarfRegNum<[5]>;
	def W6 : AArch64Reg<6, "w6" >, DwarfRegNum<[6]>;
	def W7 : AArch64Reg<7, "w7" >, DwarfRegNum<[7]>;
	def W8 : AArch64Reg<8, "w8" >, DwarfRegNum<[8]>;
	def W9 : AArch64Reg<9, "w9" >, DwarfRegNum<[9]>;
	def W10 : AArch64Reg<10, "w10">, DwarfRegNum<[10]>;
	def W11 : AArch64Reg<11, "w11">, DwarfRegNum<[11]>;
	def W12 : AArch64Reg<12, "w12">, DwarfRegNum<[12]>;
	def W13 : AArch64Reg<13, "w13">, DwarfRegNum<[13]>;
	def W14 : AArch64Reg<14, "w14">, DwarfRegNum<[14]>;
	def W15 : AArch64Reg<15, "w15">, DwarfRegNum<[15]>;
	def W16 : AArch64Reg<16, "w16">, DwarfRegNum<[16]>;
	def W17 : AArch64Reg<17, "w17">, DwarfRegNum<[17]>;
	def W18 : AArch64Reg<18, "w18">, DwarfRegNum<[18]>;
	def W19 : AArch64Reg<19, "w19">, DwarfRegNum<[19]>;
	def W20 : AArch64Reg<20, "w20">, DwarfRegNum<[20]>;
	def W21 : AArch64Reg<21, "w21">, DwarfRegNum<[21]>;
	def W22 : AArch64Reg<22, "w22">, DwarfRegNum<[22]>;
	def W23 : AArch64Reg<23, "w23">, DwarfRegNum<[23]>;
	def W24 : AArch64Reg<24, "w24">, DwarfRegNum<[24]>;
	def W25 : AArch64Reg<25, "w25">, DwarfRegNum<[25]>;
	def W26 : AArch64Reg<26, "w26">, DwarfRegNum<[26]>;
	def W27 : AArch64Reg<27, "w27">, DwarfRegNum<[27]>;
	def W28 : AArch64Reg<28, "w28">, DwarfRegNum<[28]>;
	def W29 : AArch64Reg<29, "w29">, DwarfRegNum<[29]>;
	def W30 : AArch64Reg<30, "w30">, DwarfRegNum<[30]>;
	def WSP : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>;
	def WZR : AArch64Reg<31, "wzr">, DwarfRegAlias<WSP>;

	// X registers. Each one lists the same-numbered W register as its only
	// sub-register (reached through sub_32) and aliases that W register's DWARF
	// number. Encodings 29 and 30 are defined under the record names FP and LR
	// (assembly names "x29" and "x30"). SP and XZR share encoding 31 and both
	// alias WSP's DWARF number.
	let SubRegIndices = [sub_32] in {
	def X0 : AArch64Reg<0, "x0", [W0]>, DwarfRegAlias<W0>;
	def X1 : AArch64Reg<1, "x1", [W1]>, DwarfRegAlias<W1>;
	def X2 : AArch64Reg<2, "x2", [W2]>, DwarfRegAlias<W2>;
	def X3 : AArch64Reg<3, "x3", [W3]>, DwarfRegAlias<W3>;
	def X4 : AArch64Reg<4, "x4", [W4]>, DwarfRegAlias<W4>;
	def X5 : AArch64Reg<5, "x5", [W5]>, DwarfRegAlias<W5>;
	def X6 : AArch64Reg<6, "x6", [W6]>, DwarfRegAlias<W6>;
	def X7 : AArch64Reg<7, "x7", [W7]>, DwarfRegAlias<W7>;
	def X8 : AArch64Reg<8, "x8", [W8]>, DwarfRegAlias<W8>;
	def X9 : AArch64Reg<9, "x9", [W9]>, DwarfRegAlias<W9>;
	def X10 : AArch64Reg<10, "x10", [W10]>, DwarfRegAlias<W10>;
	def X11 : AArch64Reg<11, "x11", [W11]>, DwarfRegAlias<W11>;
	def X12 : AArch64Reg<12, "x12", [W12]>, DwarfRegAlias<W12>;
	def X13 : AArch64Reg<13, "x13", [W13]>, DwarfRegAlias<W13>;
	def X14 : AArch64Reg<14, "x14", [W14]>, DwarfRegAlias<W14>;
	def X15 : AArch64Reg<15, "x15", [W15]>, DwarfRegAlias<W15>;
	def X16 : AArch64Reg<16, "x16", [W16]>, DwarfRegAlias<W16>;
	def X17 : AArch64Reg<17, "x17", [W17]>, DwarfRegAlias<W17>;
	def X18 : AArch64Reg<18, "x18", [W18]>, DwarfRegAlias<W18>;
	def X19 : AArch64Reg<19, "x19", [W19]>, DwarfRegAlias<W19>;
	def X20 : AArch64Reg<20, "x20", [W20]>, DwarfRegAlias<W20>;
	def X21 : AArch64Reg<21, "x21", [W21]>, DwarfRegAlias<W21>;
	def X22 : AArch64Reg<22, "x22", [W22]>, DwarfRegAlias<W22>;
	def X23 : AArch64Reg<23, "x23", [W23]>, DwarfRegAlias<W23>;
	def X24 : AArch64Reg<24, "x24", [W24]>, DwarfRegAlias<W24>;
	def X25 : AArch64Reg<25, "x25", [W25]>, DwarfRegAlias<W25>;
	def X26 : AArch64Reg<26, "x26", [W26]>, DwarfRegAlias<W26>;
	def X27 : AArch64Reg<27, "x27", [W27]>, DwarfRegAlias<W27>;
	def X28 : AArch64Reg<28, "x28", [W28]>, DwarfRegAlias<W28>;
	def FP : AArch64Reg<29, "x29", [W29]>, DwarfRegAlias<W29>;
	def LR : AArch64Reg<30, "x30", [W30]>, DwarfRegAlias<W30>;
	def SP : AArch64Reg<31, "sp", [WSP]>, DwarfRegAlias<WSP>;
	def XZR : AArch64Reg<31, "xzr", [WZR]>, DwarfRegAlias<WSP>;
	}

	// Condition code register. It is given hardware encoding 0 and no DWARF
	// register number.
	def NZCV : AArch64Reg<0, "nzcv">;

	// First fault status register (hardware encoding 0, DWARF register 47).
	def FFR : AArch64Reg<0, "ffr">, DwarfRegNum<[47]>;

	+// Purely virtual Vector Granule (VG) Dwarf register
	+def VG : AArch64Reg<0, "vg">, DwarfRegNum<[46]>;
	+
	// GPR register classes with the intersections of GPR32/GPR32sp and
	// GPR64/GPR64sp for use by the coalescer.
	// GPR32common holds W0-W30; GPR64common holds X0-X28 plus FP and LR.
	// Each class declares one alternative allocation order (itself rotated left
	// by 8) and its AltOrderSelect body always returns 1.
	def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> {
	let AltOrders = [(rotl GPR32common, 8)];
	let AltOrderSelect = [{ return 1; }];
	}
	def GPR64common : RegisterClass<"AArch64", [i64], 64,
	(add (sequence "X%u", 0, 28), FP, LR)> {
	let AltOrders = [(rotl GPR64common, 8)];
	let AltOrderSelect = [{ return 1; }];
	}
	// GPR register classes which exclude SP/WSP.
	// Each is the matching "common" class plus the zero register (WZR / XZR),
	// with the same rotate-by-8 alternative order always selected.
	def GPR32 : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR)> {
	let AltOrders = [(rotl GPR32, 8)];
	let AltOrderSelect = [{ return 1; }];
	}
	def GPR64 : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR)> {
	let AltOrders = [(rotl GPR64, 8)];
	let AltOrderSelect = [{ return 1; }];
	}

	// GPR register classes which include SP/WSP.
	// Each is the matching common class plus the stack pointer (WSP / SP); the
	// zero register is not a member. Same rotate-by-8 alternative order as the
	// other GPR classes.
	def GPR32sp : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WSP)> {
	let AltOrders = [(rotl GPR32sp, 8)];
	let AltOrderSelect = [{ return 1; }];
	}
	def GPR64sp : RegisterClass<"AArch64", [i64], 64, (add GPR64common, SP)> {
	let AltOrders = [(rotl GPR64sp, 8)];
	let AltOrderSelect = [{ return 1; }];
	}

	// Single-member register classes containing only the stack pointer
	// (WSP for 32-bit, SP for 64-bit).
	def GPR32sponly : RegisterClass<"AArch64", [i32], 32, (add WSP)>;
	def GPR64sponly : RegisterClass<"AArch64", [i64], 64, (add SP)>;

	// Assembler operand class used by GPR64sp0. The operand is validated with
	// isGPR64 instantiated on the GPR64sp register class ID, parsed by the custom
	// tryParseGPR64sp0Operand method, and rendered with addRegOperands.
	// NOTE(review): the name suggests the parser accepts a GPR64sp register
	// optionally followed by a zero immediate -- confirm in the asm parser.
	def GPR64spPlus0Operand : AsmOperandClass {
	let Name = "GPR64sp0";
	let RenderMethod = "addRegOperands";
	let PredicateMethod = "isGPR64<AArch64::GPR64spRegClassID>";
	let ParserMethod = "tryParseGPR64sp0Operand";
	}

	// Register operand over GPR64sp that is matched through the class above.
	def GPR64sp0 : RegisterOperand<GPR64sp> {
	let ParserMatchClass = GPR64spPlus0Operand;
	}

	// GPR32/GPR64 but with zero-register substitution enabled.
	// GIZeroRegister names the register to substitute: WZR for the 32-bit operand
	// and XZR for the 64-bit operand.
	// TODO: Roll this out to GPR32/GPR64/GPR32all/GPR64all.
	def GPR32z : RegisterOperand<GPR32> {
	let GIZeroRegister = WZR;
	}
	def GPR64z : RegisterOperand<GPR64> {
	let GIZeroRegister = XZR;
	}

	// GPR argument registers: W0-W7 for 32-bit values and X0-X7 for 64-bit values.
	def GPR32arg : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 7)>;
	def GPR64arg : RegisterClass<"AArch64", [i64], 64, (sequence "X%u", 0, 7)>;

	// GPR register classes which include WZR/XZR AND SP/WSP. This is not a
	// constraint used by any instruction; it is used as a common super-class.
	// Members: the matching common class plus both the zero register and the
	// stack pointer.
	def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>;
	def GPR64all : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR, SP)>;

	// For tail calls, we can't use callee-saved registers, as they are restored
	// to the saved value before the tail call, which would clobber a call address.
	// This is for indirect tail calls to store the address of the destination.
	// Removing X19-X28, FP and LR from GPR64common leaves X0-X18.
	def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X21,
	X22, X23, X24, X25, X26,
	X27, X28, FP, LR)>;

	// Restricted set of tail call registers, for use when branch target
	// enforcement is enabled. These are the only registers which can be used to
	// indirectly branch (not call) to the "BTI c" instruction at the start of a
	// BTI-protected function.
	// The class contains exactly X16 and X17.
	def rtcGPR64 : RegisterClass<"AArch64", [i64], 64, (add X16, X17)>;

	// Register set that excludes registers that are reserved for procedure calls.
	// This is used for pseudo-instructions that are actually implemented using a
	// procedure call.
	// Concretely: GPR64 (which includes XZR) with X16, X17 and LR removed.
	def GPR64noip : RegisterClass<"AArch64", [i64], 64, (sub GPR64, X16, X17, LR)>;

	// GPR register classes for post increment amount of vector load/store that
	// has alternate printing when Rm=31 and prints a constant immediate value
	// equal to the total number of bytes transferred.
	//
	// Every operand below wraps GPR64 and differs only in the template argument
	// passed to printPostIncOperand, which equals the numeric suffix of the
	// operand's name (GPR64pi<N> -> printPostIncOperand<N>).

	// FIXME: TableGen should be able to do these itself now. There appears to be
	// a bug in counting how many operands a Post-indexed MCInst should have which
	// means the aliases don't trigger.
	def GPR64pi1 : RegisterOperand<GPR64, "printPostIncOperand<1>">;
	def GPR64pi2 : RegisterOperand<GPR64, "printPostIncOperand<2>">;
	def GPR64pi3 : RegisterOperand<GPR64, "printPostIncOperand<3>">;
	def GPR64pi4 : RegisterOperand<GPR64, "printPostIncOperand<4>">;
	def GPR64pi6 : RegisterOperand<GPR64, "printPostIncOperand<6>">;
	def GPR64pi8 : RegisterOperand<GPR64, "printPostIncOperand<8>">;
	def GPR64pi12 : RegisterOperand<GPR64, "printPostIncOperand<12>">;
	def GPR64pi16 : RegisterOperand<GPR64, "printPostIncOperand<16>">;
	def GPR64pi24 : RegisterOperand<GPR64, "printPostIncOperand<24>">;
	def GPR64pi32 : RegisterOperand<GPR64, "printPostIncOperand<32>">;
	def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand<48>">;
	def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand<64>">;

	// Condition code regclass.
	// Contains only NZCV, typed as i32.
	def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
	let CopyCost = -1; // Don't allow copying of status registers.

	// CCR is not allocatable.
	let isAllocatable = 0;
	}

	//===----------------------------------------------------------------------===//
	// Floating Point Scalar Registers
	//===----------------------------------------------------------------------===//

	// Byte-named FP/SIMD registers b0-b31. For each Bn the first AArch64Reg
	// argument is n and the DWARF register number is 64 + n. The wider H/S/D/Q
	// registers in this file reuse these DWARF numbers through DwarfRegAlias<Bn>.
	def B0 : AArch64Reg<0, "b0">, DwarfRegNum<[64]>;
	def B1 : AArch64Reg<1, "b1">, DwarfRegNum<[65]>;
	def B2 : AArch64Reg<2, "b2">, DwarfRegNum<[66]>;
	def B3 : AArch64Reg<3, "b3">, DwarfRegNum<[67]>;
	def B4 : AArch64Reg<4, "b4">, DwarfRegNum<[68]>;
	def B5 : AArch64Reg<5, "b5">, DwarfRegNum<[69]>;
	def B6 : AArch64Reg<6, "b6">, DwarfRegNum<[70]>;
	def B7 : AArch64Reg<7, "b7">, DwarfRegNum<[71]>;
	def B8 : AArch64Reg<8, "b8">, DwarfRegNum<[72]>;
	def B9 : AArch64Reg<9, "b9">, DwarfRegNum<[73]>;
	def B10 : AArch64Reg<10, "b10">, DwarfRegNum<[74]>;
	def B11 : AArch64Reg<11, "b11">, DwarfRegNum<[75]>;
	def B12 : AArch64Reg<12, "b12">, DwarfRegNum<[76]>;
	def B13 : AArch64Reg<13, "b13">, DwarfRegNum<[77]>;
	def B14 : AArch64Reg<14, "b14">, DwarfRegNum<[78]>;
	def B15 : AArch64Reg<15, "b15">, DwarfRegNum<[79]>;
	def B16 : AArch64Reg<16, "b16">, DwarfRegNum<[80]>;
	def B17 : AArch64Reg<17, "b17">, DwarfRegNum<[81]>;
	def B18 : AArch64Reg<18, "b18">, DwarfRegNum<[82]>;
	def B19 : AArch64Reg<19, "b19">, DwarfRegNum<[83]>;
	def B20 : AArch64Reg<20, "b20">, DwarfRegNum<[84]>;
	def B21 : AArch64Reg<21, "b21">, DwarfRegNum<[85]>;
	def B22 : AArch64Reg<22, "b22">, DwarfRegNum<[86]>;
	def B23 : AArch64Reg<23, "b23">, DwarfRegNum<[87]>;
	def B24 : AArch64Reg<24, "b24">, DwarfRegNum<[88]>;
	def B25 : AArch64Reg<25, "b25">, DwarfRegNum<[89]>;
	def B26 : AArch64Reg<26, "b26">, DwarfRegNum<[90]>;
	def B27 : AArch64Reg<27, "b27">, DwarfRegNum<[91]>;
	def B28 : AArch64Reg<28, "b28">, DwarfRegNum<[92]>;
	def B29 : AArch64Reg<29, "b29">, DwarfRegNum<[93]>;
	def B30 : AArch64Reg<30, "b30">, DwarfRegNum<[94]>;
	def B31 : AArch64Reg<31, "b31">, DwarfRegNum<[95]>;

	// Half-named FP/SIMD registers h0-h31. Each Hn lists Bn as its only
	// sub-register (index bsub) and takes Bn's DWARF number via DwarfRegAlias.
	let SubRegIndices = [bsub] in {
	def H0 : AArch64Reg<0, "h0", [B0]>, DwarfRegAlias<B0>;
	def H1 : AArch64Reg<1, "h1", [B1]>, DwarfRegAlias<B1>;
	def H2 : AArch64Reg<2, "h2", [B2]>, DwarfRegAlias<B2>;
	def H3 : AArch64Reg<3, "h3", [B3]>, DwarfRegAlias<B3>;
	def H4 : AArch64Reg<4, "h4", [B4]>, DwarfRegAlias<B4>;
	def H5 : AArch64Reg<5, "h5", [B5]>, DwarfRegAlias<B5>;
	def H6 : AArch64Reg<6, "h6", [B6]>, DwarfRegAlias<B6>;
	def H7 : AArch64Reg<7, "h7", [B7]>, DwarfRegAlias<B7>;
	def H8 : AArch64Reg<8, "h8", [B8]>, DwarfRegAlias<B8>;
	def H9 : AArch64Reg<9, "h9", [B9]>, DwarfRegAlias<B9>;
	def H10 : AArch64Reg<10, "h10", [B10]>, DwarfRegAlias<B10>;
	def H11 : AArch64Reg<11, "h11", [B11]>, DwarfRegAlias<B11>;
	def H12 : AArch64Reg<12, "h12", [B12]>, DwarfRegAlias<B12>;
	def H13 : AArch64Reg<13, "h13", [B13]>, DwarfRegAlias<B13>;
	def H14 : AArch64Reg<14, "h14", [B14]>, DwarfRegAlias<B14>;
	def H15 : AArch64Reg<15, "h15", [B15]>, DwarfRegAlias<B15>;
	def H16 : AArch64Reg<16, "h16", [B16]>, DwarfRegAlias<B16>;
	def H17 : AArch64Reg<17, "h17", [B17]>, DwarfRegAlias<B17>;
	def H18 : AArch64Reg<18, "h18", [B18]>, DwarfRegAlias<B18>;
	def H19 : AArch64Reg<19, "h19", [B19]>, DwarfRegAlias<B19>;
	def H20 : AArch64Reg<20, "h20", [B20]>, DwarfRegAlias<B20>;
	def H21 : AArch64Reg<21, "h21", [B21]>, DwarfRegAlias<B21>;
	def H22 : AArch64Reg<22, "h22", [B22]>, DwarfRegAlias<B22>;
	def H23 : AArch64Reg<23, "h23", [B23]>, DwarfRegAlias<B23>;
	def H24 : AArch64Reg<24, "h24", [B24]>, DwarfRegAlias<B24>;
	def H25 : AArch64Reg<25, "h25", [B25]>, DwarfRegAlias<B25>;
	def H26 : AArch64Reg<26, "h26", [B26]>, DwarfRegAlias<B26>;
	def H27 : AArch64Reg<27, "h27", [B27]>, DwarfRegAlias<B27>;
	def H28 : AArch64Reg<28, "h28", [B28]>, DwarfRegAlias<B28>;
	def H29 : AArch64Reg<29, "h29", [B29]>, DwarfRegAlias<B29>;
	def H30 : AArch64Reg<30, "h30", [B30]>, DwarfRegAlias<B30>;
	def H31 : AArch64Reg<31, "h31", [B31]>, DwarfRegAlias<B31>;
	}

	// Single-named FP/SIMD registers s0-s31. Each Sn lists Hn as its only
	// sub-register (index hsub); the DWARF number is still aliased to Bn, not Hn.
	let SubRegIndices = [hsub] in {
	def S0 : AArch64Reg<0, "s0", [H0]>, DwarfRegAlias<B0>;
	def S1 : AArch64Reg<1, "s1", [H1]>, DwarfRegAlias<B1>;
	def S2 : AArch64Reg<2, "s2", [H2]>, DwarfRegAlias<B2>;
	def S3 : AArch64Reg<3, "s3", [H3]>, DwarfRegAlias<B3>;
	def S4 : AArch64Reg<4, "s4", [H4]>, DwarfRegAlias<B4>;
	def S5 : AArch64Reg<5, "s5", [H5]>, DwarfRegAlias<B5>;
	def S6 : AArch64Reg<6, "s6", [H6]>, DwarfRegAlias<B6>;
	def S7 : AArch64Reg<7, "s7", [H7]>, DwarfRegAlias<B7>;
	def S8 : AArch64Reg<8, "s8", [H8]>, DwarfRegAlias<B8>;
	def S9 : AArch64Reg<9, "s9", [H9]>, DwarfRegAlias<B9>;
	def S10 : AArch64Reg<10, "s10", [H10]>, DwarfRegAlias<B10>;
	def S11 : AArch64Reg<11, "s11", [H11]>, DwarfRegAlias<B11>;
	def S12 : AArch64Reg<12, "s12", [H12]>, DwarfRegAlias<B12>;
	def S13 : AArch64Reg<13, "s13", [H13]>, DwarfRegAlias<B13>;
	def S14 : AArch64Reg<14, "s14", [H14]>, DwarfRegAlias<B14>;
	def S15 : AArch64Reg<15, "s15", [H15]>, DwarfRegAlias<B15>;
	def S16 : AArch64Reg<16, "s16", [H16]>, DwarfRegAlias<B16>;
	def S17 : AArch64Reg<17, "s17", [H17]>, DwarfRegAlias<B17>;
	def S18 : AArch64Reg<18, "s18", [H18]>, DwarfRegAlias<B18>;
	def S19 : AArch64Reg<19, "s19", [H19]>, DwarfRegAlias<B19>;
	def S20 : AArch64Reg<20, "s20", [H20]>, DwarfRegAlias<B20>;
	def S21 : AArch64Reg<21, "s21", [H21]>, DwarfRegAlias<B21>;
	def S22 : AArch64Reg<22, "s22", [H22]>, DwarfRegAlias<B22>;
	def S23 : AArch64Reg<23, "s23", [H23]>, DwarfRegAlias<B23>;
	def S24 : AArch64Reg<24, "s24", [H24]>, DwarfRegAlias<B24>;
	def S25 : AArch64Reg<25, "s25", [H25]>, DwarfRegAlias<B25>;
	def S26 : AArch64Reg<26, "s26", [H26]>, DwarfRegAlias<B26>;
	def S27 : AArch64Reg<27, "s27", [H27]>, DwarfRegAlias<B27>;
	def S28 : AArch64Reg<28, "s28", [H28]>, DwarfRegAlias<B28>;
	def S29 : AArch64Reg<29, "s29", [H29]>, DwarfRegAlias<B29>;
	def S30 : AArch64Reg<30, "s30", [H30]>, DwarfRegAlias<B30>;
	def S31 : AArch64Reg<31, "s31", [H31]>, DwarfRegAlias<B31>;
	}

	// Double-named FP/SIMD registers d0-d31. Each Dn lists Sn as its only
	// sub-register (index ssub) and aliases Bn's DWARF number. Two alternate
	// names are supplied, one per RegAltNameIndices entry: "vN" for vreg and the
	// empty string for vlist1.
	let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in {
	def D0 : AArch64Reg<0, "d0", [S0], ["v0", ""]>, DwarfRegAlias<B0>;
	def D1 : AArch64Reg<1, "d1", [S1], ["v1", ""]>, DwarfRegAlias<B1>;
	def D2 : AArch64Reg<2, "d2", [S2], ["v2", ""]>, DwarfRegAlias<B2>;
	def D3 : AArch64Reg<3, "d3", [S3], ["v3", ""]>, DwarfRegAlias<B3>;
	def D4 : AArch64Reg<4, "d4", [S4], ["v4", ""]>, DwarfRegAlias<B4>;
	def D5 : AArch64Reg<5, "d5", [S5], ["v5", ""]>, DwarfRegAlias<B5>;
	def D6 : AArch64Reg<6, "d6", [S6], ["v6", ""]>, DwarfRegAlias<B6>;
	def D7 : AArch64Reg<7, "d7", [S7], ["v7", ""]>, DwarfRegAlias<B7>;
	def D8 : AArch64Reg<8, "d8", [S8], ["v8", ""]>, DwarfRegAlias<B8>;
	def D9 : AArch64Reg<9, "d9", [S9], ["v9", ""]>, DwarfRegAlias<B9>;
	def D10 : AArch64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias<B10>;
	def D11 : AArch64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias<B11>;
	def D12 : AArch64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias<B12>;
	def D13 : AArch64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias<B13>;
	def D14 : AArch64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias<B14>;
	def D15 : AArch64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias<B15>;
	def D16 : AArch64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias<B16>;
	def D17 : AArch64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias<B17>;
	def D18 : AArch64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias<B18>;
	def D19 : AArch64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias<B19>;
	def D20 : AArch64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias<B20>;
	def D21 : AArch64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias<B21>;
	def D22 : AArch64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias<B22>;
	def D23 : AArch64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias<B23>;
	def D24 : AArch64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias<B24>;
	def D25 : AArch64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias<B25>;
	def D26 : AArch64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias<B26>;
	def D27 : AArch64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias<B27>;
	def D28 : AArch64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias<B28>;
	def D29 : AArch64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias<B29>;
	def D30 : AArch64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias<B30>;
	def D31 : AArch64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias<B31>;
	}

	// Quad-named FP/SIMD registers q0-q31. Each Qn lists Dn as its only
	// sub-register (index dsub) and aliases Bn's DWARF number. As with the D
	// registers, the alternate names are "vN" (vreg) and the empty string
	// (vlist1).
	let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in {
	def Q0 : AArch64Reg<0, "q0", [D0], ["v0", ""]>, DwarfRegAlias<B0>;
	def Q1 : AArch64Reg<1, "q1", [D1], ["v1", ""]>, DwarfRegAlias<B1>;
	def Q2 : AArch64Reg<2, "q2", [D2], ["v2", ""]>, DwarfRegAlias<B2>;
	def Q3 : AArch64Reg<3, "q3", [D3], ["v3", ""]>, DwarfRegAlias<B3>;
	def Q4 : AArch64Reg<4, "q4", [D4], ["v4", ""]>, DwarfRegAlias<B4>;
	def Q5 : AArch64Reg<5, "q5", [D5], ["v5", ""]>, DwarfRegAlias<B5>;
	def Q6 : AArch64Reg<6, "q6", [D6], ["v6", ""]>, DwarfRegAlias<B6>;
	def Q7 : AArch64Reg<7, "q7", [D7], ["v7", ""]>, DwarfRegAlias<B7>;
	def Q8 : AArch64Reg<8, "q8", [D8], ["v8", ""]>, DwarfRegAlias<B8>;
	def Q9 : AArch64Reg<9, "q9", [D9], ["v9", ""]>, DwarfRegAlias<B9>;
	def Q10 : AArch64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias<B10>;
	def Q11 : AArch64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias<B11>;
	def Q12 : AArch64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias<B12>;
	def Q13 : AArch64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias<B13>;
	def Q14 : AArch64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias<B14>;
	def Q15 : AArch64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias<B15>;
	def Q16 : AArch64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias<B16>;
	def Q17 : AArch64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias<B17>;
	def Q18 : AArch64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias<B18>;
	def Q19 : AArch64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias<B19>;
	def Q20 : AArch64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias<B20>;
	def Q21 : AArch64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias<B21>;
	def Q22 : AArch64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias<B22>;
	def Q23 : AArch64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias<B23>;
	def Q24 : AArch64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias<B24>;
	def Q25 : AArch64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias<B25>;
	def Q26 : AArch64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias<B26>;
	def Q27 : AArch64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias<B27>;
	def Q28 : AArch64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias<B28>;
	def Q29 : AArch64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias<B29>;
	def Q30 : AArch64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias<B30>;
	def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
	}

	// Scalar FP/SIMD register classes, one per register width (B, H, S, D).
	// FPR8 carries no value type (untyped); FPR8 and FPR16 set Size explicitly.
	def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> {
	let Size = 8;
	}
	def FPR16 : RegisterClass<"AArch64", [f16, bf16], 16, (sequence "H%u", 0, 31)> {
	let Size = 16;
	}

	// First 16 registers of FPR16 (H0-H15).
	// NOTE(review): only f16 is listed here whereas FPR16 also lists bf16 --
	// confirm that the omission is intentional.
	def FPR16_lo : RegisterClass<"AArch64", [f16], 16, (trunc FPR16, 16)> {
	let Size = 16;
	}
	def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
	def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
	v1i64, v4f16, v4bf16],
	64, (sequence "D%u", 0, 31)>;
	// First 16 registers of FPR64 (D0-D15); lists only the vector types of FPR64,
	// not the scalar f64/i64.
	def FPR64_lo : RegisterClass<"AArch64",
	[v8i8, v4i16, v2i32, v1i64, v4f16, v4bf16, v2f32,
	v1f64],
	64, (trunc FPR64, 16)>;

	// We don't (yet) have an f128 legal type, so don't use that here. We
	// normalize 128-bit vectors to v2f64 for arg passing and such, so use
	// that here.
	// NOTE(review): the type list below does include f128, so the first sentence
	// above appears to be stale -- confirm and update.
	def FPR128 : RegisterClass<"AArch64",
	[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128,
	v8f16, v8bf16],
	128, (sequence "Q%u", 0, 31)>;

	// The lower 16 vector registers. Some instructions can only take registers
	// in this range.
	// Unlike FPR128, the type list here does not include f128.
	def FPR128_lo : RegisterClass<"AArch64",
	[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16,
	v8bf16],
	128, (trunc FPR128, 16)>;

	// Pairs, triples, and quads of 64-bit vector registers.
	// Tuple element k is taken from FPR64 rotated left by k, so a tuple starting
	// at Dn holds Dn, Dn+1, ... in sub-register slots dsub0, dsub1, ...
	// Because rotl wraps around, tuples starting near D31 continue at D0.
	def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>;
	def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2],
	[(rotl FPR64, 0), (rotl FPR64, 1),
	(rotl FPR64, 2)]>;
	def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3],
	[(rotl FPR64, 0), (rotl FPR64, 1),
	(rotl FPR64, 2), (rotl FPR64, 3)]>;
	// Untyped classes over the tuples above; Size is 64 times the tuple length
	// (128, 192, 256).
	def DD : RegisterClass<"AArch64", [untyped], 64, (add DSeqPairs)> {
	let Size = 128;
	}
	def DDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqTriples)> {
	let Size = 192;
	}
	def DDDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqQuads)> {
	let Size = 256;
	}

	// Pairs, triples, and quads of 128-bit vector registers.
	// Built like the 64-bit tuples: element k comes from FPR128 rotated left by
	// k, filling sub-register slots qsub0, qsub1, ...; rotl wraps, so tuples
	// starting near Q31 continue at Q0.
	def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>;
	def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2],
	[(rotl FPR128, 0), (rotl FPR128, 1),
	(rotl FPR128, 2)]>;
	def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3],
	[(rotl FPR128, 0), (rotl FPR128, 1),
	(rotl FPR128, 2), (rotl FPR128, 3)]>;
	// Untyped classes over the tuples above; Size is 128 times the tuple length
	// (256, 384, 512).
	def QQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqPairs)> {
	let Size = 256;
	}
	def QQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqTriples)> {
	let Size = 384;
	}
	def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> {
	let Size = 512;
	}


	// Vector operand versions of the FP registers. Alternate name printing and
	// assembler matching.
	// The 64-bit and 128-bit operand classes have distinct names but share the
	// same predicate, isNeonVectorReg.
	def VectorReg64AsmOperand : AsmOperandClass {
	let Name = "VectorReg64";
	let PredicateMethod = "isNeonVectorReg";
	}
	def VectorReg128AsmOperand : AsmOperandClass {
	let Name = "VectorReg128";
	let PredicateMethod = "isNeonVectorReg";
	}

	// FPR64 printed through printVRegOperand and matched as VectorReg64.
	def V64 : RegisterOperand<FPR64, "printVRegOperand"> {
	let ParserMatchClass = VectorReg64AsmOperand;
	}

	// FPR128 printed through printVRegOperand and matched as VectorReg128.
	def V128 : RegisterOperand<FPR128, "printVRegOperand"> {
	let ParserMatchClass = VectorReg128AsmOperand;
	}

	// Operand class for the restricted "_lo" register classes. A single class is
	// shared by both the 64-bit and the 128-bit operand below.
	def VectorRegLoAsmOperand : AsmOperandClass {
	let Name = "VectorRegLo";
	let PredicateMethod = "isNeonVectorRegLo";
	}
	def V64_lo : RegisterOperand<FPR64_lo, "printVRegOperand"> {
	let ParserMatchClass = VectorRegLoAsmOperand;
	}
	def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
	let ParserMatchClass = VectorRegLoAsmOperand;
	}

	// Assembler operand class for a vector list with an explicit element type.
	//   count   - number of registers in the list
	//   vecty   - text spliced in as the first template argument of
	//             addVectorListOperands
	//   lanes   - lane count (callers in this file pass 0 for lists written
	//             without a lane count, e.g. { v0.b, v1.b })
	//   eltsize - element size in bits
	// The generated Name is "TypedVectorList<count>_<lanes><eltsize>"; note that
	// lanes and eltsize are concatenated without a separator.
	class TypedVecListAsmOperand<int count, string vecty, int lanes, int eltsize>
	: AsmOperandClass {
	let Name = "TypedVectorList" # count # "_" # lanes # eltsize;

	let PredicateMethod
	= "isTypedVectorList<RegKind::NeonVector, " # count # ", " # lanes # ", " # eltsize # ">";
	let RenderMethod = "addVectorListOperands<" # vecty # ", " # count # ">";
	}

	// Register operand over Reg whose print method is
	// printTypedVectorList<lanes, 'eltsize'>; here eltsize is the element-size
	// letter (e.g. "b"), emitted as a character literal.
	class TypedVecListRegOperand<RegisterClass Reg, int lanes, string eltsize>
	: RegisterOperand<Reg, "printTypedVectorList<" # lanes # ", '"
	# eltsize # "'>">;

	// Defines the full family of vector-list operands for lists of `count`
	// registers. Reg64 backs the 64-bit forms and Reg128 the 128-bit forms.
	// For every form an AsmOperandClass (suffix "_<form>AsmOperand") and a
	// RegisterOperand (suffix "<form>") are defined; the RegisterOperand looks up
	// its operand class by name through !cast on NAME # "_<form>AsmOperand".
	multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
	// With implicit types (probably on instruction instead). E.g. { v0, v1 }
	def _64AsmOperand : AsmOperandClass {
	let Name = NAME # "64";
	let PredicateMethod = "isImplicitlyTypedVectorList<RegKind::NeonVector, " # count # ">";
	let RenderMethod = "addVectorListOperands<AArch64Operand::VecListIdx_DReg, " # count # ">";
	}

	def "64" : RegisterOperand<Reg64, "printImplicitlyTypedVectorList"> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_64AsmOperand");
	}

	def _128AsmOperand : AsmOperandClass {
	let Name = NAME # "128";
	let PredicateMethod = "isImplicitlyTypedVectorList<RegKind::NeonVector, " # count # ">";
	let RenderMethod = "addVectorListOperands<AArch64Operand::VecListIdx_QReg, " # count # ">";
	}

	def "128" : RegisterOperand<Reg128, "printImplicitlyTypedVectorList"> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_128AsmOperand");
	}

	// 64-bit register lists with explicit type.

	// { v0.8b, v1.8b }
	def _8bAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_DReg", 8, 8>;
	def "8b" : TypedVecListRegOperand<Reg64, 8, "b"> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8bAsmOperand");
	}

	// { v0.4h, v1.4h }
	def _4hAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_DReg", 4, 16>;
	def "4h" : TypedVecListRegOperand<Reg64, 4, "h"> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4hAsmOperand");
	}

	// { v0.2s, v1.2s }
	def _2sAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_DReg", 2, 32>;
	def "2s" : TypedVecListRegOperand<Reg64, 2, "s"> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2sAsmOperand");
	}

	// { v0.1d, v1.1d }
	def _1dAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_DReg", 1, 64>;
	def "1d" : TypedVecListRegOperand<Reg64, 1, "d"> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_1dAsmOperand");
	}

	// 128-bit register lists with explicit type

	// { v0.16b, v1.16b }
	def _16bAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 16, 8>;
	def "16b" : TypedVecListRegOperand<Reg128, 16, "b"> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_16bAsmOperand");
	}

	// { v0.8h, v1.8h }
	def _8hAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 8, 16>;
	def "8h" : TypedVecListRegOperand<Reg128, 8, "h"> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8hAsmOperand");
	}

	// { v0.4s, v1.4s }
	def _4sAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 4, 32>;
	def "4s" : TypedVecListRegOperand<Reg128, 4, "s"> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4sAsmOperand");
	}

	// { v0.2d, v1.2d }
	def _2dAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 2, 64>;
	def "2d" : TypedVecListRegOperand<Reg128, 2, "d"> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2dAsmOperand");
	}

	// The remaining forms carry an element size but no lane count; they pass 0
	// as the lane count and use the 128-bit register class.

	// { v0.b, v1.b }
	def _bAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 0, 8>;
	def "b" : TypedVecListRegOperand<Reg128, 0, "b"> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_bAsmOperand");
	}

	// { v0.h, v1.h }
	def _hAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 0, 16>;
	def "h" : TypedVecListRegOperand<Reg128, 0, "h"> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_hAsmOperand");
	}

	// { v0.s, v1.s }
	def _sAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 0, 32>;
	def "s" : TypedVecListRegOperand<Reg128, 0, "s"> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_sAsmOperand");
	}

	// { v0.d, v1.d }
	def _dAsmOperand : TypedVecListAsmOperand<count, "AArch64Operand::VecListIdx_QReg", 0, 64>;
	def "d" : TypedVecListRegOperand<Reg128, 0, "d"> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_dAsmOperand");
	}


	}

	// Instantiate the vector-list operands for lists of one to four registers.
	// Single-register lists use the plain FPR64/FPR128 classes; longer lists use
	// the tuple classes of matching length (DD/QQ, DDD/QQQ, DDDD/QQQQ).
	defm VecListOne : VectorList<1, FPR64, FPR128>;
	defm VecListTwo : VectorList<2, DD, QQ>;
	defm VecListThree : VectorList<3, DDD, QQQ>;
	defm VecListFour : VectorList<4, DDDD, QQQQ>;

	// Assembler operand class for a scalar FP register class named RC. The
	// operand class is named "FPRAsmOperand<RC>" and rendered with
	// addRegOperands.
	// NOTE(review): despite its name, the isGPR64 predicate template is
	// instantiated here with FP register class IDs -- presumably it is a generic
	// "scalar register of class X" check; confirm in the asm parser.
	class FPRAsmOperand<string RC> : AsmOperandClass {
	let Name = "FPRAsmOperand" # RC;
	let PredicateMethod = "isGPR64<AArch64::" # RC # "RegClassID>";
	let RenderMethod = "addRegOperands";
	}

	// Register operand versions of the scalar FP registers.
	// Each one prints with printOperand and matches through an FPRAsmOperand
	// instantiated with the name of its own register class.
	def FPR8Op : RegisterOperand<FPR8, "printOperand"> {
	let ParserMatchClass = FPRAsmOperand<"FPR8">;
	}

	def FPR16Op : RegisterOperand<FPR16, "printOperand"> {
	let ParserMatchClass = FPRAsmOperand<"FPR16">;
	}

	def FPR16Op_lo : RegisterOperand<FPR16_lo, "printOperand"> {
	let ParserMatchClass = FPRAsmOperand<"FPR16_lo">;
	}

	def FPR32Op : RegisterOperand<FPR32, "printOperand"> {
	let ParserMatchClass = FPRAsmOperand<"FPR32">;
	}

	def FPR64Op : RegisterOperand<FPR64, "printOperand"> {
	let ParserMatchClass = FPRAsmOperand<"FPR64">;
	}

	def FPR128Op : RegisterOperand<FPR128, "printOperand"> {
	let ParserMatchClass = FPRAsmOperand<"FPR128">;
	}

	//===----------------------------------------------------------------------===//
	// ARMv8.1a atomic CASP register operands


	// Sequential GPR pairs. The first element is every second member of
	// GPR32/GPR64 starting with the first one (decimate by 2); the second element
	// is every second member of the same class rotated left by one. Each pair is
	// therefore an even-numbered register followed by the next register, and the
	// final pair ends with the zero register (W30/WZR, LR/XZR).
	// Sub-register slots are sube32/subo32 and sube64/subo64.
	def WSeqPairs : RegisterTuples<[sube32, subo32],
	[(decimate (rotl GPR32, 0), 2),
	(decimate (rotl GPR32, 1), 2)]>;
	def XSeqPairs : RegisterTuples<[sube64, subo64],
	[(decimate (rotl GPR64, 0), 2),
	(decimate (rotl GPR64, 1), 2)]>;

	// Untyped register classes over the pairs; Size is twice the element width.
	def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32,
	(add WSeqPairs)>{
	let Size = 64;
	}
	def XSeqPairsClass : RegisterClass<"AArch64", [untyped], 64,
	(add XSeqPairs)>{
	let Size = 128;
	}


	// Both operand classes share the same render method and the same custom
	// parser (tryParseGPRSeqPair); only the Name differs.
	let RenderMethod = "addRegOperands", ParserMethod="tryParseGPRSeqPair" in {
	def WSeqPairsAsmOperandClass : AsmOperandClass { let Name = "WSeqPair"; }
	def XSeqPairsAsmOperandClass : AsmOperandClass { let Name = "XSeqPair"; }
	}

	// Register operands for the pair classes; the print method template argument
	// is the element width in bits (32 or 64).
	def WSeqPairClassOperand :
	RegisterOperand<WSeqPairsClass, "printGPRSeqPairsClassOperand<32>"> {
	let ParserMatchClass = WSeqPairsAsmOperandClass;
	}
	def XSeqPairClassOperand :
	RegisterOperand<XSeqPairsClass, "printGPRSeqPairsClassOperand<64>"> {
	let ParserMatchClass = XSeqPairsAsmOperandClass;
	}


	//===----- END: v8.1a atomic CASP register operands -----------------------===//

	// SVE predicate registers
	// For each Pn the first AArch64Reg argument is n and the DWARF register
	// number is 48 + n (48-63).
	def P0 : AArch64Reg<0, "p0">, DwarfRegNum<[48]>;
	def P1 : AArch64Reg<1, "p1">, DwarfRegNum<[49]>;
	def P2 : AArch64Reg<2, "p2">, DwarfRegNum<[50]>;
	def P3 : AArch64Reg<3, "p3">, DwarfRegNum<[51]>;
	def P4 : AArch64Reg<4, "p4">, DwarfRegNum<[52]>;
	def P5 : AArch64Reg<5, "p5">, DwarfRegNum<[53]>;
	def P6 : AArch64Reg<6, "p6">, DwarfRegNum<[54]>;
	def P7 : AArch64Reg<7, "p7">, DwarfRegNum<[55]>;
	def P8 : AArch64Reg<8, "p8">, DwarfRegNum<[56]>;
	def P9 : AArch64Reg<9, "p9">, DwarfRegNum<[57]>;
	def P10 : AArch64Reg<10, "p10">, DwarfRegNum<[58]>;
	def P11 : AArch64Reg<11, "p11">, DwarfRegNum<[59]>;
	def P12 : AArch64Reg<12, "p12">, DwarfRegNum<[60]>;
	def P13 : AArch64Reg<13, "p13">, DwarfRegNum<[61]>;
	def P14 : AArch64Reg<14, "p14">, DwarfRegNum<[62]>;
	def P15 : AArch64Reg<15, "p15">, DwarfRegNum<[63]>;

	// The parts of the SVE registers that do not overlap the Neon registers.
	// These are only used as part of clobber lists.
	// They carry no DWARF register number and no sub-registers; each Zn register
	// lists the matching Zn_HI as one of its two sub-registers.
	def Z0_HI : AArch64Reg<0, "z0_hi">;
	def Z1_HI : AArch64Reg<1, "z1_hi">;
	def Z2_HI : AArch64Reg<2, "z2_hi">;
	def Z3_HI : AArch64Reg<3, "z3_hi">;
	def Z4_HI : AArch64Reg<4, "z4_hi">;
	def Z5_HI : AArch64Reg<5, "z5_hi">;
	def Z6_HI : AArch64Reg<6, "z6_hi">;
	def Z7_HI : AArch64Reg<7, "z7_hi">;
	def Z8_HI : AArch64Reg<8, "z8_hi">;
	def Z9_HI : AArch64Reg<9, "z9_hi">;
	def Z10_HI : AArch64Reg<10, "z10_hi">;
	def Z11_HI : AArch64Reg<11, "z11_hi">;
	def Z12_HI : AArch64Reg<12, "z12_hi">;
	def Z13_HI : AArch64Reg<13, "z13_hi">;
	def Z14_HI : AArch64Reg<14, "z14_hi">;
	def Z15_HI : AArch64Reg<15, "z15_hi">;
	def Z16_HI : AArch64Reg<16, "z16_hi">;
	def Z17_HI : AArch64Reg<17, "z17_hi">;
	def Z18_HI : AArch64Reg<18, "z18_hi">;
	def Z19_HI : AArch64Reg<19, "z19_hi">;
	def Z20_HI : AArch64Reg<20, "z20_hi">;
	def Z21_HI : AArch64Reg<21, "z21_hi">;
	def Z22_HI : AArch64Reg<22, "z22_hi">;
	def Z23_HI : AArch64Reg<23, "z23_hi">;
	def Z24_HI : AArch64Reg<24, "z24_hi">;
	def Z25_HI : AArch64Reg<25, "z25_hi">;
	def Z26_HI : AArch64Reg<26, "z26_hi">;
	def Z27_HI : AArch64Reg<27, "z27_hi">;
	def Z28_HI : AArch64Reg<28, "z28_hi">;
	def Z29_HI : AArch64Reg<29, "z29_hi">;
	def Z30_HI : AArch64Reg<30, "z30_hi">;
	def Z31_HI : AArch64Reg<31, "z31_hi">;

	// SVE variable-size vector registers
	// Each Zn has two sub-registers: Qn under index zsub and Zn_HI under index
	// zsub_hi. Unlike the H/S/D/Q registers, Zn has its own DWARF register
	// number, 96 + n (96-127), rather than an alias of Bn.
	let SubRegIndices = [zsub,zsub_hi] in {
	def Z0 : AArch64Reg<0, "z0", [Q0, Z0_HI]>, DwarfRegNum<[96]>;
	def Z1 : AArch64Reg<1, "z1", [Q1, Z1_HI]>, DwarfRegNum<[97]>;
	def Z2 : AArch64Reg<2, "z2", [Q2, Z2_HI]>, DwarfRegNum<[98]>;
	def Z3 : AArch64Reg<3, "z3", [Q3, Z3_HI]>, DwarfRegNum<[99]>;
	def Z4 : AArch64Reg<4, "z4", [Q4, Z4_HI]>, DwarfRegNum<[100]>;
	def Z5 : AArch64Reg<5, "z5", [Q5, Z5_HI]>, DwarfRegNum<[101]>;
	def Z6 : AArch64Reg<6, "z6", [Q6, Z6_HI]>, DwarfRegNum<[102]>;
	def Z7 : AArch64Reg<7, "z7", [Q7, Z7_HI]>, DwarfRegNum<[103]>;
	def Z8 : AArch64Reg<8, "z8", [Q8, Z8_HI]>, DwarfRegNum<[104]>;
	def Z9 : AArch64Reg<9, "z9", [Q9, Z9_HI]>, DwarfRegNum<[105]>;
	def Z10 : AArch64Reg<10, "z10", [Q10, Z10_HI]>, DwarfRegNum<[106]>;
	def Z11 : AArch64Reg<11, "z11", [Q11, Z11_HI]>, DwarfRegNum<[107]>;
	def Z12 : AArch64Reg<12, "z12", [Q12, Z12_HI]>, DwarfRegNum<[108]>;
	def Z13 : AArch64Reg<13, "z13", [Q13, Z13_HI]>, DwarfRegNum<[109]>;
	def Z14 : AArch64Reg<14, "z14", [Q14, Z14_HI]>, DwarfRegNum<[110]>;
	def Z15 : AArch64Reg<15, "z15", [Q15, Z15_HI]>, DwarfRegNum<[111]>;
	def Z16 : AArch64Reg<16, "z16", [Q16, Z16_HI]>, DwarfRegNum<[112]>;
	def Z17 : AArch64Reg<17, "z17", [Q17, Z17_HI]>, DwarfRegNum<[113]>;
	def Z18 : AArch64Reg<18, "z18", [Q18, Z18_HI]>, DwarfRegNum<[114]>;
	def Z19 : AArch64Reg<19, "z19", [Q19, Z19_HI]>, DwarfRegNum<[115]>;
	def Z20 : AArch64Reg<20, "z20", [Q20, Z20_HI]>, DwarfRegNum<[116]>;
	def Z21 : AArch64Reg<21, "z21", [Q21, Z21_HI]>, DwarfRegNum<[117]>;
	def Z22 : AArch64Reg<22, "z22", [Q22, Z22_HI]>, DwarfRegNum<[118]>;
	def Z23 : AArch64Reg<23, "z23", [Q23, Z23_HI]>, DwarfRegNum<[119]>;
	def Z24 : AArch64Reg<24, "z24", [Q24, Z24_HI]>, DwarfRegNum<[120]>;
	def Z25 : AArch64Reg<25, "z25", [Q25, Z25_HI]>, DwarfRegNum<[121]>;
	def Z26 : AArch64Reg<26, "z26", [Q26, Z26_HI]>, DwarfRegNum<[122]>;
	def Z27 : AArch64Reg<27, "z27", [Q27, Z27_HI]>, DwarfRegNum<[123]>;
	def Z28 : AArch64Reg<28, "z28", [Q28, Z28_HI]>, DwarfRegNum<[124]>;
	def Z29 : AArch64Reg<29, "z29", [Q29, Z29_HI]>, DwarfRegNum<[125]>;
	def Z30 : AArch64Reg<30, "z30", [Q30, Z30_HI]>, DwarfRegNum<[126]>;
	def Z31 : AArch64Reg<31, "z31", [Q31, Z31_HI]>, DwarfRegNum<[127]>;
	}

	// Enum describing the element size for destructive
	// operations.
	// The value is stored in a 3-bit field; the records below use 0 through 5.
	class ElementSizeEnum<bits<3> val> {
	bits<3> Value = val;
	}

	// 0 means "no element size"; 1-5 stand for B, H, S, D and Q in that order.
	def ElementSizeNone : ElementSizeEnum<0>;
	def ElementSizeB : ElementSizeEnum<1>;
	def ElementSizeH : ElementSizeEnum<2>;
	def ElementSizeS : ElementSizeEnum<3>;
	def ElementSizeD : ElementSizeEnum<4>;
	def ElementSizeQ : ElementSizeEnum<5>; // Unused

	// Common base for SVE register operands.
	//   Suffix - element-size suffix used when printing; empty for none
	//   C      - assembler operand class used for matching
	//   Size   - element size recorded in the ElementSize field
	//   RC     - underlying register class
	class SVERegOp <string Suffix, AsmOperandClass C,
	ElementSizeEnum Size,
	RegisterClass RC> : RegisterOperand<RC> {
	ElementSizeEnum ElementSize;

	let ElementSize = Size;
	// An empty suffix selects printSVERegOp<> with no template argument;
	// otherwise the suffix is passed as a character literal.
	let PrintMethod = !if(!eq(Suffix, ""),
	"printSVERegOp<>",
	"printSVERegOp<'" # Suffix # "'>");
	let ParserMatchClass = C;
	}

	// Predicate-register and vector-register flavours of SVERegOp. Neither adds
	// any field; they only give the two operand families distinct class names.
	class PPRRegOp <string Suffix, AsmOperandClass C, ElementSizeEnum Size,
	RegisterClass RC> : SVERegOp<Suffix, C, Size, RC> {}
	class ZPRRegOp <string Suffix, AsmOperandClass C, ElementSizeEnum Size,
	RegisterClass RC> : SVERegOp<Suffix, C, Size, RC> {}

	//******************************************************************************

	// SVE predicate register classes.
	// PPRClass<lastreg> contains P0 through P<lastreg> and accepts the four
	// scalable i1 predicate types listed below.
	class PPRClass<int lastreg> : RegisterClass<
	"AArch64",
	[ nxv16i1, nxv8i1, nxv4i1, nxv2i1 ], 16,
	(sequence "P%u", 0, lastreg)> {
	let Size = 16;
	}

	// PPR covers P0-P15; PPR_3b covers P0-P7.
	def PPR : PPRClass<15>;
	def PPR_3b : PPRClass<7>; // Restricted 3 bit SVE predicate register class.

	// Assembler operand class for an SVE predicate register.
	//   name     - spliced into the class name ("SVE<name>Reg") and into the
	//              diagnostic type ("InvalidSVE<name>Reg")
	//   RegClass - register class name used to form the RegClassID passed to
	//              the predicate
	//   Width    - element width in bits passed to the predicate; the "Any"
	//              records below pass 0
	// All instances are parsed by tryParseSVEPredicateVector.
	class PPRAsmOperand <string name, string RegClass, int Width>: AsmOperandClass {
	let Name = "SVE" # name # "Reg";
	let PredicateMethod = "isSVEPredicateVectorRegOfWidth<"
	# Width # ", " # "AArch64::" # RegClass # "RegClassID>";
	let DiagnosticType = "InvalidSVE" # name # "Reg";
	let RenderMethod = "addRegOperands";
	let ParserMethod = "tryParseSVEPredicateVector";
	}

	// Operand classes over the full PPR class, one per element width.
	def PPRAsmOpAny : PPRAsmOperand<"PredicateAny", "PPR", 0>;
	def PPRAsmOp8 : PPRAsmOperand<"PredicateB", "PPR", 8>;
	def PPRAsmOp16 : PPRAsmOperand<"PredicateH", "PPR", 16>;
	def PPRAsmOp32 : PPRAsmOperand<"PredicateS", "PPR", 32>;
	def PPRAsmOp64 : PPRAsmOperand<"PredicateD", "PPR", 64>;

	// Register operands over PPR; suffix, operand class and element size agree
	// for each width, and PPRAny has no suffix and no element size.
	def PPRAny : PPRRegOp<"", PPRAsmOpAny, ElementSizeNone, PPR>;
	def PPR8 : PPRRegOp<"b", PPRAsmOp8, ElementSizeB, PPR>;
	def PPR16 : PPRRegOp<"h", PPRAsmOp16, ElementSizeH, PPR>;
	def PPR32 : PPRRegOp<"s", PPRAsmOp32, ElementSizeS, PPR>;
	def PPR64 : PPRRegOp<"d", PPRAsmOp64, ElementSizeD, PPR>;

	// The same set again for the restricted PPR_3b class (P0-P7).
	def PPRAsmOp3bAny : PPRAsmOperand<"Predicate3bAny", "PPR_3b", 0>;
	def PPRAsmOp3b8 : PPRAsmOperand<"Predicate3bB", "PPR_3b", 8>;
	def PPRAsmOp3b16 : PPRAsmOperand<"Predicate3bH", "PPR_3b", 16>;
	def PPRAsmOp3b32 : PPRAsmOperand<"Predicate3bS", "PPR_3b", 32>;
	def PPRAsmOp3b64 : PPRAsmOperand<"Predicate3bD", "PPR_3b", 64>;

	def PPR3bAny : PPRRegOp<"", PPRAsmOp3bAny, ElementSizeNone, PPR_3b>;
	def PPR3b8 : PPRRegOp<"b", PPRAsmOp3b8, ElementSizeB, PPR_3b>;
	def PPR3b16 : PPRRegOp<"h", PPRAsmOp3b16, ElementSizeH, PPR_3b>;
	def PPR3b32 : PPRRegOp<"s", PPRAsmOp3b32, ElementSizeS, PPR_3b>;
	def PPR3b64 : PPRRegOp<"d", PPRAsmOp3b64, ElementSizeD, PPR_3b>;

	//******************************************************************************

	// SVE vector register classes.
	//
	// ZPRClass<lastreg> is a register class in the "AArch64" namespace whose
	// members come from (sequence "Z%u", 0, lastreg). The value-type list below
	// is kept in its original order.
	class ZPRClass<int lastreg>
	  : RegisterClass<"AArch64",
	                  [nxv16i8, nxv8i16, nxv4i32, nxv2i64,
	                   nxv2f16, nxv4f16, nxv8f16,
	                   nxv2bf16, nxv4bf16, nxv8bf16,
	                   nxv2f32, nxv4f32,
	                   nxv2f64],
	                  128,
	                  (sequence "Z%u", 0, lastreg)> {
	  let Size = 128;
	}

	// Unrestricted SVE vector register class (ZPRClass<31>).
	def ZPR : ZPRClass<31>;
	def ZPR_4b : ZPRClass<15>; // Restricted 4 bit SVE vector register class.
	def ZPR_3b : ZPRClass<7>; // Restricted 3 bit SVE vector register class.

	// Assembler operand class for an SVE data vector register.
	//   name           - spliced into the class name ("SVE<name>Reg").
	//   Width          - element width; part of the predicate method and the
	//                    diagnostic type. A Width of 0 selects "false" as the
	//                    second template argument of the parser method, any
	//                    other value selects "true".
	//   RegClassSuffix - appended to "ZPR" to pick the register class ID (for
	//                    example "_3b"); also part of the diagnostic type.
	class ZPRAsmOperand<string name, int Width, string RegClassSuffix = "">
	  : AsmOperandClass {
	  let Name = "SVE" # name # "Reg";
	  let DiagnosticType = "InvalidZPR" # RegClassSuffix # Width;
	  let RenderMethod = "addRegOperands";
	  let ParserMethod = "tryParseSVEDataVector<false, "
	                     # !if(!eq(Width, 0), "false", "true") # ">";
	  let PredicateMethod = "isSVEDataVectorRegOfWidth<"
	                        # Width # ", AArch64::ZPR"
	                        # RegClassSuffix # "RegClassID>";
	}

	// Assembler operand classes over the full ZPR class (default RegClassSuffix):
	// Width 0 for "Any" and 8/16/32/64/128 for the B/H/S/D/Q forms.
	def ZPRAsmOpAny : ZPRAsmOperand<"VectorAny", 0>;
	def ZPRAsmOp8 : ZPRAsmOperand<"VectorB", 8>;
	def ZPRAsmOp16 : ZPRAsmOperand<"VectorH", 16>;
	def ZPRAsmOp32 : ZPRAsmOperand<"VectorS", 32>;
	def ZPRAsmOp64 : ZPRAsmOperand<"VectorD", 64>;
	def ZPRAsmOp128 : ZPRAsmOperand<"VectorQ", 128>;

	// Register operands over ZPR, each pairing a suffix ("", b, h, s, d, q) with
	// the matching assembler operand class and element size.
	def ZPRAny : ZPRRegOp<"", ZPRAsmOpAny, ElementSizeNone, ZPR>;
	def ZPR8 : ZPRRegOp<"b", ZPRAsmOp8, ElementSizeB, ZPR>;
	def ZPR16 : ZPRRegOp<"h", ZPRAsmOp16, ElementSizeH, ZPR>;
	def ZPR32 : ZPRRegOp<"s", ZPRAsmOp32, ElementSizeS, ZPR>;
	def ZPR64 : ZPRRegOp<"d", ZPRAsmOp64, ElementSizeD, ZPR>;
	def ZPR128 : ZPRRegOp<"q", ZPRAsmOp128, ElementSizeQ, ZPR>;

	// Operand classes and register operands restricted to ZPR_3b. Only the
	// B/H/S widths are defined here.
	def ZPRAsmOp3b8 : ZPRAsmOperand<"Vector3bB", 8, "_3b">;
	def ZPRAsmOp3b16 : ZPRAsmOperand<"Vector3bH", 16, "_3b">;
	def ZPRAsmOp3b32 : ZPRAsmOperand<"Vector3bS", 32, "_3b">;

	def ZPR3b8 : ZPRRegOp<"b", ZPRAsmOp3b8, ElementSizeB, ZPR_3b>;
	def ZPR3b16 : ZPRRegOp<"h", ZPRAsmOp3b16, ElementSizeH, ZPR_3b>;
	def ZPR3b32 : ZPRRegOp<"s", ZPRAsmOp3b32, ElementSizeS, ZPR_3b>;

	// Operand classes and register operands restricted to ZPR_4b. Only the
	// H/S/D widths are defined here.
	def ZPRAsmOp4b16 : ZPRAsmOperand<"Vector4bH", 16, "_4b">;
	def ZPRAsmOp4b32 : ZPRAsmOperand<"Vector4bS", 32, "_4b">;
	def ZPRAsmOp4b64 : ZPRAsmOperand<"Vector4bD", 64, "_4b">;

	def ZPR4b16 : ZPRRegOp<"h", ZPRAsmOp4b16, ElementSizeH, ZPR_4b>;
	def ZPR4b32 : ZPRRegOp<"s", ZPRAsmOp4b32, ElementSizeS, ZPR_4b>;
	def ZPR4b64 : ZPRRegOp<"d", ZPRAsmOp4b64, ElementSizeD, ZPR_4b>;

	// Assembler operand class named "FPR<Width>asZPR". Its predicate method is
	// instantiated with AArch64::FPR<Width>RegClassID and its render method with
	// Width.
	class FPRasZPR<int Width> : AsmOperandClass {
	  let Name = "FPR" # Width # "asZPR";
	  let RenderMethod = "addFPRasZPRRegOperands<" # Width # ">";
	  let PredicateMethod = "isFPRasZPR<AArch64::FPR" # Width # "RegClassID>";
	}

	// Register operand over ZPR that is matched through FPRasZPR<Width> and
	// printed with printZPRasFPR<Width>.
	class FPRasZPROperand<int Width> : RegisterOperand<ZPR> {
	  let PrintMethod = "printZPRasFPR<" # Width # ">";
	  let ParserMatchClass = FPRasZPR<Width>;
	}

	// One FPRasZPROperand per width: 8, 16, 32, 64 and 128.
	def FPR8asZPR : FPRasZPROperand<8>;
	def FPR16asZPR : FPRasZPROperand<16>;
	def FPR32asZPR : FPRasZPROperand<32>;
	def FPR64asZPR : FPRasZPROperand<64>;
	def FPR128asZPR : FPRasZPROperand<128>;

	// Four sub-register indices in the "AArch64" namespace, each declared as
	// SubRegIndex<128, -1>. They index the elements of the Z register tuples
	// defined below.
	let Namespace = "AArch64" in {
	def zsub0 : SubRegIndex<128, -1>;
	def zsub1 : SubRegIndex<128, -1>;
	def zsub2 : SubRegIndex<128, -1>;
	def zsub3 : SubRegIndex<128, -1>;
	}

	// Pairs, triples, and quads of SVE vector registers.
	// Element i of each tuple is taken from (rotl ZPR, i) and is addressed
	// through sub-register index zsub<i>.
	def ZSeqPairs : RegisterTuples<[zsub0, zsub1], [(rotl ZPR, 0), (rotl ZPR, 1)]>;
	def ZSeqTriples : RegisterTuples<[zsub0, zsub1, zsub2], [(rotl ZPR, 0), (rotl ZPR, 1), (rotl ZPR, 2)]>;
	def ZSeqQuads : RegisterTuples<[zsub0, zsub1, zsub2, zsub3], [(rotl ZPR, 0), (rotl ZPR, 1), (rotl ZPR, 2), (rotl ZPR, 3)]>;

	// Untyped register classes built from the pair/triple/quad tuples above.
	// Size is overridden to 256, 384 and 512 respectively (2x, 3x and 4x 128).
	def ZPR2 : RegisterClass<"AArch64", [untyped], 128, (add ZSeqPairs)> {
	let Size = 256;
	}
	def ZPR3 : RegisterClass<"AArch64", [untyped], 128, (add ZSeqTriples)> {
	let Size = 384;
	}
	def ZPR4 : RegisterClass<"AArch64", [untyped], 128, (add ZSeqQuads)> {
	let Size = 512;
	}

	// Assembler operand class for a list of NumRegs SVE data vectors with
	// elements of ElementWidth bits. Both values are spliced into the class name
	// and into the predicate method's template arguments; NumRegs is also passed
	// to the render method.
	class ZPRVectorList<int ElementWidth, int NumRegs> : AsmOperandClass {
	  let Name = "SVEVectorList" # NumRegs # ElementWidth;
	  let PredicateMethod = "isTypedVectorList<RegKind::SVEDataVector, "
	                        # NumRegs # ", 0, " # ElementWidth # ">";
	  let RenderMethod = "addVectorListOperands<AArch64Operand::VecListIdx_ZReg, "
	                     # NumRegs # ">";
	  let ParserMethod = "tryParseVectorList<RegKind::SVEDataVector>";
	}

	// Single-register vector list operands over ZPR, one per element suffix
	// (b/h/s/d). Each is matched through ZPRVectorList<width, 1> and printed
	// with printTypedVectorList<0, suffix>.
	def Z_b : RegisterOperand<ZPR, "printTypedVectorList<0,'b'>"> {
	let ParserMatchClass = ZPRVectorList<8, 1>;
	}

	def Z_h : RegisterOperand<ZPR, "printTypedVectorList<0,'h'>"> {
	let ParserMatchClass = ZPRVectorList<16, 1>;
	}

	def Z_s : RegisterOperand<ZPR, "printTypedVectorList<0,'s'>"> {
	let ParserMatchClass = ZPRVectorList<32, 1>;
	}

	def Z_d : RegisterOperand<ZPR, "printTypedVectorList<0,'d'>"> {
	let ParserMatchClass = ZPRVectorList<64, 1>;
	}

	// Two-register vector list operands over ZPR2, matched through
	// ZPRVectorList<width, 2>.
	def ZZ_b : RegisterOperand<ZPR2, "printTypedVectorList<0,'b'>"> {
	let ParserMatchClass = ZPRVectorList<8, 2>;
	}

	def ZZ_h : RegisterOperand<ZPR2, "printTypedVectorList<0,'h'>"> {
	let ParserMatchClass = ZPRVectorList<16, 2>;
	}

	def ZZ_s : RegisterOperand<ZPR2, "printTypedVectorList<0,'s'>"> {
	let ParserMatchClass = ZPRVectorList<32, 2>;
	}

	def ZZ_d : RegisterOperand<ZPR2, "printTypedVectorList<0,'d'>"> {
	let ParserMatchClass = ZPRVectorList<64, 2>;
	}

	// Three-register vector list operands over ZPR3, matched through
	// ZPRVectorList<width, 3>.
	def ZZZ_b : RegisterOperand<ZPR3, "printTypedVectorList<0,'b'>"> {
	let ParserMatchClass = ZPRVectorList<8, 3>;
	}

	def ZZZ_h : RegisterOperand<ZPR3, "printTypedVectorList<0,'h'>"> {
	let ParserMatchClass = ZPRVectorList<16, 3>;
	}

	def ZZZ_s : RegisterOperand<ZPR3, "printTypedVectorList<0,'s'>"> {
	let ParserMatchClass = ZPRVectorList<32, 3>;
	}

	def ZZZ_d : RegisterOperand<ZPR3, "printTypedVectorList<0,'d'>"> {
	let ParserMatchClass = ZPRVectorList<64, 3>;
	}

	// Four-register vector list operands over ZPR4, matched through
	// ZPRVectorList<width, 4>.
	def ZZZZ_b : RegisterOperand<ZPR4, "printTypedVectorList<0,'b'>"> {
	let ParserMatchClass = ZPRVectorList<8, 4>;
	}

	def ZZZZ_h : RegisterOperand<ZPR4, "printTypedVectorList<0,'h'>"> {
	let ParserMatchClass = ZPRVectorList<16, 4>;
	}

	def ZZZZ_s : RegisterOperand<ZPR4, "printTypedVectorList<0,'s'>"> {
	let ParserMatchClass = ZPRVectorList<32, 4>;
	}

	def ZZZZ_d : RegisterOperand<ZPR4, "printTypedVectorList<0,'d'>"> {
	let ParserMatchClass = ZPRVectorList<64, 4>;
	}

	// Assembler operand class for an SVE data vector register written with a
	// shift/extend modifier.
	//   ShiftExtend     - modifier name; used in the class name, the diagnostic
	//                     type and, as AArch64_AM::<ShiftExtend>, in the
	//                     predicate method.
	//   RegWidth, Scale - spliced into the class name, the diagnostic type and
	//                     the predicate method's template arguments.
	//   ScaleAlwaysSame - when set, appends "Only" to the class name and passes
	//                     "true" instead of "false" as the predicate method's
	//                     last template argument. The diagnostic type does not
	//                     depend on it.
	class ZPRExtendAsmOperand<string ShiftExtend, int RegWidth, int Scale,
	                          bit ScaleAlwaysSame = 0b0>
	  : AsmOperandClass {
	  let Name = "ZPRExtend" # ShiftExtend # RegWidth # Scale
	             # !if(ScaleAlwaysSame, "Only", "");
	  let DiagnosticType = "InvalidZPR" # RegWidth # ShiftExtend # Scale;
	  let ParserMethod = "tryParseSVEDataVector<true, true>";
	  let RenderMethod = "addRegOperands";
	  let PredicateMethod = "isSVEDataVectorRegWithShiftExtend<"
	                        # RegWidth # ", AArch64::ZPRRegClassID, "
	                        # "AArch64_AM::" # ShiftExtend # ", "
	                        # Scale # ", "
	                        # !if(ScaleAlwaysSame, "true", "false")
	                        # ">";
	}

	// Register operand over ZPR with a shift/extend modifier.
	// The matching assembler operand class is looked up by name:
	//   "ZPR" # RegWidth # "AsmOpndExt" # Repr # Scale # Suffix
	// so a record with that name must exist. The print method receives, in
	// order: SignExtend as "true"/"false", Scale, 'x' when IsLSL is set and 'w'
	// otherwise, and 's' when RegWidth is 32 and 'd' otherwise.
	class ZPRExtendRegisterOperand<bit SignExtend, bit IsLSL, string Repr,
	                               int RegWidth, int Scale, string Suffix = "">
	  : RegisterOperand<ZPR> {
	  let PrintMethod = "printRegWithShiftExtend<"
	                    # !if(SignExtend, "true", "false") # ", "
	                    # Scale # ", "
	                    # !if(IsLSL, "'x'", "'w'") # ", "
	                    # !if(!eq(RegWidth, 32), "'s'", "'d'") # ">";
	  let ParserMatchClass = !cast<AsmOperandClass>(
	      "ZPR" # RegWidth # "AsmOpndExt" # Repr # Scale # Suffix);
	}

	// For each register width (32 and 64) define the shift/extend assembler
	// operand classes and the register operands that reference them by name.
	// The *AsmOpndExt* records must keep these exact names because
	// ZPRExtendRegisterOperand resolves them through !cast.
	foreach RegWidth = [32, 64] in {
	// UXTW(8|16|32|64)
	def ZPR#RegWidth#AsmOpndExtUXTW8Only : ZPRExtendAsmOperand<"UXTW", RegWidth, 8, 0b1>;
	def ZPR#RegWidth#AsmOpndExtUXTW8 : ZPRExtendAsmOperand<"UXTW", RegWidth, 8>;
	def ZPR#RegWidth#AsmOpndExtUXTW16 : ZPRExtendAsmOperand<"UXTW", RegWidth, 16>;
	def ZPR#RegWidth#AsmOpndExtUXTW32 : ZPRExtendAsmOperand<"UXTW", RegWidth, 32>;
	def ZPR#RegWidth#AsmOpndExtUXTW64 : ZPRExtendAsmOperand<"UXTW", RegWidth, 64>;

	def ZPR#RegWidth#ExtUXTW8Only : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 8, "Only">;
	def ZPR#RegWidth#ExtUXTW8 : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 8>;
	def ZPR#RegWidth#ExtUXTW16 : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 16>;
	def ZPR#RegWidth#ExtUXTW32 : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 32>;
	def ZPR#RegWidth#ExtUXTW64 : ZPRExtendRegisterOperand<0b0, 0b0, "UXTW", RegWidth, 64>;

	// SXTW(8|16|32|64)
	def ZPR#RegWidth#AsmOpndExtSXTW8Only : ZPRExtendAsmOperand<"SXTW", RegWidth, 8, 0b1>;
	def ZPR#RegWidth#AsmOpndExtSXTW8 : ZPRExtendAsmOperand<"SXTW", RegWidth, 8>;
	def ZPR#RegWidth#AsmOpndExtSXTW16 : ZPRExtendAsmOperand<"SXTW", RegWidth, 16>;
	def ZPR#RegWidth#AsmOpndExtSXTW32 : ZPRExtendAsmOperand<"SXTW", RegWidth, 32>;
	def ZPR#RegWidth#AsmOpndExtSXTW64 : ZPRExtendAsmOperand<"SXTW", RegWidth, 64>;

	def ZPR#RegWidth#ExtSXTW8Only : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 8, "Only">;
	def ZPR#RegWidth#ExtSXTW8 : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 8>;
	def ZPR#RegWidth#ExtSXTW16 : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 16>;
	def ZPR#RegWidth#ExtSXTW32 : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 32>;
	def ZPR#RegWidth#ExtSXTW64 : ZPRExtendRegisterOperand<0b1, 0b0, "SXTW", RegWidth, 64>;

	// LSL(8|16|32|64)
	// No "Only" variants are defined for LSL.
	def ZPR#RegWidth#AsmOpndExtLSL8 : ZPRExtendAsmOperand<"LSL", RegWidth, 8>;
	def ZPR#RegWidth#AsmOpndExtLSL16 : ZPRExtendAsmOperand<"LSL", RegWidth, 16>;
	def ZPR#RegWidth#AsmOpndExtLSL32 : ZPRExtendAsmOperand<"LSL", RegWidth, 32>;
	def ZPR#RegWidth#AsmOpndExtLSL64 : ZPRExtendAsmOperand<"LSL", RegWidth, 64>;
	def ZPR#RegWidth#ExtLSL8 : ZPRExtendRegisterOperand<0b0, 0b1, "LSL", RegWidth, 8>;
	def ZPR#RegWidth#ExtLSL16 : ZPRExtendRegisterOperand<0b0, 0b1, "LSL", RegWidth, 16>;
	def ZPR#RegWidth#ExtLSL32 : ZPRExtendRegisterOperand<0b0, 0b1, "LSL", RegWidth, 32>;
	def ZPR#RegWidth#ExtLSL64 : ZPRExtendRegisterOperand<0b0, 0b1, "LSL", RegWidth, 64>;
	}

	// Assembler operand class for a 64-bit GPR written with a shift/extend.
	//   AsmOperandName - base name; the class is named <AsmOperandName><Scale>
	//                    and its diagnostic type is "Invalid" followed by that.
	//   Scale          - second template argument of the predicate method.
	//   RegClass       - register class name; appears in the predicate method
	//                    as AArch64::<RegClass>RegClassID.
	class GPR64ShiftExtendAsmOperand <string AsmOperandName, int Scale, string RegClass>
	  : AsmOperandClass {
	  let Name = AsmOperandName # Scale;
	  let DiagnosticType = "Invalid" # AsmOperandName # Scale;
	  let ParserMethod = "tryParseGPROperand<true>";
	  let RenderMethod = "addRegOperands";
	  let PredicateMethod = "isGPR64WithShiftExtend<AArch64::" # RegClass
	                        # "RegClassID, " # Scale # ">";
	}

	// Register operand over RegClass with a shift/extend. Name is the name of an
	// existing AsmOperandClass record, resolved through !cast; Scale is passed to
	// the print method.
	class GPR64ExtendRegisterOperand<string Name, int Scale, RegisterClass RegClass>
	  : RegisterOperand<RegClass> {
	  let PrintMethod = "printRegWithShiftExtend<false, " # Scale # ", 'x', 0>";
	  let ParserMatchClass = !cast<AsmOperandClass>(Name);
	}

	// For each scale define an assembler operand class and the register operand
	// that refers to it by name, once over GPR64 and once over GPR64common (the
	// "NoXZR" variants).
	foreach Scale = [8, 16, 32, 64] in {
	def GPR64shiftedAsmOpnd # Scale : GPR64ShiftExtendAsmOperand<"GPR64shifted", Scale, "GPR64">;
	def GPR64shifted # Scale : GPR64ExtendRegisterOperand<"GPR64shiftedAsmOpnd" # Scale, Scale, GPR64>;

	def GPR64NoXZRshiftedAsmOpnd # Scale : GPR64ShiftExtendAsmOperand<"GPR64NoXZRshifted", Scale, "GPR64common">;
	def GPR64NoXZRshifted # Scale : GPR64ExtendRegisterOperand<"GPR64NoXZRshiftedAsmOpnd" # Scale, Scale, GPR64common>;
	}
	diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
	index 3449a8bd16d2..4f29f2f18185 100644
	--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
	+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
	@@ -1,2605 +1,2605 @@
	//=- AArch64SVEInstrInfo.td - AArch64 SVE Instructions -*- tablegen -*-----=//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// AArch64 Scalable Vector Extension (SVE) Instruction definitions.
	//
	//===----------------------------------------------------------------------===//

	// For predicated nodes where the entire operation is controlled by a governing
	// predicate, please stick to a similar naming convention as used for the
	// ISD nodes:
	//
	// SDNode <=> AArch64ISD
	// -------------------------------
	// _m<n> <=> _MERGE_OP<n>
	// _mt <=> _MERGE_PASSTHRU
	// _z <=> _MERGE_ZERO
	// _p <=> _PRED
	//
	// Given the context of this file, it is not strictly necessary to use _p to
	// distinguish predicated from unpredicated nodes given that most SVE
	// instructions are predicated.

	// Contiguous loads - node definitions
	//
	// Type profile: one result and three operands. Result 0 and operand 1 are
	// vectors with the same number of elements, operand 1 has i1 elements and
	// operand 2 is a pointer.
	def SDT_AArch64_LD1 : SDTypeProfile<1, 3, [
	SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
	SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
	]>;

	def AArch64ld1_z : SDNode<"AArch64ISD::LD1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
	def AArch64ld1s_z : SDNode<"AArch64ISD::LD1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;

	// Non-faulting & first-faulting loads - node definitions
	//
	// These reuse SDT_AArch64_LD1 and, unlike the plain contiguous loads, also
	// carry SDNPOutGlue.
	def AArch64ldnf1_z : SDNode<"AArch64ISD::LDNF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64ldff1_z : SDNode<"AArch64ISD::LDFF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;

	def AArch64ldnf1s_z : SDNode<"AArch64ISD::LDNF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64ldff1s_z : SDNode<"AArch64ISD::LDFF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;

	// Contiguous load and replicate - node definitions
	//

	// Type profile: one result and two operands, with the same constraints as
	// SDT_AArch64_LD1 (operand 1 is an i1 vector, operand 2 is a pointer).
	def SDT_AArch64_LD1Replicate : SDTypeProfile<1, 2, [
	SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
	SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
	]>;

	def AArch64ld1rq_z : SDNode<"AArch64ISD::LD1RQ_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>;
	def AArch64ld1ro_z : SDNode<"AArch64ISD::LD1RO_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>;

	// Gather loads - node definitions
	//
	// _SV profile: operand 2 is a pointer and operand 3 is a vector.
	def SDT_AArch64_GATHER_SV : SDTypeProfile<1, 4, [
	SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
	SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
	]>;

	// _VS profile: operand 2 is a vector and operand 3 is an integer.
	def SDT_AArch64_GATHER_VS : SDTypeProfile<1, 4, [
	SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>,
	SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
	]>;

	// GLD1 gather nodes. Every variant uses the _SV profile except the _imm
	// form, which uses _VS.
	def AArch64ld1_gather_z : SDNode<"AArch64ISD::GLD1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
	def AArch64ld1_gather_scaled_z : SDNode<"AArch64ISD::GLD1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
	def AArch64ld1_gather_uxtw_z : SDNode<"AArch64ISD::GLD1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
	def AArch64ld1_gather_sxtw_z : SDNode<"AArch64ISD::GLD1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
	def AArch64ld1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
	def AArch64ld1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
	def AArch64ld1_gather_imm_z : SDNode<"AArch64ISD::GLD1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;

	// GLD1S gather nodes, with the same variants as GLD1.
	def AArch64ld1s_gather_z : SDNode<"AArch64ISD::GLD1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
	def AArch64ld1s_gather_scaled_z : SDNode<"AArch64ISD::GLD1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
	def AArch64ld1s_gather_uxtw_z : SDNode<"AArch64ISD::GLD1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
	def AArch64ld1s_gather_sxtw_z : SDNode<"AArch64ISD::GLD1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
	def AArch64ld1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
	def AArch64ld1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
	def AArch64ld1s_gather_imm_z : SDNode<"AArch64ISD::GLD1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;

	// GLDFF1 gather nodes. Same variants as GLD1, but each also carries
	// SDNPOptInGlue and SDNPOutGlue.
	def AArch64ldff1_gather_z : SDNode<"AArch64ISD::GLDFF1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64ldff1_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64ldff1_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64ldff1_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64ldff1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64ldff1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64ldff1_gather_imm_z : SDNode<"AArch64ISD::GLDFF1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;

	// GLDFF1S gather nodes, with the same variants and properties as GLDFF1.
	def AArch64ldff1s_gather_z : SDNode<"AArch64ISD::GLDFF1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64ldff1s_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64ldff1s_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64ldff1s_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64ldff1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64ldff1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64ldff1s_gather_imm_z : SDNode<"AArch64ISD::GLDFF1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;

	// GLDNT1 / GLDNT1S gather nodes; both use the _VS profile.
	def AArch64ldnt1_gather_z : SDNode<"AArch64ISD::GLDNT1_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
	def AArch64ldnt1s_gather_z : SDNode<"AArch64ISD::GLDNT1S_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;

	// Contiguous stores - node definitions
	//
	// Type profile: no results and four operands. Operand 0 is a vector,
	// operand 1 a pointer, and operand 2 an i1 vector with as many elements as
	// operand 0. Operand 3 is left unconstrained here.
	def SDT_AArch64_ST1 : SDTypeProfile<0, 4, [
	SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>,
	SDTCVecEltisVT<2,i1>, SDTCisSameNumEltsAs<0,2>
	]>;

	def AArch64st1 : SDNode<"AArch64ISD::ST1_PRED", SDT_AArch64_ST1, [SDNPHasChain, SDNPMayStore]>;

	// Scatter stores - node definitions
	//
	// _SV profile: no results, five operands; operand 2 is a pointer and
	// operand 3 is a vector.
	def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [
	SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
	SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
	]>;

	// _VS profile: operand 2 is a vector and operand 3 is an integer.
	def SDT_AArch64_SCATTER_VS : SDTypeProfile<0, 5, [
	SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>,
	SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
	]>;

	// SST1 scatter nodes. Every variant uses the _SV profile except the _imm
	// form, which uses _VS.
	def AArch64st1_scatter : SDNode<"AArch64ISD::SST1_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
	def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
	def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
	def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
	def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
	def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
	def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;

	// SSTNT1 scatter node; uses the _VS profile.
	def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;

	// AArch64 SVE/SVE2 - the remaining node definitions
	//

	// SVE CNT/INC/RDVL
	// i32 complex patterns with one operand, each selected by an instantiation
	// of SelectRDVLImm. The three template arguments differ only as shown.
	def sve_rdvl_imm : ComplexPattern<i32, 1, "SelectRDVLImm<-32, 31, 16>">;
	def sve_cnth_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 8>">;
	def sve_cntw_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 4>">;
	def sve_cntd_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 2>">;

	// SVE DEC
	// Same as the CNT patterns above but with the last template argument negated.
	def sve_cnth_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -8>">;
	def sve_cntw_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -4>">;
	def sve_cntd_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -2>">;

	// Reduction profile: one result and two operands, both operands being
	// vectors. The result type is left unconstrained.
	def SDT_AArch64Reduce : SDTypeProfile<1, 2, [SDTCisVec<1>, SDTCisVec<2>]>;
	def AArch64faddv_p : SDNode<"AArch64ISD::FADDV_PRED", SDT_AArch64Reduce>;
	def AArch64fmaxv_p : SDNode<"AArch64ISD::FMAXV_PRED", SDT_AArch64Reduce>;
	def AArch64fmaxnmv_p : SDNode<"AArch64ISD::FMAXNMV_PRED", SDT_AArch64Reduce>;
	def AArch64fminv_p : SDNode<"AArch64ISD::FMINV_PRED", SDT_AArch64Reduce>;
	def AArch64fminnmv_p : SDNode<"AArch64ISD::FMINNMV_PRED", SDT_AArch64Reduce>;
	def AArch64smaxv_p : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>;
	def AArch64umaxv_p : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>;
	def AArch64sminv_p : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>;
	def AArch64uminv_p : SDNode<"AArch64ISD::UMINV_PRED", SDT_AArch64Reduce>;
	def AArch64orv_p : SDNode<"AArch64ISD::ORV_PRED", SDT_AArch64Reduce>;
	def AArch64eorv_p : SDNode<"AArch64ISD::EORV_PRED", SDT_AArch64Reduce>;
	def AArch64andv_p : SDNode<"AArch64ISD::ANDV_PRED", SDT_AArch64Reduce>;
	def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>;
	def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>;

	// Predicated binary arithmetic profile: one result and three operands, all
	// vectors. Operand 1 has i1 elements; operands 2 and 3 have the same type.
	def SDT_AArch64Arith : SDTypeProfile<1, 3, [
	SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
	SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>
	]>;

	// Predicated ternary profile: as above with a fourth operand; operands 2, 3
	// and 4 all have the same type.
	def SDT_AArch64FMA : SDTypeProfile<1, 4, [
	SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCisVec<4>,
	SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>, SDTCisSameAs<3,4>
	]>;

	// Predicated operations with the result of inactive lanes being unspecified.
	// All use SDT_AArch64Arith except FMA_PRED, which uses SDT_AArch64FMA.
	def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>;
	def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>;
	def AArch64fma_p : SDNode<"AArch64ISD::FMA_PRED", SDT_AArch64FMA>;
	def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;
	def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;

	// Merging op1 into the inactive lanes.
	// Note that the lsl/lsr/asr nodes map to the SHL/SRL/SRA ISD opcodes.
	def AArch64smin_m1 : SDNode<"AArch64ISD::SMIN_MERGE_OP1", SDT_AArch64Arith>;
	def AArch64umin_m1 : SDNode<"AArch64ISD::UMIN_MERGE_OP1", SDT_AArch64Arith>;
	def AArch64smax_m1 : SDNode<"AArch64ISD::SMAX_MERGE_OP1", SDT_AArch64Arith>;
	def AArch64umax_m1 : SDNode<"AArch64ISD::UMAX_MERGE_OP1", SDT_AArch64Arith>;
	def AArch64lsl_m1 : SDNode<"AArch64ISD::SHL_MERGE_OP1", SDT_AArch64Arith>;
	def AArch64lsr_m1 : SDNode<"AArch64ISD::SRL_MERGE_OP1", SDT_AArch64Arith>;
	def AArch64asr_m1 : SDNode<"AArch64ISD::SRA_MERGE_OP1", SDT_AArch64Arith>;

	// Reduction with an extra operand: one result and three operands, of which
	// operands 1 and 3 are vectors. Operand 2 and the result are unconstrained.
	def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>;
	def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>;
	def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>;
	def AArch64fadda_p : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>;

	// REV: one vector result with the same type as its single operand.
	def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
	def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>;

	// PTEST: no result; two vector operands of the same type.
	def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
	def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;

	// DUP_MERGE_PASSTHRU: vector result with the same type as operand 3;
	// operand 1 is an i1 vector. Operand 2 is unconstrained.
	def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>]>;
	def AArch64dup_mt : SDNode<"AArch64ISD::DUP_MERGE_PASSTHRU", SDT_AArch64DUP_PRED>;

	// INDEX_VECTOR: vector result; two integer operands of the same type.
	def SDT_IndexVector : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<2>]>;
	def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>;

	// REINTERPRET_CAST uses the generic SDTUnaryOp profile.
	def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>;

	let Predicates = [HasSVE] in {
	// FFR read/set/write instructions ("rdffr", "rdffrs", "setffr", "wrffr").
	// RDFFRS_PPz is a plain def with no intrinsic argument, whereas RDFFR_PPz is
	// a defm that also takes int_aarch64_sve_rdffr_z.
	defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
	def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
	defm RDFFR_P : sve_int_rdffr_unpred<"rdffr", int_aarch64_sve_rdffr>;
	def SETFFR : sve_int_setffr<"setffr", int_aarch64_sve_setffr>;
	def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>;

	// ZZZ-form integer add/sub and their saturating variants. The last argument
	// is null_frag for add/sub and the matching *_x intrinsic for the others.
	defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add, null_frag>;
	defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub, null_frag>;
	defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>;
	defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>;
	defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>;
	defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>;

	// ZZZ-form bitwise logic. BIC is given null_frag rather than a pattern
	// operator.
	defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and", and>;
	defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>;
	defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>;
	defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", null_frag>;

	// ZPmZ-form integer add/sub/subr. Each names its ZPZZ counterpart as a
	// string. SUB and SUBR additionally name each other as the reversed-operand
	// instruction; the trailing 1 on SUBR is the isReverseInstr argument,
	// labelled with a block comment.
	defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", int_aarch64_sve_add, DestructiveBinaryComm>;
	defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">;
	defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>;

	// ZPZZ add built from the AArch64add_p node.
	defm ADD_ZPZZ : sve_int_bin_pred_bhsd<AArch64add_p>;

	// The enclosed definitions replace the surrounding Predicates list with
	// [HasSVE, UseExperimentalZeroingPseudos]. ADD_ZPZZ is used as a defm
	// prefix both here and above, with different multiclasses.
	let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
	defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_add>;
	defm SUB_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_sub>;
	defm SUBR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_subr>;
	}

	// ZPmZ-form bitwise logic, each tied to its SVE intrinsic.
	defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>;
	defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>;
	defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>;
	defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>;

	// ZI-form (vector with immediate) add/sub and saturating variants.
	// SUBR_ZI uses a dedicated multiclass and is given the `sub` operator.
	defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add, null_frag>;
	defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub, null_frag>;
	defm SUBR_ZI : sve_int_arith_imm0_subr<0b011, "subr", sub>;
	defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>;
	defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>;
	defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>;
	defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>;

	// ZPmZZ-form integer multiply-add/subtract ("mad", "msb", "mla", "mls"),
	// each tied to its SVE intrinsic.
	defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>;
	defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>;
	defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla>;
	defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls>;

	// SVE predicated integer reductions.
	// SADDV/UADDV are tied to intrinsics (UADDV is given both the uaddv and
	// saddv intrinsics); the remaining reductions use the *_p SDNodes.
	defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", int_aarch64_sve_saddv>;
	defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv", int_aarch64_sve_uaddv, int_aarch64_sve_saddv>;
	defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_p>;
	defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_p>;
	defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_p>;
	defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_p>;
	defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_p>;
	defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_p>;
	defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_p>;

	// ZI-form bitwise logic. Each takes two mnemonic strings (for example
	// "orr" and "orn") plus the pattern operator.
	defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn", or>;
	defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>;
	defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>;

	// ZI-form min/max, built from the _m1 SDNodes. The unsigned forms use a
	// separate multiclass.
	defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_m1>;
	defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_m1>;
	defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", AArch64umax_m1>;
	defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", AArch64umin_m1>;

	// Integer multiply: the ZI form uses the generic `mul` operator, the ZPmZ
	// forms ("mul", "smulh", "umulh") are tied to intrinsics.
	defm MUL_ZI : sve_int_arith_imm2<"mul", mul>;
	defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>;
	defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh", int_aarch64_sve_smulh>;
	defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", int_aarch64_sve_umulh>;

	// Add unpredicated alternative for the mul instruction.
	// One pattern per element type: a generic `mul` is selected to the
	// predicated MUL_ZPmZ_<T>, governed by (PTRUE_<T> 31).
	// NOTE(review): 31 is presumably the "all elements" predicate pattern -
	// confirm against the PTRUE definition.
	def : Pat<(mul nxv16i8:$Op1, nxv16i8:$Op2),
	(MUL_ZPmZ_B (PTRUE_B 31), $Op1, $Op2)>;
	def : Pat<(mul nxv8i16:$Op1, nxv8i16:$Op2),
	(MUL_ZPmZ_H (PTRUE_H 31), $Op1, $Op2)>;
	def : Pat<(mul nxv4i32:$Op1, nxv4i32:$Op2),
	(MUL_ZPmZ_S (PTRUE_S 31), $Op1, $Op2)>;
	def : Pat<(mul nxv2i64:$Op1, nxv2i64:$Op2),
	(MUL_ZPmZ_D (PTRUE_D 31), $Op1, $Op2)>;

	// ZPmZ-form integer divide and reversed divide. Each instruction names its
	// ZPZZ counterpart and its reversed-operand partner as strings. The
	// trailing 1 on the *DIVR forms is the isReverseInstr argument, labelled
	// with a block comment.
	defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", "SDIV_ZPZZ", int_aarch64_sve_sdiv, DestructiveBinaryCommWithRev, "SDIVR_ZPmZ">;
	defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", "UDIV_ZPZZ", int_aarch64_sve_udiv, DestructiveBinaryCommWithRev, "UDIVR_ZPmZ">;
	defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", "SDIVR_ZPZZ", int_aarch64_sve_sdivr, DestructiveBinaryCommWithRev, "SDIV_ZPmZ", /*isReverseInstr*/ 1>;
	defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr", "UDIVR_ZPZZ", int_aarch64_sve_udivr, DestructiveBinaryCommWithRev, "UDIV_ZPmZ", /*isReverseInstr*/ 1>;

	// ZPZZ divides built from the AArch64sdiv_p / AArch64udiv_p nodes.
	defm SDIV_ZPZZ : sve_int_bin_pred_sd<AArch64sdiv_p>;
	defm UDIV_ZPZZ : sve_int_bin_pred_sd<AArch64udiv_p>;

	// Integer dot product, vector form.
	defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>;
	defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>;

	// Integer dot product by indexed element (tied to the *_lane intrinsics).
	defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>;
	defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>;

	// ZPmZ-form unary integer operations: sign/zero extends, abs and neg.
	// The extend forms use the _h, _w and _d multiclass variants.
	defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb", int_aarch64_sve_sxtb>;
	defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb", int_aarch64_sve_uxtb>;
	defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth", int_aarch64_sve_sxth>;
	defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth", int_aarch64_sve_uxth>;
	defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw", int_aarch64_sve_sxtw>;
	defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw", int_aarch64_sve_uxtw>;
	defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", int_aarch64_sve_abs>;
	defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", int_aarch64_sve_neg>;

	// ZPmZ-form bit counting operations.
	defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", int_aarch64_sve_cls>;
	defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", int_aarch64_sve_clz>;
	defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", int_aarch64_sve_cnt>;

	// Extra pattern for int_aarch64_sve_cnt with an nxv8bf16 source, selected
	// to CNT_ZPmZ_H. Only enabled when both HasSVE and HasBF16 hold.
	let Predicates = [HasSVE, HasBF16] in {
	def : SVE_3_Op_Pat<nxv8i16, int_aarch64_sve_cnt, nxv8i16, nxv8i1, nxv8bf16, !cast<Instruction>(CNT_ZPmZ_H)>;
	}

	// Remaining ZPmZ-form unary operations: cnot and not, plus fabs and fneg,
	// which use the _fp multiclass variant.
	defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", int_aarch64_sve_cnot>;
	defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", int_aarch64_sve_not>;
	defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", int_aarch64_sve_fabs>;
	defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", int_aarch64_sve_fneg>;

	// ZPmZ-form integer min/max and absolute difference. Min/max use the _m1
	// SDNodes; sabd/uabd are tied to intrinsics.
	defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", AArch64smax_m1>;
	defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", AArch64umax_m1>;
	defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", AArch64smin_m1>;
	defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", AArch64umin_m1>;
	defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", int_aarch64_sve_sabd>;
	defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", int_aarch64_sve_uabd>;

	// ZZ-form "frecpe" and "frsqrte", tied to the *_x intrinsics.
	defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", int_aarch64_sve_frecpe_x>;
	defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", int_aarch64_sve_frsqrte_x>;

	// Predicated floating-point operations with an immediate operand
	// (sve_fp_2op_i_p_zds). The last argument names the immediate operand
	// class: fadd/fsub/fsubr use sve_fpimm_half_one, fmul uses
	// sve_fpimm_half_two, and the four min/max forms use sve_fpimm_zero_one.
	// No selection pattern operator is passed on these lines.
	defm FADD_ZPmI : sve_fp_2op_i_p_zds<0b000, "fadd", sve_fpimm_half_one>;
	defm FSUB_ZPmI : sve_fp_2op_i_p_zds<0b001, "fsub", sve_fpimm_half_one>;
	defm FMUL_ZPmI : sve_fp_2op_i_p_zds<0b010, "fmul", sve_fpimm_half_two>;
	defm FSUBR_ZPmI : sve_fp_2op_i_p_zds<0b011, "fsubr", sve_fpimm_half_one>;
	defm FMAXNM_ZPmI : sve_fp_2op_i_p_zds<0b100, "fmaxnm", sve_fpimm_zero_one>;
	defm FMINNM_ZPmI : sve_fp_2op_i_p_zds<0b101, "fminnm", sve_fpimm_zero_one>;
	defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>;
	defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>;

	// Predicated, destructive floating-point binary operations
	// (sve_fp_2op_p_zds). Arguments per line, in order: a 4-bit literal, the
	// instruction name string, the name of the corresponding *_ZPZZ definition,
	// the intrinsic to select from, and the destructive-operand kind.
	//
	// Non-commutative pairs (fsub/fsubr, fdiv/fdivr) use
	// DestructiveBinaryCommWithRev and additionally name their reversed-operand
	// counterpart; the reversed member of each pair (fsubr, fdivr) also passes
	// 1 for the isReverseInstr argument.
	//
	// fscale uses its own multiclass (sve_fp_2op_p_zds_fscale) and passes no
	// *_ZPZZ name or destructive-operand kind.
	defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", int_aarch64_sve_fadd, DestructiveBinaryComm>;
	defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", int_aarch64_sve_fsub, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">;
	defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", int_aarch64_sve_fmul, DestructiveBinaryComm>;
	defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", int_aarch64_sve_fsubr, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>;
	defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", "FMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>;
	defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", "FMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>;
	defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", "FMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>;
	defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", "FMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>;
	defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", "FABD_ZPZZ", int_aarch64_sve_fabd, DestructiveBinaryComm>;
	defm FSCALE_ZPmZ : sve_fp_2op_p_zds_fscale<0b1001, "fscale", int_aarch64_sve_fscale>;
	defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", "FMULX_ZPZZ", int_aarch64_sve_fmulx, DestructiveBinaryComm>;
	defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", "FDIVR_ZPZZ", int_aarch64_sve_fdivr, DestructiveBinaryCommWithRev, "FDIV_ZPmZ", /*isReverseInstr*/ 1>;
	defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", "FDIV_ZPZZ", int_aarch64_sve_fdiv, DestructiveBinaryCommWithRev, "FDIVR_ZPmZ">;

	// *_ZPZZ definitions whose names are referenced (as strings) by the
	// corresponding *_ZPmZ defms in this file.
	//
	// FADD_ZPZZ is first instantiated unconditionally from sve_fp_bin_pred_hfd
	// with the AArch64fadd_p node.
	// NOTE(review): FADD_ZPZZ is used as a defm prefix a second time in the
	// block below; this relies on the two multiclasses emitting records with
	// different name suffixes - confirm against the multiclass definitions.
	defm FADD_ZPZZ : sve_fp_bin_pred_hfd<AArch64fadd_p>;

	// Zeroing variants (sve_fp_2op_p_zds_zeroing_hsd), one per intrinsic. These
	// are only available when both HasSVE and UseExperimentalZeroingPseudos
	// hold. fscale has no entry here.
	let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
	defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fadd>;
	defm FSUB_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsub>;
	defm FMUL_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmul>;
	defm FSUBR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsubr>;
	defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmaxnm>;
	defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fminnm>;
	defm FMAX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmax>;
	defm FMIN_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmin>;
	defm FABD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fabd>;
	defm FMULX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmulx>;
	defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdivr>;
	defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdiv>;
	}

	// Three-register floating-point operations (sve_fp_3op_u_zd).
	// fadd/fsub/fmul select from the generic fadd/fsub/fmul operators, whereas
	// ftsmul, frecps and frsqrts select from their *_x intrinsics. ftsmul uses
	// a dedicated multiclass (sve_fp_3op_u_zd_ftsmul).
	defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>;
	defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub>;
	defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", fmul>;
	defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>;
	defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", int_aarch64_sve_frecps_x>;
	defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", int_aarch64_sve_frsqrts_x>;

	// ftssel, selected from int_aarch64_sve_ftssel_x; no opcode literal is
	// passed to this multiclass.
	defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel", int_aarch64_sve_ftssel_x>;

	// Complex-number add and multiply-accumulate; each multiclass takes only
	// the name string and the intrinsic.
	defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>;
	defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>;

	// Predicated fused multiply-add family, "_a" multiclass:
	// fmla / fmls / fnmla / fnmls, distinguished by a 2-bit literal.
	defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla", int_aarch64_sve_fmla>;
	defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls", int_aarch64_sve_fmls>;
	defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla", int_aarch64_sve_fnmla>;
	defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls", int_aarch64_sve_fnmls>;

	// Predicated fused multiply-add family, "_b" multiclass:
	// fmad / fmsb / fnmad / fnmsb, using the same 2-bit literals as above.
	// NOTE(review): presumably _a and _b differ in which operand is tied to
	// the destination - confirm against the multiclass definitions.
	defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad", int_aarch64_sve_fmad>;
	defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb", int_aarch64_sve_fmsb>;
	defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad>;
	defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb>;

	// Add patterns for FMA where disabled lanes are undef.
	// AArch64fma_p (P, Op1, Op2, Op3) is selected to FMLA_ZPmZZ_{H,S,D} for the
	// nxv8f16 / nxv4f32 / nxv2f64 types. Note the operand reordering: Op3 is
	// passed first after the predicate, followed by Op1 and Op2.
	// FIXME: Implement a pseudo so we can choose a better instruction after
	// regalloc.
	def : Pat<(nxv8f16 (AArch64fma_p nxv8i1:$P, nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3)),
	(FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
	def : Pat<(nxv4f32 (AArch64fma_p nxv4i1:$P, nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3)),
	(FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>;
	def : Pat<(nxv2f64 (AArch64fma_p nxv2i1:$P, nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3)),
	(FMLA_ZPmZZ_D $P, $Op3, $Op1, $Op2)>;

	// ftmad, selected from int_aarch64_sve_ftmad_x.
	defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>;

	// Indexed-element ("_lane" intrinsic) floating-point multiply-accumulate:
	// fmla and fmls share one multiclass and differ by a one-bit literal.
	defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla", int_aarch64_sve_fmla_lane>;
	defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls", int_aarch64_sve_fmls_lane>;

	// Indexed-element complex multiply-accumulate and plain multiply.
	defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla", int_aarch64_sve_fcmla_lane>;
	defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>;

	// SVE floating point reductions.
	// fadda uses its own multiclass (sve_fp_2op_p_vd); the remaining reductions
	// share sve_fp_fast_red. All select from AArch64*_p nodes rather than
	// directly from intrinsics. fadda and faddv both pass the literal 0b000,
	// but to different multiclasses.
	defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>;
	defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", AArch64faddv_p>;
	defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", AArch64fmaxnmv_p>;
	defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", AArch64fminnmv_p>;
	defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", AArch64fmaxv_p>;
	defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", AArch64fminv_p>;

	// Use more efficient NEON instructions to extract elements within the NEON
	// part (first 128bits) of an SVE register.
	// Only element index 0 is matched here. The result is produced purely with
	// two nested EXTRACT_SUBREGs: first zsub (viewing the Z register as a
	// v8f16 / v4f32 / v2f64 value), then hsub / ssub / dsub to obtain the
	// f16 / f32 / f64 scalar.
	def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)),
	(f16 (EXTRACT_SUBREG (v8f16 (EXTRACT_SUBREG ZPR:$Zs, zsub)), hsub))>;
	def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)),
	(f32 (EXTRACT_SUBREG (v4f32 (EXTRACT_SUBREG ZPR:$Zs, zsub)), ssub))>;
	def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
	(f64 (EXTRACT_SUBREG (v2f64 (EXTRACT_SUBREG ZPR:$Zs, zsub)), dsub))>;

	// Splat immediate (unpredicated)
	// dup (integer immediate), fdup (FP immediate) and dupm (mask immediate).
	// These multiclasses take only the name string; selection patterns for
	// DUP_ZI / FDUP_ZI are written out separately as explicit Pat records in
	// this file.
	defm DUP_ZI : sve_int_dup_imm<"dup">;
	defm FDUP_ZI : sve_int_dup_fpimm<"fdup">;
	defm DUPM_ZI : sve_int_dup_mask_imm<"dupm">;

	// Splat immediate (predicated)
	// cpy has separate merging (ZPmI) and zeroing (ZPzI) definitions; fcpy has
	// a single ZPmI definition.
	defm CPY_ZPmI : sve_int_dup_imm_pred_merge<"cpy">;
	defm CPY_ZPzI : sve_int_dup_imm_pred_zero<"cpy">;
	defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">;

	// Splat scalar register (unpredicated, GPR or vector + element index)
	// Only the register form (DUP_ZR) is given a selection node, AArch64dup.
	defm DUP_ZR : sve_int_perm_dup_r<"dup", AArch64dup>;
	defm DUP_ZZI : sve_int_perm_dup_i<"dup">;

	// Splat scalar register (predicated)
	// Both forms select from AArch64dup_mt.
	// NOTE(review): the R / V suffixes presumably denote a general-purpose vs.
	// SIMD&FP scalar source - confirm against the multiclass definitions.
	defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy", AArch64dup_mt>;
	defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>;

	// bf16 variant of the predicated scalar splat: AArch64dup_mt on nxv8bf16 is
	// selected to CPY_ZPmV_H. Note the operand reordering - the node takes
	// (pg, splat, passthru) while the instruction takes (passthru, pg, splat).
	// Only enabled with HasSVE + HasBF16.
	let Predicates = [HasSVE, HasBF16] in {
	def : Pat<(nxv8bf16 (AArch64dup_mt nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)),
	(CPY_ZPmV_H $passthru, $pg, $splat)>;
	}

	// Duplicate FP scalar into all vector elements
	// The scalar FPR is first placed into an otherwise undefined register with
	// INSERT_SUBREG (IMPLICIT_DEF), using hsub / ssub / dsub, and then element 0
	// of that register is broadcast with DUP_ZZI_{H,S,D}.
	// The nxv4f16 / nxv2f16 result types use the same DUP_ZZI_H instruction as
	// nxv8f16, and nxv2f32 uses the same DUP_ZZI_S instruction as nxv4f32.
	def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))),
	(DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
	def : Pat<(nxv4f16 (AArch64dup (f16 FPR16:$src))),
	(DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
	def : Pat<(nxv2f16 (AArch64dup (f16 FPR16:$src))),
	(DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
	def : Pat<(nxv4f32 (AArch64dup (f32 FPR32:$src))),
	(DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
	def : Pat<(nxv2f32 (AArch64dup (f32 FPR32:$src))),
	(DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
	def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))),
	(DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>;
	// bf16 scalars live in FPR16 and reuse the halfword form; only enabled
	// with HasSVE + HasBF16.
	let Predicates = [HasSVE, HasBF16] in {
	def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))),
	(DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
	}

	// Duplicate +0.0 into all vector elements
	// A splat of fpimm0 is selected to the integer immediate splat DUP_ZI_* with
	// both operands set to 0.
	// NOTE(review): the two operands are presumably the 8-bit immediate and its
	// shift - confirm against sve_int_dup_imm.
	def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
	def : Pat<(nxv4f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
	def : Pat<(nxv2f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
	def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
	def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
	def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>;
	// bf16 zero splat reuses the halfword form; only enabled with
	// HasSVE + HasBF16.
	let Predicates = [HasSVE, HasBF16] in {
	def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
	}

	// Duplicate Int immediate into all vector elements
	// The scalar being splatted must match SVE8BitLslImm, which yields two i32
	// operands ($a, $b) that are passed straight through to DUP_ZI_{B,H,S,D}.
	// For the nxv2i64 case the splatted scalar is typed i64, but the two
	// operands produced by SVE8BitLslImm are still i32.
	def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
	(DUP_ZI_B $a, $b)>;
	def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
	(DUP_ZI_H $a, $b)>;
	def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
	(DUP_ZI_S $a, $b)>;
	def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm i32:$a, i32:$b)))),
	(DUP_ZI_D $a, $b)>;

	// Duplicate FP immediate into all vector elements
	// A splat of an fpimm16 / fpimm32 / fpimm64 operand is selected to
	// FDUP_ZI_{H,S,D}. The nxv4f16 / nxv2f16 types reuse FDUP_ZI_H and nxv2f32
	// reuses FDUP_ZI_S.
	// These patterns are given AddedComplexity = 2.
	// NOTE(review): presumably so they are preferred over the FP-scalar
	// register splat patterns for the same node - confirm.
	let AddedComplexity = 2 in {
	def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)),
	(FDUP_ZI_H fpimm16:$imm8)>;
	def : Pat<(nxv4f16 (AArch64dup fpimm16:$imm8)),
	(FDUP_ZI_H fpimm16:$imm8)>;
	def : Pat<(nxv2f16 (AArch64dup fpimm16:$imm8)),
	(FDUP_ZI_H fpimm16:$imm8)>;
	def : Pat<(nxv4f32 (AArch64dup fpimm32:$imm8)),
	(FDUP_ZI_S fpimm32:$imm8)>;
	def : Pat<(nxv2f32 (AArch64dup fpimm32:$imm8)),
	(FDUP_ZI_S fpimm32:$imm8)>;
	def : Pat<(nxv2f64 (AArch64dup fpimm64:$imm8)),
	(FDUP_ZI_D fpimm64:$imm8)>;
	}

	// Select elements from either vector (predicated)
	// sel is selected from the generic vselect operator.
	defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>;

	// splice, selected from int_aarch64_sve_splice.
	defm SPLICE_ZPZ : sve_int_perm_splice<"splice", int_aarch64_sve_splice>;

	// bf16 variants of the two definitions above, reusing the halfword (_H)
	// instructions; only enabled with HasSVE + HasBF16.
	let Predicates = [HasSVE, HasBF16] in {
	def : SVE_3_Op_Pat<nxv8bf16, vselect, nxv8i1, nxv8bf16, nxv8bf16, SEL_ZPZZ_H>;
	def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_splice, nxv8i1, nxv8bf16, nxv8bf16, SPLICE_ZPZ_H>;
	}

	// Permute-style operations: compact (from its intrinsic), insr in two forms
	// (ZR and ZV, both selected from AArch64insr) and ext (from AArch64ext).
	// NOTE(review): the R / V suffixes presumably denote a general-purpose vs.
	// SIMD&FP scalar source - confirm against the multiclass definitions.
	defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>;
	defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>;
	defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>;
	defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>;

	// bf16 variant of insr (vector operand nxv8bf16, scalar operand bf16),
	// reusing INSR_ZV_H; only enabled with HasSVE + HasBF16.
	let Predicates = [HasSVE, HasBF16] in {
	def : SVE_2_Op_Pat<nxv8bf16, AArch64insr, nxv8bf16, bf16, INSR_ZV_H>;
	}

	// Predicated bit/byte/halfword/word reversal within elements. revb is the
	// only one given a second selection operator (the generic bswap) in
	// addition to its intrinsic.
	defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", int_aarch64_sve_rbit>;
	defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", int_aarch64_sve_revb, bswap>;
	defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>;
	defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", int_aarch64_sve_revw>;

	// rev for predicate registers (REV_PP) and vector registers (REV_ZZ); both
	// select from AArch64rev.
	defm REV_PP : sve_int_perm_reverse_p<"rev", AArch64rev>;
	defm REV_ZZ : sve_int_perm_reverse_z<"rev", AArch64rev>;

	// bf16 variant of the vector rev, reusing REV_ZZ_H; only enabled with
	// HasSVE + HasBF16.
	let Predicates = [HasSVE, HasBF16] in {
	def : SVE_1_Op_Pat<nxv8bf16, AArch64rev, nxv8bf16, REV_ZZ_H>;
	}

	// Vector unpack: signed/unsigned, low/high half, distinguished by a 2-bit
	// literal and selected from the AArch64{s,u}unpk{lo,hi} nodes.
	defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>;
	defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>;
	defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo", AArch64uunpklo>;
	defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi", AArch64uunpkhi>;

	// Predicate unpack: low/high half, distinguished by a one-bit literal and
	// selected directly from the punpklo / punpkhi intrinsics.
	defm PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo", int_aarch64_sve_punpklo>;
	defm PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi", int_aarch64_sve_punpkhi>;

	// movprfx in three forms: predicated zeroing (ZPzZ), predicated merging
	// (ZPmZ), and a form without a predicate (ZZ) defined as a single record
	// with `def` over ZPRAny. No selection patterns are passed for any of them.
	defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">;
	defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">;
	def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>;
	// fexpa, selected from int_aarch64_sve_fexpa_x.
	defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>;

	// Predicate "break" instructions.
	// In every group below, the variants whose name ends in "s" (brkpas,
	// brkpbs, brkns, brkas, brkbs) are given null_frag, i.e. they are defined
	// without a selection pattern; the other variants select from the
	// corresponding intrinsic.

	// brkpa / brkpb and their "s" variants.
	defm BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa", int_aarch64_sve_brkpa_z>;
	defm BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas", null_frag>;
	defm BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb", int_aarch64_sve_brkpb_z>;
	defm BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs", null_frag>;

	// brkn and brkns.
	defm BRKN_PPzP : sve_int_brkn<0b0, "brkn", int_aarch64_sve_brkn_z>;
	defm BRKNS_PPzP : sve_int_brkn<0b1, "brkns", null_frag>;

	// brka / brkb: zeroing (PPzP, *_z intrinsic) and merging (PPmP) forms use
	// different multiclasses; the "s" variants exist only in the zeroing form.
	defm BRKA_PPzP : sve_int_break_z<0b000, "brka", int_aarch64_sve_brka_z>;
	defm BRKA_PPmP : sve_int_break_m<0b001, "brka", int_aarch64_sve_brka>;
	defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas", null_frag>;
	defm BRKB_PPzP : sve_int_break_z<0b100, "brkb", int_aarch64_sve_brkb_z>;
	defm BRKB_PPmP : sve_int_break_m<0b101, "brkb", int_aarch64_sve_brkb>;
	defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>;

	// Predicate test / initialisation / iteration instructions.
	// ptest and pfalse are single records (`def`) with no selection pattern
	// argument; pfirst and pnext are multiclass instantiations selected from
	// their intrinsics.
	def PTEST_PP : sve_int_ptest<0b010000, "ptest">;
	def PFALSE : sve_int_pfalse<0b000000, "pfalse">;
	defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>;
	defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>;

	// Predicate logical operations (sve_int_pred_log), keyed by a 4-bit literal.
	//  - and / eor / orr pass a second, generic operator (and / xor / or) in
	//    addition to their *_z intrinsic.
	//  - sel is selected from the generic vselect operator only.
	//  - All variants whose name ends in "s" (ands, bics, eors, orrs, orns,
	//    nors, nands) are given null_frag, i.e. no selection pattern.
	//  - The literal 0b0111 is not used by any definition in this list.
	defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z, and>;
	defm BIC_PPzPP : sve_int_pred_log<0b0001, "bic", int_aarch64_sve_bic_z>;
	defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z, xor>;
	defm SEL_PPPP : sve_int_pred_log<0b0011, "sel", vselect>;
	defm ANDS_PPzPP : sve_int_pred_log<0b0100, "ands", null_frag>;
	defm BICS_PPzPP : sve_int_pred_log<0b0101, "bics", null_frag>;
	defm EORS_PPzPP : sve_int_pred_log<0b0110, "eors", null_frag>;
	defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z, or>;
	defm ORN_PPzPP : sve_int_pred_log<0b1001, "orn", int_aarch64_sve_orn_z>;
	defm NOR_PPzPP : sve_int_pred_log<0b1010, "nor", int_aarch64_sve_nor_z>;
	defm NAND_PPzPP : sve_int_pred_log<0b1011, "nand", int_aarch64_sve_nand_z>;
	defm ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs", null_frag>;
	defm ORNS_PPzPP : sve_int_pred_log<0b1101, "orns", null_frag>;
	defm NORS_PPzPP : sve_int_pred_log<0b1110, "nors", null_frag>;
	defm NANDS_PPzPP : sve_int_pred_log<0b1111, "nands", null_frag>;

	// clasta / clastb in three forms each; the first argument (0 or 1) selects
	// between the "a" and "b" variant.
	//  - RPZ and VPZ forms select from the AArch64clast{a,b}_n nodes.
	//  - ZPZ forms select directly from the clasta / clastb intrinsics.
	// NOTE(review): RPZ / VPZ / ZPZ presumably denote a general-purpose scalar,
	// SIMD&FP scalar and vector destination respectively - confirm.
	defm CLASTA_RPZ : sve_int_perm_clast_rz<0, "clasta", AArch64clasta_n>;
	defm CLASTB_RPZ : sve_int_perm_clast_rz<1, "clastb", AArch64clastb_n>;
	defm CLASTA_VPZ : sve_int_perm_clast_vz<0, "clasta", AArch64clasta_n>;
	defm CLASTB_VPZ : sve_int_perm_clast_vz<1, "clastb", AArch64clastb_n>;
	defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta", int_aarch64_sve_clasta>;
	defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb", int_aarch64_sve_clastb>;

	// bf16 variants, reusing the halfword (_H) instructions: the VPZ forms
	// produce a bf16 scalar, the ZPZ forms an nxv8bf16 vector. Only enabled
	// with HasSVE + HasBF16.
	let Predicates = [HasSVE, HasBF16] in {
	def : SVE_3_Op_Pat<bf16, AArch64clasta_n, nxv8i1, bf16, nxv8bf16, CLASTA_VPZ_H>;
	def : SVE_3_Op_Pat<bf16, AArch64clastb_n, nxv8i1, bf16, nxv8bf16, CLASTB_VPZ_H>;
	def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clasta, nxv8i1, nxv8bf16, nxv8bf16, CLASTA_ZPZ_H>;
	def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clastb, nxv8i1, nxv8bf16, nxv8bf16, CLASTB_ZPZ_H>;
	}

	// lasta / lastb in RPZ and VPZ forms; the first argument (0 or 1) selects
	// between the "a" and "b" variant. All four select from the AArch64lasta /
	// AArch64lastb nodes.
	defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta", AArch64lasta>;
	defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb", AArch64lastb>;
	defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta", AArch64lasta>;
	defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb", AArch64lastb>;

	// bf16 variants producing a bf16 scalar from an nxv8bf16 vector, reusing
	// the halfword VPZ instructions; only enabled with HasSVE + HasBF16.
	let Predicates = [HasSVE, HasBF16] in {
	def : SVE_2_Op_Pat<bf16, AArch64lasta, nxv8i1, nxv8bf16, LASTA_VPZ_H>;
	def : SVE_2_Op_Pat<bf16, AArch64lastb, nxv8i1, nxv8bf16, LASTB_VPZ_H>;
	}

	// contiguous load with reg+immediate
	// Arguments per line: 4-bit literal (all 16 values 0b0000..0b1111 are used
	// exactly once), name string, Z_* register-list operand and ZPR* operand for
	// the destination element size.
	// The suffix after the mnemonic (_H/_S/_D) matches the Z_h/Z_s/Z_d operand;
	// definitions without a suffix use the operand matching the memory element
	// size (ld1b -> Z_b, ld1h -> Z_h, ld1w -> Z_s, ld1d -> Z_d).
	// Note that the sign-extending forms are listed in descending destination
	// size (e.g. LD1SB_D, LD1SB_S, LD1SB_H), unlike the unsigned forms.
	defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>;
	defm LD1B_H_IMM : sve_mem_cld_si<0b0001, "ld1b", Z_h, ZPR16>;
	defm LD1B_S_IMM : sve_mem_cld_si<0b0010, "ld1b", Z_s, ZPR32>;
	defm LD1B_D_IMM : sve_mem_cld_si<0b0011, "ld1b", Z_d, ZPR64>;
	defm LD1SW_D_IMM : sve_mem_cld_si<0b0100, "ld1sw", Z_d, ZPR64>;
	defm LD1H_IMM : sve_mem_cld_si<0b0101, "ld1h", Z_h, ZPR16>;
	defm LD1H_S_IMM : sve_mem_cld_si<0b0110, "ld1h", Z_s, ZPR32>;
	defm LD1H_D_IMM : sve_mem_cld_si<0b0111, "ld1h", Z_d, ZPR64>;
	defm LD1SH_D_IMM : sve_mem_cld_si<0b1000, "ld1sh", Z_d, ZPR64>;
	defm LD1SH_S_IMM : sve_mem_cld_si<0b1001, "ld1sh", Z_s, ZPR32>;
	defm LD1W_IMM : sve_mem_cld_si<0b1010, "ld1w", Z_s, ZPR32>;
	defm LD1W_D_IMM : sve_mem_cld_si<0b1011, "ld1w", Z_d, ZPR64>;
	defm LD1SB_D_IMM : sve_mem_cld_si<0b1100, "ld1sb", Z_d, ZPR64>;
	defm LD1SB_S_IMM : sve_mem_cld_si<0b1101, "ld1sb", Z_s, ZPR32>;
	defm LD1SB_H_IMM : sve_mem_cld_si<0b1110, "ld1sb", Z_h, ZPR16>;
	defm LD1D_IMM : sve_mem_cld_si<0b1111, "ld1d", Z_d, ZPR64>;

	// LD1R loads (splat scalar to vector)
	// Arguments per line: two 2-bit literals (together covering all 16
	// combinations exactly once), name string, Z_* register-list operand, ZPR*
	// operand, and the immediate operand class.
	// The immediate class follows the memory element size named in the
	// mnemonic: byte forms use uimm6s1, halfword forms uimm6s2, word forms
	// uimm6s4 and the doubleword form uimm6s8.
	// NOTE(review): the "sN" suffix presumably means the 6-bit unsigned
	// immediate is scaled by N bytes - confirm against the operand definitions.
	defm LD1RB_IMM : sve_mem_ld_dup<0b00, 0b00, "ld1rb", Z_b, ZPR8, uimm6s1>;
	defm LD1RB_H_IMM : sve_mem_ld_dup<0b00, 0b01, "ld1rb", Z_h, ZPR16, uimm6s1>;
	defm LD1RB_S_IMM : sve_mem_ld_dup<0b00, 0b10, "ld1rb", Z_s, ZPR32, uimm6s1>;
	defm LD1RB_D_IMM : sve_mem_ld_dup<0b00, 0b11, "ld1rb", Z_d, ZPR64, uimm6s1>;
	defm LD1RSW_IMM : sve_mem_ld_dup<0b01, 0b00, "ld1rsw", Z_d, ZPR64, uimm6s4>;
	defm LD1RH_IMM : sve_mem_ld_dup<0b01, 0b01, "ld1rh", Z_h, ZPR16, uimm6s2>;
	defm LD1RH_S_IMM : sve_mem_ld_dup<0b01, 0b10, "ld1rh", Z_s, ZPR32, uimm6s2>;
	defm LD1RH_D_IMM : sve_mem_ld_dup<0b01, 0b11, "ld1rh", Z_d, ZPR64, uimm6s2>;
	defm LD1RSH_D_IMM : sve_mem_ld_dup<0b10, 0b00, "ld1rsh", Z_d, ZPR64, uimm6s2>;
	defm LD1RSH_S_IMM : sve_mem_ld_dup<0b10, 0b01, "ld1rsh", Z_s, ZPR32, uimm6s2>;
	defm LD1RW_IMM : sve_mem_ld_dup<0b10, 0b10, "ld1rw", Z_s, ZPR32, uimm6s4>;
	defm LD1RW_D_IMM : sve_mem_ld_dup<0b10, 0b11, "ld1rw", Z_d, ZPR64, uimm6s4>;
	defm LD1RSB_D_IMM : sve_mem_ld_dup<0b11, 0b00, "ld1rsb", Z_d, ZPR64, uimm6s1>;
	defm LD1RSB_S_IMM : sve_mem_ld_dup<0b11, 0b01, "ld1rsb", Z_s, ZPR32, uimm6s1>;
	defm LD1RSB_H_IMM : sve_mem_ld_dup<0b11, 0b10, "ld1rsb", Z_h, ZPR16, uimm6s1>;
	defm LD1RD_IMM : sve_mem_ld_dup<0b11, 0b11, "ld1rd", Z_d, ZPR64, uimm6s8>;

	// LD1RQ loads (load quadword-vector and splat to scalable vector)
	// One reg+immediate (_IMM, sve_mem_ldqr_si) and one reg+reg
	// (sve_mem_ldqr_ss) definition per element size, keyed by a 2-bit literal.
	// The reg+reg forms additionally name the offset register operand,
	// GPR64NoXZRshifted{8,16,32,64}, matching the element size.
	defm LD1RQ_B_IMM : sve_mem_ldqr_si<0b00, "ld1rqb", Z_b, ZPR8>;
	defm LD1RQ_H_IMM : sve_mem_ldqr_si<0b01, "ld1rqh", Z_h, ZPR16>;
	defm LD1RQ_W_IMM : sve_mem_ldqr_si<0b10, "ld1rqw", Z_s, ZPR32>;
	defm LD1RQ_D_IMM : sve_mem_ldqr_si<0b11, "ld1rqd", Z_d, ZPR64>;
	defm LD1RQ_B : sve_mem_ldqr_ss<0b00, "ld1rqb", Z_b, ZPR8, GPR64NoXZRshifted8>;
	defm LD1RQ_H : sve_mem_ldqr_ss<0b01, "ld1rqh", Z_h, ZPR16, GPR64NoXZRshifted16>;
	defm LD1RQ_W : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
	defm LD1RQ_D : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>;

	// contiguous load with reg+reg addressing.
	// Same sixteen loads and 4-bit literals as the reg+immediate (_IMM)
	// definitions in this file, with one extra argument naming the offset
	// register operand. That operand follows the memory element size in the
	// mnemonic, not the destination element size: byte loads use
	// GPR64NoXZRshifted8, halfword loads ...shifted16, word loads ...shifted32
	// and ld1d ...shifted64.
	defm LD1B : sve_mem_cld_ss<0b0000, "ld1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
	defm LD1B_H : sve_mem_cld_ss<0b0001, "ld1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
	defm LD1B_S : sve_mem_cld_ss<0b0010, "ld1b", Z_s, ZPR32, GPR64NoXZRshifted8>;
	defm LD1B_D : sve_mem_cld_ss<0b0011, "ld1b", Z_d, ZPR64, GPR64NoXZRshifted8>;
	defm LD1SW_D : sve_mem_cld_ss<0b0100, "ld1sw", Z_d, ZPR64, GPR64NoXZRshifted32>;
	defm LD1H : sve_mem_cld_ss<0b0101, "ld1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
	defm LD1H_S : sve_mem_cld_ss<0b0110, "ld1h", Z_s, ZPR32, GPR64NoXZRshifted16>;
	defm LD1H_D : sve_mem_cld_ss<0b0111, "ld1h", Z_d, ZPR64, GPR64NoXZRshifted16>;
	defm LD1SH_D : sve_mem_cld_ss<0b1000, "ld1sh", Z_d, ZPR64, GPR64NoXZRshifted16>;
	defm LD1SH_S : sve_mem_cld_ss<0b1001, "ld1sh", Z_s, ZPR32, GPR64NoXZRshifted16>;
	defm LD1W : sve_mem_cld_ss<0b1010, "ld1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
	defm LD1W_D : sve_mem_cld_ss<0b1011, "ld1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
	defm LD1SB_D : sve_mem_cld_ss<0b1100, "ld1sb", Z_d, ZPR64, GPR64NoXZRshifted8>;
	defm LD1SB_S : sve_mem_cld_ss<0b1101, "ld1sb", Z_s, ZPR32, GPR64NoXZRshifted8>;
	defm LD1SB_H : sve_mem_cld_ss<0b1110, "ld1sb", Z_h, ZPR16, GPR64NoXZRshifted8>;
	defm LD1D : sve_mem_cld_ss<0b1111, "ld1d", Z_d, ZPR64, GPR64NoXZRshifted64>;

	// non-faulting contiguous load with reg+immediate
	// Mirrors the LD1*_IMM list in this file one-for-one: same 4-bit literals,
	// same Z_* / ZPR* operands, with "ldnf1" mnemonics and the
	// sve_mem_cldnf_si multiclass. Only reg+immediate forms are defined here.
	defm LDNF1B_IMM : sve_mem_cldnf_si<0b0000, "ldnf1b", Z_b, ZPR8>;
	defm LDNF1B_H_IMM : sve_mem_cldnf_si<0b0001, "ldnf1b", Z_h, ZPR16>;
	defm LDNF1B_S_IMM : sve_mem_cldnf_si<0b0010, "ldnf1b", Z_s, ZPR32>;
	defm LDNF1B_D_IMM : sve_mem_cldnf_si<0b0011, "ldnf1b", Z_d, ZPR64>;
	defm LDNF1SW_D_IMM : sve_mem_cldnf_si<0b0100, "ldnf1sw", Z_d, ZPR64>;
	defm LDNF1H_IMM : sve_mem_cldnf_si<0b0101, "ldnf1h", Z_h, ZPR16>;
	defm LDNF1H_S_IMM : sve_mem_cldnf_si<0b0110, "ldnf1h", Z_s, ZPR32>;
	defm LDNF1H_D_IMM : sve_mem_cldnf_si<0b0111, "ldnf1h", Z_d, ZPR64>;
	defm LDNF1SH_D_IMM : sve_mem_cldnf_si<0b1000, "ldnf1sh", Z_d, ZPR64>;
	defm LDNF1SH_S_IMM : sve_mem_cldnf_si<0b1001, "ldnf1sh", Z_s, ZPR32>;
	defm LDNF1W_IMM : sve_mem_cldnf_si<0b1010, "ldnf1w", Z_s, ZPR32>;
	defm LDNF1W_D_IMM : sve_mem_cldnf_si<0b1011, "ldnf1w", Z_d, ZPR64>;
	defm LDNF1SB_D_IMM : sve_mem_cldnf_si<0b1100, "ldnf1sb", Z_d, ZPR64>;
	defm LDNF1SB_S_IMM : sve_mem_cldnf_si<0b1101, "ldnf1sb", Z_s, ZPR32>;
	defm LDNF1SB_H_IMM : sve_mem_cldnf_si<0b1110, "ldnf1sb", Z_h, ZPR16>;
	defm LDNF1D_IMM : sve_mem_cldnf_si<0b1111, "ldnf1d", Z_d, ZPR64>;

	// First-faulting loads with reg+reg addressing.
	// Mirrors the LD1 reg+reg list in this file (same 4-bit literals and
	// Z_* / ZPR* operands) with "ldff1" mnemonics. The offset register operand
	// here is GPR64shifted{8,16,32,64}, whereas the LD1 reg+reg definitions use
	// the GPR64NoXZRshifted* operands.
	// NOTE(review): presumably this means XZR is accepted as the offset
	// register for first-faulting loads - confirm against the operand
	// definitions.
	defm LDFF1B : sve_mem_cldff_ss<0b0000, "ldff1b", Z_b, ZPR8, GPR64shifted8>;
	defm LDFF1B_H : sve_mem_cldff_ss<0b0001, "ldff1b", Z_h, ZPR16, GPR64shifted8>;
	defm LDFF1B_S : sve_mem_cldff_ss<0b0010, "ldff1b", Z_s, ZPR32, GPR64shifted8>;
	defm LDFF1B_D : sve_mem_cldff_ss<0b0011, "ldff1b", Z_d, ZPR64, GPR64shifted8>;
	defm LDFF1SW_D : sve_mem_cldff_ss<0b0100, "ldff1sw", Z_d, ZPR64, GPR64shifted32>;
	defm LDFF1H : sve_mem_cldff_ss<0b0101, "ldff1h", Z_h, ZPR16, GPR64shifted16>;
	defm LDFF1H_S : sve_mem_cldff_ss<0b0110, "ldff1h", Z_s, ZPR32, GPR64shifted16>;
	defm LDFF1H_D : sve_mem_cldff_ss<0b0111, "ldff1h", Z_d, ZPR64, GPR64shifted16>;
	defm LDFF1SH_D : sve_mem_cldff_ss<0b1000, "ldff1sh", Z_d, ZPR64, GPR64shifted16>;
	defm LDFF1SH_S : sve_mem_cldff_ss<0b1001, "ldff1sh", Z_s, ZPR32, GPR64shifted16>;
	defm LDFF1W : sve_mem_cldff_ss<0b1010, "ldff1w", Z_s, ZPR32, GPR64shifted32>;
	defm LDFF1W_D : sve_mem_cldff_ss<0b1011, "ldff1w", Z_d, ZPR64, GPR64shifted32>;
	defm LDFF1SB_D : sve_mem_cldff_ss<0b1100, "ldff1sb", Z_d, ZPR64, GPR64shifted8>;
	defm LDFF1SB_S : sve_mem_cldff_ss<0b1101, "ldff1sb", Z_s, ZPR32, GPR64shifted8>;
	defm LDFF1SB_H : sve_mem_cldff_ss<0b1110, "ldff1sb", Z_h, ZPR16, GPR64shifted8>;
	defm LDFF1D : sve_mem_cldff_ss<0b1111, "ldff1d", Z_d, ZPR64, GPR64shifted64>;

	// LD(2|3|4) structured loads with reg+immediate
	// Arguments per line: 2-bit literal following the element size (00 = b,
	// 01 = h, 10 = w, 11 = d), 2-bit literal following the register count
	// (01 = 2, 10 = 3, 11 = 4), the register-list operand (ZZ_ / ZZZ_ / ZZZZ_
	// for 2 / 3 / 4 registers), the name string, and the immediate operand
	// class (simm4s2 / simm4s3 / simm4s4, again matching the register count).
	defm LD2B_IMM : sve_mem_eld_si<0b00, 0b01, ZZ_b, "ld2b", simm4s2>;
	defm LD3B_IMM : sve_mem_eld_si<0b00, 0b10, ZZZ_b, "ld3b", simm4s3>;
	defm LD4B_IMM : sve_mem_eld_si<0b00, 0b11, ZZZZ_b, "ld4b", simm4s4>;
	defm LD2H_IMM : sve_mem_eld_si<0b01, 0b01, ZZ_h, "ld2h", simm4s2>;
	defm LD3H_IMM : sve_mem_eld_si<0b01, 0b10, ZZZ_h, "ld3h", simm4s3>;
	defm LD4H_IMM : sve_mem_eld_si<0b01, 0b11, ZZZZ_h, "ld4h", simm4s4>;
	defm LD2W_IMM : sve_mem_eld_si<0b10, 0b01, ZZ_s, "ld2w", simm4s2>;
	defm LD3W_IMM : sve_mem_eld_si<0b10, 0b10, ZZZ_s, "ld3w", simm4s3>;
	defm LD4W_IMM : sve_mem_eld_si<0b10, 0b11, ZZZZ_s, "ld4w", simm4s4>;
	defm LD2D_IMM : sve_mem_eld_si<0b11, 0b01, ZZ_d, "ld2d", simm4s2>;
	defm LD3D_IMM : sve_mem_eld_si<0b11, 0b10, ZZZ_d, "ld3d", simm4s3>;
	defm LD4D_IMM : sve_mem_eld_si<0b11, 0b11, ZZZZ_d, "ld4d", simm4s4>;

	// LD(2|3|4) structured loads (register + register)
	// Same literal pairs and register-list operands as the reg+immediate
	// definitions in this file, but each entry is a single record (`def`, not
	// `defm`) and the last argument is the offset register operand
	// GPR64NoXZRshifted{8,16,32,64}, matching the element size.
	def LD2B : sve_mem_eld_ss<0b00, 0b01, ZZ_b, "ld2b", GPR64NoXZRshifted8>;
	def LD3B : sve_mem_eld_ss<0b00, 0b10, ZZZ_b, "ld3b", GPR64NoXZRshifted8>;
	def LD4B : sve_mem_eld_ss<0b00, 0b11, ZZZZ_b, "ld4b", GPR64NoXZRshifted8>;
	def LD2H : sve_mem_eld_ss<0b01, 0b01, ZZ_h, "ld2h", GPR64NoXZRshifted16>;
	def LD3H : sve_mem_eld_ss<0b01, 0b10, ZZZ_h, "ld3h", GPR64NoXZRshifted16>;
	def LD4H : sve_mem_eld_ss<0b01, 0b11, ZZZZ_h, "ld4h", GPR64NoXZRshifted16>;
	def LD2W : sve_mem_eld_ss<0b10, 0b01, ZZ_s, "ld2w", GPR64NoXZRshifted32>;
	def LD3W : sve_mem_eld_ss<0b10, 0b10, ZZZ_s, "ld3w", GPR64NoXZRshifted32>;
	def LD4W : sve_mem_eld_ss<0b10, 0b11, ZZZZ_s, "ld4w", GPR64NoXZRshifted32>;
	def LD2D : sve_mem_eld_ss<0b11, 0b01, ZZ_d, "ld2d", GPR64NoXZRshifted64>;
	def LD3D : sve_mem_eld_ss<0b11, 0b10, ZZZ_d, "ld3d", GPR64NoXZRshifted64>;
	def LD4D : sve_mem_eld_ss<0b11, 0b11, ZZZZ_d, "ld4d", GPR64NoXZRshifted64>;

	// Gathers using unscaled 32-bit offsets, e.g.
	// ld1h z0.s, p0/z, [x0, z0.s, uxtw]
	// Arguments per line: 4-bit literal, name string, the sxtw and uxtw
	// selection nodes, the sxtw and uxtw offset-vector operands, and the memory
	// value type (nxv4i8 / nxv4i16 / nxv4i32).
	// Sign-extending loads use the AArch64ld1s_* / AArch64ldff1s_* nodes and
	// first-faulting loads the AArch64ldff1* nodes.
	// Byte loads use the ZPR32Ext{S,U}XTW8Only operands; halfword and word
	// loads use ZPR32Ext{S,U}XTW8.
	// NOTE(review): the GLD1SB_S, GLD1SH_S, GLD1W etc. prefixes are used again
	// by other gather groups in this file; this relies on each multiclass
	// emitting distinct record suffixes - confirm.
	defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
	defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
	defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
	defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
	defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
	defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
	defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
	defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
	defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
	defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;

	// Gathers using scaled 32-bit offsets, e.g.
	// ld1h z0.s, p0/z, [x0, z0.s, uxtw #1]
	// Same argument layout as the unscaled 32-bit-offset gathers in this file,
	// but using the *_scaled_z selection nodes and offset operands whose
	// number follows the memory element size: ZPR32Ext{S,U}XTW16 for halfword
	// loads and ZPR32Ext{S,U}XTW32 for word loads. There are no byte entries
	// in this group.
	defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
	defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
	defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
	defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
	defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
	defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;

	// Gathers using 32-bit pointers with scaled offset, e.g.
	// ld1h z0.s, p0/z, [z0.s, #16]
	// Arguments per line: 4-bit literal, name string, immediate operand class,
	// selection node (*_gather_imm_z) and memory value type.
	// The immediate class follows the memory element size: imm0_31 for byte
	// loads, uimm5s2 for halfword loads and uimm5s4 for word loads.
	defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv4i8>;
	defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv4i8>;
	defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv4i8>;
	defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv4i8>;
	defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv4i16>;
	defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv4i16>;
	defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv4i16>;
	defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv4i16>;
	defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv4i32>;
	defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv4i32>;

	// Gathers using 64-bit pointers with scaled offset, e.g.
	// ld1h z0.d, p0/z, [z0.d, #16]
	// Same argument layout as the 32-bit-pointer gathers in this file, with
	// nxv2* memory value types. Compared with that group this one additionally
	// has sign-extending word loads (ld1sw / ldff1sw, uimm5s4) and doubleword
	// loads (ld1d / ldff1d, uimm5s8).
	// NOTE(review): the GLD1*_D / GLD1D prefixes are reused by the other
	// 64-bit gather groups in this file; this relies on each multiclass
	// emitting distinct record suffixes - confirm.
	defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv2i8>;
	defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv2i8>;
	defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv2i8>;
	defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv2i8>;
	defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv2i16>;
	defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv2i16>;
	defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv2i16>;
	defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv2i16>;
	defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm_z, nxv2i32>;
	defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, AArch64ldff1s_gather_imm_z, nxv2i32>;
	defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv2i32>;
	defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv2i32>;
	defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm_z, nxv2i64>;
	defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, AArch64ldff1_gather_imm_z, nxv2i64>;

	// Gathers using unscaled 64-bit offsets, e.g.
	// ld1h z0.d, p0/z, [x0, z0.d]
	// Arguments per line: 4-bit literal, name string, a single selection node
	// (AArch64ld1*_gather_z - there is no sxtw/uxtw pair here) and the memory
	// value type. No offset-vector operand class is passed on these lines.
	defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_z, nxv2i8>;
	defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_z, nxv2i8>;
	defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather_z, nxv2i8>;
	defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_z, nxv2i8>;
	defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_z, nxv2i16>;
	defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_z, nxv2i16>;
	defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather_z, nxv2i16>;
	defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_z, nxv2i16>;
	defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_z, nxv2i32>;
	defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_z, nxv2i32>;
	defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather_z, nxv2i32>;
	defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_z, nxv2i32>;
	defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather_z, nxv2i64>;
	defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_z, nxv2i64>;

	// Gathers using scaled 64-bit offsets, e.g.
	// ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
	// Arguments per line: 4-bit literal, name string, the *_gather_scaled_z
	// selection node, the offset-vector operand and the memory value type.
	// The offset operand follows the memory element size: ZPR64ExtLSL16 for
	// halfword, ZPR64ExtLSL32 for word and ZPR64ExtLSL64 for doubleword loads.
	// There are no byte entries in this group.
	defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
	defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
	defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
	defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
	defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
	defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
	defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
	defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
	defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>;
	defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>;

	// Gathers using unscaled 32-bit offsets unpacked in 64-bit elements, e.g.
	// ld1h z0.d, p0/z, [x0, z0.d, uxtw]
	// Every defm supplies a pair of selection nodes and a pair of offset operand
	// classes: the sxtw flavour first, then the uxtw flavour. Byte loads use the
	// ...8Only operand classes, all wider loads use ZPR64ExtSXTW8/ZPR64ExtUXTW8.
	// NOTE(review): the difference between the 8Only and 8 operand classes is
	// defined outside this section - confirm before relying on it.
	defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
	defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
	defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
	defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
	defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
	defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
	defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
	defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
	defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
	defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
	defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
	defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
	defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
	defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;

	// Gathers using scaled 32-bit offsets unpacked in 64-bit elements, e.g.
	// ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
	// Same sxtw/uxtw pairing as the unscaled group, but with the *_scaled_z
	// nodes and with offset operand classes (ZPR64Ext{S,U}XTW16/32/64) that
	// follow the memory element size; no byte-sized variant is defined here.
	defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
	defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
	defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
	defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
	defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
	defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
	defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
	defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
	defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
	defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;

	// Non-temporal contiguous loads (register + immediate)
	// One record per element size (b/h/w/d); the 2-bit field counts up with the
	// element size and the register list / register class (Z_x, ZPRnn) match it.
	defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>;
	defm LDNT1H_ZRI : sve_mem_cldnt_si<0b01, "ldnt1h", Z_h, ZPR16>;
	defm LDNT1W_ZRI : sve_mem_cldnt_si<0b10, "ldnt1w", Z_s, ZPR32>;
	defm LDNT1D_ZRI : sve_mem_cldnt_si<0b11, "ldnt1d", Z_d, ZPR64>;

	// Non-temporal contiguous loads (register + register)
	// Same layout as the register + immediate forms, plus an index register
	// operand class (GPR64NoXZRshifted8/16/32/64) chosen to match the element
	// size.
	defm LDNT1B_ZRR : sve_mem_cldnt_ss<0b00, "ldnt1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
	defm LDNT1H_ZRR : sve_mem_cldnt_ss<0b01, "ldnt1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
	defm LDNT1W_ZRR : sve_mem_cldnt_ss<0b10, "ldnt1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
	defm LDNT1D_ZRR : sve_mem_cldnt_ss<0b11, "ldnt1d", Z_d, ZPR64, GPR64NoXZRshifted64>;

	// contiguous store with immediates
	// The mnemonic gives the memory element size and the _H/_S/_D name suffix
	// gives the vector element size when it is wider than the memory element.
	// NOTE(review): the two 2-bit fields look like <memory size, vector size>;
	// only pairs with vector size >= memory size are instantiated - confirm
	// against the sve_mem_cst_si definition.
	defm ST1B_IMM : sve_mem_cst_si<0b00, 0b00, "st1b", Z_b, ZPR8>;
	defm ST1B_H_IMM : sve_mem_cst_si<0b00, 0b01, "st1b", Z_h, ZPR16>;
	defm ST1B_S_IMM : sve_mem_cst_si<0b00, 0b10, "st1b", Z_s, ZPR32>;
	defm ST1B_D_IMM : sve_mem_cst_si<0b00, 0b11, "st1b", Z_d, ZPR64>;
	defm ST1H_IMM : sve_mem_cst_si<0b01, 0b01, "st1h", Z_h, ZPR16>;
	defm ST1H_S_IMM : sve_mem_cst_si<0b01, 0b10, "st1h", Z_s, ZPR32>;
	defm ST1H_D_IMM : sve_mem_cst_si<0b01, 0b11, "st1h", Z_d, ZPR64>;
	defm ST1W_IMM : sve_mem_cst_si<0b10, 0b10, "st1w", Z_s, ZPR32>;
	defm ST1W_D_IMM : sve_mem_cst_si<0b10, 0b11, "st1w", Z_d, ZPR64>;
	defm ST1D_IMM : sve_mem_cst_si<0b11, 0b11, "st1d", Z_d, ZPR64>;

	// contiguous store with reg+reg addressing.
	// The index register operand class depends on the memory element size in
	// the mnemonic, not on the vector element size: every st1b form uses
	// GPR64NoXZRshifted8, every st1h form GPR64NoXZRshifted16, and so on.
	defm ST1B : sve_mem_cst_ss<0b0000, "st1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
	defm ST1B_H : sve_mem_cst_ss<0b0001, "st1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
	defm ST1B_S : sve_mem_cst_ss<0b0010, "st1b", Z_s, ZPR32, GPR64NoXZRshifted8>;
	defm ST1B_D : sve_mem_cst_ss<0b0011, "st1b", Z_d, ZPR64, GPR64NoXZRshifted8>;
	defm ST1H : sve_mem_cst_ss<0b0101, "st1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
	defm ST1H_S : sve_mem_cst_ss<0b0110, "st1h", Z_s, ZPR32, GPR64NoXZRshifted16>;
	defm ST1H_D : sve_mem_cst_ss<0b0111, "st1h", Z_d, ZPR64, GPR64NoXZRshifted16>;
	defm ST1W : sve_mem_cst_ss<0b1010, "st1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
	defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
	defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>;

	// Scatters using unpacked, unscaled 32-bit offsets, e.g.
	// st1h z0.d, p0, [x0, z0.d, uxtw]
	// All four records share the same sxtw/uxtw scatter nodes; they differ in
	// the 3-bit opcode, the offset operand classes and the memory value type.
	defm SST1B_D : sve_mem_64b_sst_sv_32_unscaled<0b000, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
	defm SST1H_D : sve_mem_64b_sst_sv_32_unscaled<0b010, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
	defm SST1W_D : sve_mem_64b_sst_sv_32_unscaled<0b100, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8,nxv2i32>;
	defm SST1D : sve_mem_64b_sst_sv_32_unscaled<0b110, "st1d", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;

	// Scatters using packed, unscaled 32-bit offsets, e.g.
	// st1h z0.s, p0, [x0, z0.s, uxtw]
	// Packed (.s) counterparts of the unpacked group: ZPR32 offset operand
	// classes and nxv4 memory value types, with odd 3-bit opcodes.
	defm SST1B_S : sve_mem_32b_sst_sv_32_unscaled<0b001, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
	defm SST1H_S : sve_mem_32b_sst_sv_32_unscaled<0b011, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
	defm SST1W : sve_mem_32b_sst_sv_32_unscaled<0b101, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;

	// Scatters using packed, scaled 32-bit offsets, e.g.
	// st1h z0.s, p0, [x0, z0.s, uxtw #1]
	// Uses the *_scaled scatter nodes; the offset operand classes
	// (ZPR32Ext{S,U}XTW16/32) follow the memory element size, so there is no
	// st1b record in this group.
	defm SST1H_S : sve_mem_32b_sst_sv_32_scaled<0b011, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
	defm SST1W : sve_mem_32b_sst_sv_32_scaled<0b101, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;

	// Scatters using unpacked, scaled 32-bit offsets, e.g.
	// st1h z0.d, p0, [x0, z0.d, uxtw #1]
	// Unpacked (.d) counterparts of the packed scaled group: ZPR64 offset
	// operand classes sized to the memory element, nxv2 memory value types.
	defm SST1H_D : sve_mem_64b_sst_sv_32_scaled<0b010, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
	defm SST1W_D : sve_mem_64b_sst_sv_32_scaled<0b100, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
	defm SST1D : sve_mem_64b_sst_sv_32_scaled<0b110, "st1d", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;

	// Scatters using 32/64-bit pointers with offset, e.g.
	// st1h z0.s, p0, [z0.s, #16]
	// 32-bit (.s) vector-base forms. The immediate operand type changes with the
	// memory element size: imm0_31 for bytes, then uimm5s2 and uimm5s4.
	defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", imm0_31, AArch64st1_scatter_imm, nxv4i8>;
	defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv4i16>;
	defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv4i32>;

	// Scatters using 32/64-bit pointers with offset, e.g.
	// st1h z0.d, p0, [z0.d, #16]
	// 64-bit (.d) vector-base forms; same immediate operand progression as the
	// .s forms, extended with uimm5s8 for st1d.
	defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", imm0_31, AArch64st1_scatter_imm, nxv2i8>;
	defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv2i16>;
	defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv2i32>;
	defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", uimm5s8, AArch64st1_scatter_imm, nxv2i64>;

	// Scatters using unscaled 64-bit offsets, e.g.
	// st1h z0.d, p0, [x0, z0.d]
	// A single AArch64st1_scatter node serves all four element sizes; the 2-bit
	// field counts up with the memory element size (b/h/w/d).
	defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b", AArch64st1_scatter, nxv2i8>;
	defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h", AArch64st1_scatter, nxv2i16>;
	defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w", AArch64st1_scatter, nxv2i32>;
	defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d", AArch64st1_scatter, nxv2i64>;

	// Scatters using scaled 64-bit offsets, e.g.
	// st1h z0.d, p0, [x0, z0.d, lsl #1]
	// Unlike the other scatter groups these records carry an explicit _SCALED
	// name suffix. The offset operand class (ZPR64ExtLSL16/32/64) follows the
	// memory element size, so no st1b record exists here.
	defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>;
	defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>;
	defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>;

	// ST(2|3|4) structured stores (register + immediate)
	// First field: element size (b/h/w/d). Second field: 0b01/0b10/0b11 for the
	// two-, three- and four-register forms, which also select the register
	// list class (ZZ_x / ZZZ_x / ZZZZ_x) and the immediate operand type
	// (simm4s2 / simm4s3 / simm4s4).
	defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>;
	defm ST3B_IMM : sve_mem_est_si<0b00, 0b10, ZZZ_b, "st3b", simm4s3>;
	defm ST4B_IMM : sve_mem_est_si<0b00, 0b11, ZZZZ_b, "st4b", simm4s4>;
	defm ST2H_IMM : sve_mem_est_si<0b01, 0b01, ZZ_h, "st2h", simm4s2>;
	defm ST3H_IMM : sve_mem_est_si<0b01, 0b10, ZZZ_h, "st3h", simm4s3>;
	defm ST4H_IMM : sve_mem_est_si<0b01, 0b11, ZZZZ_h, "st4h", simm4s4>;
	defm ST2W_IMM : sve_mem_est_si<0b10, 0b01, ZZ_s, "st2w", simm4s2>;
	defm ST3W_IMM : sve_mem_est_si<0b10, 0b10, ZZZ_s, "st3w", simm4s3>;
	defm ST4W_IMM : sve_mem_est_si<0b10, 0b11, ZZZZ_s, "st4w", simm4s4>;
	defm ST2D_IMM : sve_mem_est_si<0b11, 0b01, ZZ_d, "st2d", simm4s2>;
	defm ST3D_IMM : sve_mem_est_si<0b11, 0b10, ZZZ_d, "st3d", simm4s3>;
	defm ST4D_IMM : sve_mem_est_si<0b11, 0b11, ZZZZ_d, "st4d", simm4s4>;

	// ST(2|3|4) structured stores (register + register)
	// Plain `def` records (sve_mem_est_ss is used as a class here, not a
	// multiclass). Fields mirror the register + immediate forms, with the
	// immediate replaced by an index register operand class sized to the
	// element (GPR64NoXZRshifted8/16/32/64).
	def ST2B : sve_mem_est_ss<0b00, 0b01, ZZ_b, "st2b", GPR64NoXZRshifted8>;
	def ST3B : sve_mem_est_ss<0b00, 0b10, ZZZ_b, "st3b", GPR64NoXZRshifted8>;
	def ST4B : sve_mem_est_ss<0b00, 0b11, ZZZZ_b, "st4b", GPR64NoXZRshifted8>;
	def ST2H : sve_mem_est_ss<0b01, 0b01, ZZ_h, "st2h", GPR64NoXZRshifted16>;
	def ST3H : sve_mem_est_ss<0b01, 0b10, ZZZ_h, "st3h", GPR64NoXZRshifted16>;
	def ST4H : sve_mem_est_ss<0b01, 0b11, ZZZZ_h, "st4h", GPR64NoXZRshifted16>;
	def ST2W : sve_mem_est_ss<0b10, 0b01, ZZ_s, "st2w", GPR64NoXZRshifted32>;
	def ST3W : sve_mem_est_ss<0b10, 0b10, ZZZ_s, "st3w", GPR64NoXZRshifted32>;
	def ST4W : sve_mem_est_ss<0b10, 0b11, ZZZZ_s, "st4w", GPR64NoXZRshifted32>;
	def ST2D : sve_mem_est_ss<0b11, 0b01, ZZ_d, "st2d", GPR64NoXZRshifted64>;
	def ST3D : sve_mem_est_ss<0b11, 0b10, ZZZ_d, "st3d", GPR64NoXZRshifted64>;
	def ST4D : sve_mem_est_ss<0b11, 0b11, ZZZZ_d, "st4d", GPR64NoXZRshifted64>;

	// Non-temporal contiguous stores (register + immediate)
	// Store-side mirror of the LDNT1*_ZRI records: one record per element size,
	// 2-bit field counting up b/h/w/d.
	defm STNT1B_ZRI : sve_mem_cstnt_si<0b00, "stnt1b", Z_b, ZPR8>;
	defm STNT1H_ZRI : sve_mem_cstnt_si<0b01, "stnt1h", Z_h, ZPR16>;
	defm STNT1W_ZRI : sve_mem_cstnt_si<0b10, "stnt1w", Z_s, ZPR32>;
	defm STNT1D_ZRI : sve_mem_cstnt_si<0b11, "stnt1d", Z_d, ZPR64>;

	// Non-temporal contiguous stores (register + register)
	// Store-side mirror of the LDNT1*_ZRR records; the index register operand
	// class (GPR64NoXZRshifted8/16/32/64) matches the element size.
	defm STNT1B_ZRR : sve_mem_cstnt_ss<0b00, "stnt1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
	defm STNT1H_ZRR : sve_mem_cstnt_ss<0b01, "stnt1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
	defm STNT1W_ZRR : sve_mem_cstnt_ss<0b10, "stnt1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
	defm STNT1D_ZRR : sve_mem_cstnt_ss<0b11, "stnt1d", Z_d, ZPR64, GPR64NoXZRshifted64>;

	// Fill/Spill
	// "ldr" records come from the *_fill multiclasses and "str" records from
	// the *_spill multiclasses; the _Z / _P in the names and multiclasses
	// distinguish the Z-register and P-register forms.
	defm LDR_ZXI : sve_mem_z_fill<"ldr">;
	defm LDR_PXI : sve_mem_p_fill<"ldr">;
	defm STR_ZXI : sve_mem_z_spill<"str">;
	defm STR_PXI : sve_mem_p_spill<"str">;

	// Contiguous prefetch (register + immediate)
	// One record per prefetch element size (prfb/prfh/prfw/prfd). These are the
	// RegImmInst arguments used by the sve_prefetch patterns below.
	defm PRFB_PRI : sve_mem_prfm_si<0b00, "prfb">;
	defm PRFH_PRI : sve_mem_prfm_si<0b01, "prfh">;
	defm PRFW_PRI : sve_mem_prfm_si<0b10, "prfw">;
	defm PRFD_PRI : sve_mem_prfm_si<0b11, "prfd">;

	// Contiguous prefetch (register + register)
	// Plain `def` records; the index register operand class is sized to the
	// prefetch element. Note that the "prfw" record is named PRFS_PRR (not
	// PRFW_PRR); the sve_prefetch instantiation for nxv4i1 refers to it by
	// that name.
	def PRFB_PRR : sve_mem_prfm_ss<0b001, "prfb", GPR64NoXZRshifted8>;
	def PRFH_PRR : sve_mem_prfm_ss<0b011, "prfh", GPR64NoXZRshifted16>;
	def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>;
	def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>;

	// Selection patterns for a contiguous prefetch operator.
	//   prefetch   - pattern operator taking (predicate, address, i32 prfop).
	//   PredTy     - predicate value type governing the prefetch.
	//   RegImmInst - instruction emitted for base + immediate (and for a bare
	//                base, with immediate 0).
	//   RegRegInst - instruction emitted for base + index register.
	//   scale      - not referenced by any pattern in this multiclass body.
	//   AddrCP     - complex pattern that matches the base + index address.
	// Three patterns are produced; AddedComplexity ranks them so that
	// reg + imm (2) is tried before reg + reg (1), with the plain-base form
	// (no AddedComplexity) as the fallback.
	multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction RegRegInst, int scale, ComplexPattern AddrCP> {
	// reg + imm
	// The address is matched through am_sve_indexed_s6 with a simm6s1 offset.
	let AddedComplexity = 2 in {
	def _reg_imm : Pat<(prefetch (PredTy PPR_3b:$gp), (am_sve_indexed_s6 GPR64sp:$base, simm6s1:$offset), (i32 sve_prfop:$prfop)),
	(RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, simm6s1:$offset)>;
	}

	// reg + reg
	// The address is matched through the caller-supplied AddrCP.
	let AddedComplexity = 1 in {
	def _reg_reg : Pat<(prefetch (PredTy PPR_3b:$gp), (AddrCP GPR64sp:$base, GPR64:$index), (i32 sve_prfop:$prfop)),
	(RegRegInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, GPR64:$index)>;
	}

	// default fallback
	// Any other address is used as the base with a zero immediate offset.
	def _default : Pat<(prefetch (PredTy PPR_3b:$gp), GPR64:$base, (i32 sve_prfop:$prfop)),
	(RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, (i64 0))>;
	}

	// Instantiate the prefetch patterns for int_aarch64_sve_prf, one per
	// predicate type: nxv16i1 -> prfb, nxv8i1 -> prfh, nxv4i1 -> prfw,
	// nxv2i1 -> prfd. The scale argument (0..3) lines up with the
	// am_sve_regreg_lslN complex pattern passed alongside it.
	defm : sve_prefetch<int_aarch64_sve_prf, nxv16i1, PRFB_PRI, PRFB_PRR, 0, am_sve_regreg_lsl0>;
	defm : sve_prefetch<int_aarch64_sve_prf, nxv8i1, PRFH_PRI, PRFH_PRR, 1, am_sve_regreg_lsl1>;
	defm : sve_prefetch<int_aarch64_sve_prf, nxv4i1, PRFW_PRI, PRFS_PRR, 2, am_sve_regreg_lsl2>;
	defm : sve_prefetch<int_aarch64_sve_prf, nxv2i1, PRFD_PRI, PRFD_PRR, 3, am_sve_regreg_lsl3>;

	// Gather prefetch using scaled 32-bit offsets, e.g.
	// prfh pldl1keep, p0, [x0, z0.s, uxtw #1]
	// Each record passes the sxtw operand class and intrinsic first, then the
	// uxtw ones. prfb uses the ...8Only operand classes; the others use
	// classes sized to the prefetch element (16/32/64).
	defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>;
	defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>;
	defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>;
	defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>;

	// Gather prefetch using unpacked, scaled 32-bit offsets, e.g.
	// prfh pldl1keep, p0, [x0, z0.d, uxtw #1]
	// Unpacked (.d) counterparts of the PRF*_S records: ZPR64 operand classes,
	// same *_gather_sxtw_index / *_gather_uxtw_index intrinsics.
	defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>;
	defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>;
	defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>;
	defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>;

	// Gather prefetch using scaled 64-bit offsets, e.g.
	// prfh pldl1keep, p0, [x0, z0.d, lsl #1]
	// A single operand class (ZPR64ExtLSL8/16/32/64) and a single
	// *_gather_index intrinsic per record, since no extension is involved.
	defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8, int_aarch64_sve_prfb_gather_index>;
	defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_prfh_gather_index>;
	defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_prfw_gather_index>;
	defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_prfd_gather_index>;

	// Gather prefetch using 32/64-bit pointers with offset, e.g.
	// prfh pldl1keep, p0, [z0.s, #16]
	// prfh pldl1keep, p0, [z0.d, #16]
	// The _S_PZI records are the .s vector-base forms and the _D_PZI records
	// the .d forms. Both share the same *_gather_scalar_offset intrinsics and
	// the same immediate operand progression (imm0_31, uimm5s2, uimm5s4,
	// uimm5s8) by prefetch element size.
	defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>;
	defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>;
	defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>;
	defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>;

	defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>;
	defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>;
	defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>;
	defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>;

	// Vector "adr" instructions: sxtw and uxtw forms for .d, plus lsl forms
	// for .s and .d. The ADR_LSL_* multiclasses produce the _0.._3 records
	// referenced by the intrinsic patterns that follow.
	defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">;
	defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">;
	defm ADR_LSL_ZZZ_S : sve_int_bin_cons_misc_0_a_32_lsl<0b10, "adr">;
	defm ADR_LSL_ZZZ_D : sve_int_bin_cons_misc_0_a_64_lsl<0b11, "adr">;

	// Map the adrb/adrh/adrw/adrd intrinsics onto the lsl forms of ADR.
	// b/h/w/d select the _0/_1/_2/_3 record respectively.
	// NOTE(review): the numeric suffix is presumably the left-shift amount
	// applied to the second operand - confirm against the ADR_LSL multiclass.
	// 32-bit element (.s) variants:
	def : Pat<(nxv4i32 (int_aarch64_sve_adrb nxv4i32:$Op1, nxv4i32:$Op2)),
	(ADR_LSL_ZZZ_S_0 $Op1, $Op2)>;
	def : Pat<(nxv4i32 (int_aarch64_sve_adrh nxv4i32:$Op1, nxv4i32:$Op2)),
	(ADR_LSL_ZZZ_S_1 $Op1, $Op2)>;
	def : Pat<(nxv4i32 (int_aarch64_sve_adrw nxv4i32:$Op1, nxv4i32:$Op2)),
	(ADR_LSL_ZZZ_S_2 $Op1, $Op2)>;
	def : Pat<(nxv4i32 (int_aarch64_sve_adrd nxv4i32:$Op1, nxv4i32:$Op2)),
	(ADR_LSL_ZZZ_S_3 $Op1, $Op2)>;

	// 64-bit element (.d) variants:
	def : Pat<(nxv2i64 (int_aarch64_sve_adrb nxv2i64:$Op1, nxv2i64:$Op2)),
	(ADR_LSL_ZZZ_D_0 $Op1, $Op2)>;
	def : Pat<(nxv2i64 (int_aarch64_sve_adrh nxv2i64:$Op1, nxv2i64:$Op2)),
	(ADR_LSL_ZZZ_D_1 $Op1, $Op2)>;
	def : Pat<(nxv2i64 (int_aarch64_sve_adrw nxv2i64:$Op1, nxv2i64:$Op2)),
	(ADR_LSL_ZZZ_D_2 $Op1, $Op2)>;
	def : Pat<(nxv2i64 (int_aarch64_sve_adrd nxv2i64:$Op1, nxv2i64:$Op2)),
	(ADR_LSL_ZZZ_D_3 $Op1, $Op2)>;

	// Table lookup ("tbl"), selected from AArch64tbl.
	defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;

	// bfloat16 data reuses the halfword instruction; the second operand stays
	// an integer vector (nxv8i16). Only available with both SVE and BF16.
	let Predicates = [HasSVE, HasBF16] in {
	def : SVE_2_Op_Pat<nxv8bf16, AArch64tbl, nxv8bf16, nxv8i16, TBL_ZZZ_H>;
	}

	// Two-source vector permutes (zip1/zip2, uzp1/uzp2, trn1/trn2), each
	// selected from the AArch64 node of the same name. The 3-bit opcode counts
	// up in the order listed.
	defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;
	defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2", AArch64zip2>;
	defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1", AArch64uzp1>;
	defm UZP2_ZZZ : sve_int_perm_bin_perm_zz<0b011, "uzp2", AArch64uzp2>;
	defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1", AArch64trn1>;
	defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2", AArch64trn2>;

	// bfloat16 vectors are permuted with the halfword (_H) instructions; the
	// patterns are only enabled when both SVE and BF16 are available.
	let Predicates = [HasSVE, HasBF16] in {
	def : SVE_2_Op_Pat<nxv8bf16, AArch64zip1, nxv8bf16, nxv8bf16, ZIP1_ZZZ_H>;
	def : SVE_2_Op_Pat<nxv8bf16, AArch64zip2, nxv8bf16, nxv8bf16, ZIP2_ZZZ_H>;
	def : SVE_2_Op_Pat<nxv8bf16, AArch64uzp1, nxv8bf16, nxv8bf16, UZP1_ZZZ_H>;
	def : SVE_2_Op_Pat<nxv8bf16, AArch64uzp2, nxv8bf16, nxv8bf16, UZP2_ZZZ_H>;
	def : SVE_2_Op_Pat<nxv8bf16, AArch64trn1, nxv8bf16, nxv8bf16, TRN1_ZZZ_H>;
	def : SVE_2_Op_Pat<nxv8bf16, AArch64trn2, nxv8bf16, nxv8bf16, TRN2_ZZZ_H>;
	}

	// Predicate-register versions of the same six permutes, using the same
	// opcodes, mnemonics and selection nodes as the vector forms.
	defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1", AArch64zip1>;
	defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2", AArch64zip2>;
	defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1", AArch64uzp1>;
	defm UZP2_PPP : sve_int_perm_bin_perm_pp<0b011, "uzp2", AArch64uzp2>;
	defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1", AArch64trn1>;
	defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>;

	// Extract lo/hi halves of legal predicate types.
	// Index 0 (low half) selects ZIP1 and the half-way index (high half)
	// selects ZIP2, in both cases with PFALSE as the second source. The
	// instruction suffix (_S/_H/_B) is that of the wider source predicate.
	// NOTE(review): presumably interleaving with an all-false predicate is what
	// re-spaces the chosen half to the wider element layout - confirm.
	def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))),
	(ZIP1_PPP_S PPR:$Ps, (PFALSE))>;
	def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))),
	(ZIP2_PPP_S PPR:$Ps, (PFALSE))>;
	def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))),
	(ZIP1_PPP_H PPR:$Ps, (PFALSE))>;
	def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))),
	(ZIP2_PPP_H PPR:$Ps, (PFALSE))>;
	def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
	(ZIP1_PPP_B PPR:$Ps, (PFALSE))>;
	def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
	(ZIP2_PPP_B PPR:$Ps, (PFALSE))>;

	// Concatenate two predicates.
	// concat_vectors of two equal-typed predicates selects UZP1; the
	// instruction suffix (_S/_H/_B) is that of the wider result predicate.
	def : Pat<(nxv4i1 (concat_vectors nxv2i1:$p1, nxv2i1:$p2)),
	(UZP1_PPP_S $p1, $p2)>;
	def : Pat<(nxv8i1 (concat_vectors nxv4i1:$p1, nxv4i1:$p2)),
	(UZP1_PPP_H $p1, $p2)>;
	def : Pat<(nxv16i1 (concat_vectors nxv8i1:$p1, nxv8i1:$p2)),
	(UZP1_PPP_B $p1, $p2)>;

	// Predicated integer vector-vs-vector compares. Each record passes a
	// condition code followed by the same comparison with its operands swapped
	// (UGE/ULE, UGT/ULT, GE/LE, GT/LT); EQ and NE are symmetric, so both codes
	// are identical for them.
	defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
	defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
	defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>;
	defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt", SETGT, SETLT>;
	defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq", SETEQ, SETEQ>;
	defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne", SETNE, SETNE>;

	// "Wide" integer compares, each selected directly from its own
	// int_aarch64_sve_cmp*_wide intrinsic. cmpeq/cmpne come from
	// sve_int_cmp_0_wide; the eight ordered compares come from
	// sve_int_cmp_1_wide with the 3-bit opcode counting up in the order listed.
	defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq", int_aarch64_sve_cmpeq_wide>;
	defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne", int_aarch64_sve_cmpne_wide>;
	defm CMPGE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b000, "cmpge", int_aarch64_sve_cmpge_wide>;
	defm CMPGT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b001, "cmpgt", int_aarch64_sve_cmpgt_wide>;
	defm CMPLT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b010, "cmplt", int_aarch64_sve_cmplt_wide>;
	defm CMPLE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b011, "cmple", int_aarch64_sve_cmple_wide>;
	defm CMPHS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b100, "cmphs", int_aarch64_sve_cmphs_wide>;
	defm CMPHI_WIDE_PPzZZ : sve_int_cmp_1_wide<0b101, "cmphi", int_aarch64_sve_cmphi_wide>;
	defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo", int_aarch64_sve_cmplo_wide>;
	defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls", int_aarch64_sve_cmpls_wide>;

	// Predicated integer compares against an immediate: signed forms use
	// sve_int_scmp_vi, unsigned forms use sve_int_ucmp_vi. In every row the
	// second condition code is the first one with its operands swapped
	// (GE<->LE, GT<->LT, EQ<->EQ, ULT<->UGT, ...). Not-equal is symmetric, so
	// cmpne must pass SETNE for both, matching CMPNE_PPzZZ; passing SETEQ as
	// the swapped code would let an equality test with the immediate on the
	// left-hand side select the not-equal instruction.
	defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge", SETGE, SETLE>;
	defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt", SETGT, SETLT>;
	defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt", SETLT, SETGT>;
	defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple", SETLE, SETGE>;
	defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq", SETEQ, SETEQ>;
	defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne", SETNE, SETNE>;
	defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs", SETUGE, SETULE>;
	defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi", SETUGT, SETULT>;
	defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>;
	defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>;

	// Predicated floating-point vector-vs-vector compares. The fcm* records
	// take both an intrinsic and a setcc-style fragment (setoge, setogt, ...);
	// facge/facgt use sve_fp_3op_p_pd and are selected from their intrinsic
	// only. Opcode 0b110 is not used in this group.
	defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge>;
	defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt>;
	defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq>;
	defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone>;
	defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, setuo>;
	defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>;
	defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>;

	// Floating-point compares with the _PPzZ0 name suffix (NOTE(review):
	// presumably the compare-against-zero forms - confirm). No intrinsic or
	// pattern operator is passed here, only opcode and mnemonic. Opcode 0b101
	// is not used in this group.
	defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge">;
	defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt">;
	defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt">;
	defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle">;
	defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">;
	defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">;

	// whilelt/whilele/whilelo/whilels. The _PWW records use sve_int_while4_rr
	// and the _PXX records use sve_int_while8_rr (NOTE(review): presumably
	// the 32-bit and 64-bit scalar operand forms - confirm); both sets share
	// the same opcodes and intrinsics.
	defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt", int_aarch64_sve_whilelt>;
	defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele", int_aarch64_sve_whilele>;
	defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo", int_aarch64_sve_whilelo>;
	defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels", int_aarch64_sve_whilels>;

	defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt", int_aarch64_sve_whilelt>;
	defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele", int_aarch64_sve_whilele>;
	defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo", int_aarch64_sve_whilelo>;
	defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels", int_aarch64_sve_whilels>;

	// ctermeq/ctermne on GPR32 (_WW) and GPR64 (_XX) operands. The first bit
	// selects the register width and the second selects eq (0) vs ne (1).
	def CTERMEQ_WW : sve_int_cterm<0b0, 0b0, "ctermeq", GPR32>;
	def CTERMNE_WW : sve_int_cterm<0b0, 0b1, "ctermne", GPR32>;
	def CTERMEQ_XX : sve_int_cterm<0b1, 0b0, "ctermeq", GPR64>;
	def CTERMNE_XX : sve_int_cterm<0b1, 0b1, "ctermne", GPR64>;

	// Scalar rdvl, addvl and addpl. addvl and addpl share sve_int_arith_vl and
	// differ only in the 1-bit opcode.
	def RDVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdvl">;
	def ADDVL_XXI : sve_int_arith_vl<0b0, "addvl">;
	def ADDPL_XXI : sve_int_arith_vl<0b1, "addpl">;

	// Element-count instructions. cntb/cnth/cntw/cntd share sve_int_count
	// (even 3-bit opcodes), each selected from its own intrinsic; cntp comes
	// from sve_int_pcount_pred and int_aarch64_sve_cntp.
	defm CNTB_XPiI : sve_int_count<0b000, "cntb", int_aarch64_sve_cntb>;
	defm CNTH_XPiI : sve_int_count<0b010, "cnth", int_aarch64_sve_cnth>;
	defm CNTW_XPiI : sve_int_count<0b100, "cntw", int_aarch64_sve_cntw>;
	defm CNTD_XPiI : sve_int_count<0b110, "cntd", int_aarch64_sve_cntd>;
	defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>;

	// Scalar inc/dec by element count for b/h/w/d. The low opcode bit selects
	// inc (0) or dec (1); the upper two bits select the element size. No
	// intrinsic is attached to these records.
	defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb">;
	defm DECB_XPiI : sve_int_pred_pattern_a<0b001, "decb">;
	defm INCH_XPiI : sve_int_pred_pattern_a<0b010, "inch">;
	defm DECH_XPiI : sve_int_pred_pattern_a<0b011, "dech">;
	defm INCW_XPiI : sve_int_pred_pattern_a<0b100, "incw">;
	defm DECW_XPiI : sve_int_pred_pattern_a<0b101, "decw">;
	defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd">;
	defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd">;

	// Saturating scalar inc/dec (sq*/uq*), one group of eight records per
	// element size (b, h, w, d). Within a group the 5-bit opcode counts up
	// through: sqinc/uqinc/sqdec/uqdec on the 32-bit forms (_n32 intrinsics;
	// signed via ..._b_s32 as _XPiWdI, unsigned via ..._b_u32 as _WPiI), then
	// the same four on the 64-bit forms (_n64 intrinsics via ..._b_x64 as
	// _XPiI). The top two opcode bits select the element size.
	defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb", int_aarch64_sve_sqincb_n32>;
	defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb", int_aarch64_sve_uqincb_n32>;
	defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb", int_aarch64_sve_sqdecb_n32>;
	defm UQDECB_WPiI : sve_int_pred_pattern_b_u32<0b00011, "uqdecb", int_aarch64_sve_uqdecb_n32>;
	defm SQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00100, "sqincb", int_aarch64_sve_sqincb_n64>;
	defm UQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00101, "uqincb", int_aarch64_sve_uqincb_n64>;
	defm SQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00110, "sqdecb", int_aarch64_sve_sqdecb_n64>;
	defm UQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00111, "uqdecb", int_aarch64_sve_uqdecb_n64>;

	defm SQINCH_XPiWdI : sve_int_pred_pattern_b_s32<0b01000, "sqinch", int_aarch64_sve_sqinch_n32>;
	defm UQINCH_WPiI : sve_int_pred_pattern_b_u32<0b01001, "uqinch", int_aarch64_sve_uqinch_n32>;
	defm SQDECH_XPiWdI : sve_int_pred_pattern_b_s32<0b01010, "sqdech", int_aarch64_sve_sqdech_n32>;
	defm UQDECH_WPiI : sve_int_pred_pattern_b_u32<0b01011, "uqdech", int_aarch64_sve_uqdech_n32>;
	defm SQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01100, "sqinch", int_aarch64_sve_sqinch_n64>;
	defm UQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01101, "uqinch", int_aarch64_sve_uqinch_n64>;
	defm SQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01110, "sqdech", int_aarch64_sve_sqdech_n64>;
	defm UQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01111, "uqdech", int_aarch64_sve_uqdech_n64>;

	defm SQINCW_XPiWdI : sve_int_pred_pattern_b_s32<0b10000, "sqincw", int_aarch64_sve_sqincw_n32>;
	defm UQINCW_WPiI : sve_int_pred_pattern_b_u32<0b10001, "uqincw", int_aarch64_sve_uqincw_n32>;
	defm SQDECW_XPiWdI : sve_int_pred_pattern_b_s32<0b10010, "sqdecw", int_aarch64_sve_sqdecw_n32>;
	defm UQDECW_WPiI : sve_int_pred_pattern_b_u32<0b10011, "uqdecw", int_aarch64_sve_uqdecw_n32>;
	defm SQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10100, "sqincw", int_aarch64_sve_sqincw_n64>;
	defm UQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10101, "uqincw", int_aarch64_sve_uqincw_n64>;
	defm SQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10110, "sqdecw", int_aarch64_sve_sqdecw_n64>;
	defm UQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10111, "uqdecw", int_aarch64_sve_uqdecw_n64>;

	defm SQINCD_XPiWdI : sve_int_pred_pattern_b_s32<0b11000, "sqincd", int_aarch64_sve_sqincd_n32>;
	defm UQINCD_WPiI : sve_int_pred_pattern_b_u32<0b11001, "uqincd", int_aarch64_sve_uqincd_n32>;
	defm SQDECD_XPiWdI : sve_int_pred_pattern_b_s32<0b11010, "sqdecd", int_aarch64_sve_sqdecd_n32>;
	defm UQDECD_WPiI : sve_int_pred_pattern_b_u32<0b11011, "uqdecd", int_aarch64_sve_uqdecd_n32>;
	defm SQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11100, "sqincd", int_aarch64_sve_sqincd_n64>;
	defm UQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11101, "uqincd", int_aarch64_sve_uqincd_n64>;
	defm SQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11110, "sqdecd", int_aarch64_sve_sqdecd_n64>;
	defm UQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11111, "uqdecd", int_aarch64_sve_uqdecd_n64>;

	// Vector-destination inc/dec by element count for h/w/d elements (there is
	// no byte group). The saturating forms pass an intrinsic and a vector
	// value type; the plain inc*/dec* forms pass only opcode, mnemonic and
	// register class.
	defm SQINCH_ZPiI : sve_int_countvlv<0b01000, "sqinch", ZPR16, int_aarch64_sve_sqinch, nxv8i16>;
	defm UQINCH_ZPiI : sve_int_countvlv<0b01001, "uqinch", ZPR16, int_aarch64_sve_uqinch, nxv8i16>;
	defm SQDECH_ZPiI : sve_int_countvlv<0b01010, "sqdech", ZPR16, int_aarch64_sve_sqdech, nxv8i16>;
	defm UQDECH_ZPiI : sve_int_countvlv<0b01011, "uqdech", ZPR16, int_aarch64_sve_uqdech, nxv8i16>;
	defm INCH_ZPiI : sve_int_countvlv<0b01100, "inch", ZPR16>;
	defm DECH_ZPiI : sve_int_countvlv<0b01101, "dech", ZPR16>;
	defm SQINCW_ZPiI : sve_int_countvlv<0b10000, "sqincw", ZPR32, int_aarch64_sve_sqincw, nxv4i32>;
	defm UQINCW_ZPiI : sve_int_countvlv<0b10001, "uqincw", ZPR32, int_aarch64_sve_uqincw, nxv4i32>;
	defm SQDECW_ZPiI : sve_int_countvlv<0b10010, "sqdecw", ZPR32, int_aarch64_sve_sqdecw, nxv4i32>;
	defm UQDECW_ZPiI : sve_int_countvlv<0b10011, "uqdecw", ZPR32, int_aarch64_sve_uqdecw, nxv4i32>;
	defm INCW_ZPiI : sve_int_countvlv<0b10100, "incw", ZPR32>;
	defm DECW_ZPiI : sve_int_countvlv<0b10101, "decw", ZPR32>;
	defm SQINCD_ZPiI : sve_int_countvlv<0b11000, "sqincd", ZPR64, int_aarch64_sve_sqincd, nxv2i64>;
	defm UQINCD_ZPiI : sve_int_countvlv<0b11001, "uqincd", ZPR64, int_aarch64_sve_uqincd, nxv2i64>;
	defm SQDECD_ZPiI : sve_int_countvlv<0b11010, "sqdecd", ZPR64, int_aarch64_sve_sqdecd, nxv2i64>;
	defm UQDECD_ZPiI : sve_int_countvlv<0b11011, "uqdecd", ZPR64, int_aarch64_sve_uqdecd, nxv2i64>;
	defm INCD_ZPiI : sve_int_countvlv<0b11100, "incd", ZPR64>;
	defm DECD_ZPiI : sve_int_countvlv<0b11101, "decd", ZPR64>;

	// Scalar *incp/*decp. The saturating forms come in 32-bit (_n32 intrinsic;
	// signed via ..._r_s32, unsigned via ..._r_u32) and 64-bit (_n64 intrinsic
	// via ..._r_x64) variants; plain incp/decp use ..._r_x64 with no
	// intrinsic attached.
	defm SQINCP_XPWd : sve_int_count_r_s32<0b00000, "sqincp", int_aarch64_sve_sqincp_n32>;
	defm SQINCP_XP : sve_int_count_r_x64<0b00010, "sqincp", int_aarch64_sve_sqincp_n64>;
	defm UQINCP_WP : sve_int_count_r_u32<0b00100, "uqincp", int_aarch64_sve_uqincp_n32>;
	defm UQINCP_XP : sve_int_count_r_x64<0b00110, "uqincp", int_aarch64_sve_uqincp_n64>;
	defm SQDECP_XPWd : sve_int_count_r_s32<0b01000, "sqdecp", int_aarch64_sve_sqdecp_n32>;
	defm SQDECP_XP : sve_int_count_r_x64<0b01010, "sqdecp", int_aarch64_sve_sqdecp_n64>;
	defm UQDECP_WP : sve_int_count_r_u32<0b01100, "uqdecp", int_aarch64_sve_uqdecp_n32>;
	defm UQDECP_XP : sve_int_count_r_x64<0b01110, "uqdecp", int_aarch64_sve_uqdecp_n64>;
	defm INCP_XP : sve_int_count_r_x64<0b10000, "incp">;
	defm DECP_XP : sve_int_count_r_x64<0b10100, "decp">;

	// Vector-destination (_ZP) *incp/*decp. The saturating forms are selected
	// from their intrinsics; plain incp/decp pass no intrinsic.
	defm SQINCP_ZP : sve_int_count_v<0b00000, "sqincp", int_aarch64_sve_sqincp>;
	defm UQINCP_ZP : sve_int_count_v<0b00100, "uqincp", int_aarch64_sve_uqincp>;
	defm SQDECP_ZP : sve_int_count_v<0b01000, "sqdecp", int_aarch64_sve_sqdecp>;
	defm UQDECP_ZP : sve_int_count_v<0b01100, "uqdecp", int_aarch64_sve_uqdecp>;
	defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
	defm DECP_ZP : sve_int_count_v<0b10100, "decp">;

	defm INDEX_RR : sve_int_index_rr<"index", index_vector>;
	defm INDEX_IR : sve_int_index_ir<"index", index_vector>;
	defm INDEX_RI : sve_int_index_ri<"index", index_vector>;
	defm INDEX_II : sve_int_index_ii<"index", index_vector>;

	// Unpredicated shifts
	defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_m1>;
	defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_m1>;
	defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_m1>;

	defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
	defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
	defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;

	// Predicated shifts
	defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr", "ASR_ZPZI">;
	defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr", "LSR_ZPZI">;
	defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">;
	defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>;

	// Zeroing pseudo variants of the predicated shifts. They are only defined
	// when both HasSVE and UseExperimentalZeroingPseudos hold. The record names
	// here (ASR_ZPZZ, LSR_ZPZZ, LSL_ZPZZ, ASRD_ZPZI) are the ones referenced by
	// string from the *_ZPmZ / ASRD_ZPmI instantiations in this file.
	let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
	defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64asr_m1>;
	defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64lsr_m1>;
	defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64lsl_m1>;
	defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>;
	}

	// Predicated shifts by vector. Each forward form (asr/lsr/lsl) names its
	// reversed counterpart (asrr/lsrr/lslr) as the last string argument, and
	// each reversed form names the forward one. The reversed forms pass
	// null_frag as the selection operator and set the trailing isReverseInstr
	// argument to 1.
	defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", AArch64asr_m1, "ASRR_ZPmZ">;
	defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", AArch64lsr_m1, "LSRR_ZPmZ">;
	defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", AArch64lsl_m1, "LSLR_ZPmZ">;
	defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", "ASRR_ZPZZ", null_frag, "ASR_ZPmZ", /*isReverseInstr*/ 1>;
	defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", /*isReverseInstr*/ 1>;
	defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", /*isReverseInstr*/ 1>;

	defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>;
	defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>;
	defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>;

	defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv4i1, nxv4f32, ElementSizeS>;
	defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv4i1, nxv8f16, ElementSizeS>;
	defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
	defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
	defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
	defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
	defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
	defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
	defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
	defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
	defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv2i1, nxv2f64, ElementSizeD>;
	defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv2i1, nxv8f16, ElementSizeD>;
	defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv2i1, nxv2f64, ElementSizeD>;
	defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv2i1, nxv4f32, ElementSizeD>;
	defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
	defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
	defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>;
	defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>;
	defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>;
	defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>;
	defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>;
	defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>;
	defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
	defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
	defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
	defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
	defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>;
	defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>;
	defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>;
	defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>;
	defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>;
	defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>;
	defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
	defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;

	defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", int_aarch64_sve_frintn>;
	defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", int_aarch64_sve_frintp>;
	defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm", int_aarch64_sve_frintm>;
	defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz", int_aarch64_sve_frintz>;
	defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta", int_aarch64_sve_frinta>;
	defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx", int_aarch64_sve_frintx>;
	defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", int_aarch64_sve_frinti>;
	defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", int_aarch64_sve_frecpx>;
	defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", int_aarch64_sve_fsqrt>;

	// BFloat16 SVE instructions; defined only when both HasBF16 and HasSVE hold.
	// NOTE(review): the four BFMMLA_{B,T}_ZZ{Z,I} records carry the mnemonics
	// "bfmlalb"/"bfmlalt" and bind the bfmlal[bt] intrinsics, so their record
	// names look like a misnomer. Renaming would affect any reference to these
	// records elsewhere -- confirm before changing.
	let Predicates = [HasBF16, HasSVE] in {
	defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>;
	defm BFDOT_ZZI : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>;
	defm BFMMLA_ZZZ : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>;
	defm BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>;
	defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>;
	defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>;
	defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>;
	defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>;
	defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>;
	}

	// InstAliases
	def : InstAlias<"mov $Zd, $Zn",
	(ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>;
	def : InstAlias<"mov $Pd, $Pg/m, $Pn",
	(SEL_PPPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pd), 1>;
	def : InstAlias<"mov $Pd, $Pn",
	(ORR_PPzPP PPR8:$Pd, PPR8:$Pn, PPR8:$Pn, PPR8:$Pn), 1>;
	def : InstAlias<"mov $Pd, $Pg/z, $Pn",
	(AND_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pn), 1>;

	def : InstAlias<"movs $Pd, $Pn",
	(ORRS_PPzPP PPR8:$Pd, PPR8:$Pn, PPR8:$Pn, PPR8:$Pn), 1>;
	def : InstAlias<"movs $Pd, $Pg/z, $Pn",
	(ANDS_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pn), 1>;

	def : InstAlias<"not $Pd, $Pg/z, $Pn",
	(EOR_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPRAny:$Pg), 1>;

	def : InstAlias<"nots $Pd, $Pg/z, $Pn",
	(EORS_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPRAny:$Pg), 1>;

	def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
	(CMPGE_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
	def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
	(CMPGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
	(CMPGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
	(CMPGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
	(CMPHI_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
	def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
	(CMPHI_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
	(CMPHI_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
	(CMPHI_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
	(CMPHS_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
	def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
	(CMPHS_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
	(CMPHS_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
	(CMPHS_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
	(CMPGT_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
	def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
	(CMPGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
	(CMPGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
	(CMPGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
	(FACGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
	(FACGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
	(FACGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
	(FACGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
	(FACGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
	(FACGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
	(FCMGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
	(FCMGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
	(FCMGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
	(FCMGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
	(FCMGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
	(FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	// Pseudo instructions representing unpredicated LDR and STR for ZPR2,3,4.
	// These get expanded to individual LDR_ZXI/STR_ZXI instructions in
	// AArch64ExpandPseudoInsts.
	let mayLoad = 1, hasSideEffects = 0 in {
	def LDR_ZZXI : Pseudo<(outs ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
	def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
	def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
	}
	let mayStore = 1, hasSideEffects = 0 in {
	def STR_ZZXI : Pseudo<(outs), (ins ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
	def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
	def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
	}

	// Select AArch64ptest to PTEST_PP for each of the four predicate value
	// types (nxv16i1, nxv8i1, nxv4i1, nxv2i1). Both operands must have the same
	// predicate type; the selected instruction is identical in every case.
	def : Pat<(AArch64ptest (nxv16i1 PPR:$pg), (nxv16i1 PPR:$src)),
	(PTEST_PP PPR:$pg, PPR:$src)>;
	def : Pat<(AArch64ptest (nxv8i1 PPR:$pg), (nxv8i1 PPR:$src)),
	(PTEST_PP PPR:$pg, PPR:$src)>;
	def : Pat<(AArch64ptest (nxv4i1 PPR:$pg), (nxv4i1 PPR:$src)),
	(PTEST_PP PPR:$pg, PPR:$src)>;
	def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)),
	(PTEST_PP PPR:$pg, PPR:$src)>;

	// LD1R of 128-bit masked data
	def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
	(LD1RQ_B_IMM $gp, $base, (i64 0))>;
	def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
	(LD1RQ_H_IMM $gp, $base, (i64 0))>;
	def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
	(LD1RQ_W_IMM $gp, $base, (i64 0))>;
	def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
	(LD1RQ_D_IMM $gp, $base, (i64 0))>;

	def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
	(LD1RQ_B_IMM $gp, $base, simm4s16:$imm)>;
	def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
	(LD1RQ_H_IMM $gp, $base, simm4s16:$imm)>;
	def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
	(LD1RQ_W_IMM $gp, $base, simm4s16:$imm)>;
	def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
	(LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>;

	// sext_inreg on scalable vectors selects the SXT{B,H,W}_ZPmZ instruction of
	// the container element size. The first operand is IMPLICIT_DEF and the
	// predicate operand is a PTRUE of the same element size with pattern
	// immediate 31 (presumably the "all elements" pattern -- confirm).
	def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
	def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
	def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8), (SXTB_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
	def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (SXTH_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
	def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i8), (SXTB_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
	def : Pat<(sext_inreg (nxv8i16 ZPR:$Zs), nxv8i8), (SXTB_ZPmZ_H (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>;

	// General case that we ideally never want to match.
	def : Pat<(vscale GPR64:$scale), (MADDXrrr (UBFMXri (RDVLI_XI 1), 4, 63), $scale, XZR)>;

	// Special cases of vscale multiplied by a constant. AddedComplexity = 5
	// raises these above the general MADDXrrr pattern defined just before this
	// block. Multipliers of 1 and -1 use RDVLI_XI followed by a bit-field move;
	// multipliers matched by the sve_rdvl/cnth/cntw/cntd immediate leaves map
	// to a single RDVLI_XI or CNT{H,W,D}_XPiI with pattern 31. The *_imm_neg
	// variants compute the count and then subtract it from XZR.
	let AddedComplexity = 5 in {
	def : Pat<(vscale (i64 1)), (UBFMXri (RDVLI_XI 1), 4, 63)>;
	def : Pat<(vscale (i64 -1)), (SBFMXri (RDVLI_XI -1), 4, 63)>;

	def : Pat<(vscale (sve_rdvl_imm i32:$imm)), (RDVLI_XI $imm)>;
	def : Pat<(vscale (sve_cnth_imm i32:$imm)), (CNTH_XPiI 31, $imm)>;
	def : Pat<(vscale (sve_cntw_imm i32:$imm)), (CNTW_XPiI 31, $imm)>;
	def : Pat<(vscale (sve_cntd_imm i32:$imm)), (CNTD_XPiI 31, $imm)>;

	def : Pat<(vscale (sve_cnth_imm_neg i32:$imm)), (SUBXrs XZR, (CNTH_XPiI 31, $imm), 0)>;
	def : Pat<(vscale (sve_cntw_imm_neg i32:$imm)), (SUBXrs XZR, (CNTW_XPiI 31, $imm), 0)>;
	def : Pat<(vscale (sve_cntd_imm_neg i32:$imm)), (SUBXrs XZR, (CNTD_XPiI 31, $imm), 0)>;
	}

	// FIXME: BigEndian requires an additional REV instruction to satisfy the
	// constraint that none of the bits change when stored to memory as one
	// type, and reloaded as another type.
	let Predicates = [IsLE] in {
	def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
	def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
	def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
	def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
	def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
	def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>;

	def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>;
	def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
	def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
	def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
	def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
	def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>;

	def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>;
	def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
	def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
	def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
	def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>;
	def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>;

	def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>;
	def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
	def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
	def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
	def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
	def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>;

	def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>;
	def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
	def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
	def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
	def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
	def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>;

	def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>;
	def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
	def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>;
	def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
	def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
	def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>;

	def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>;
	def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
	def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
	def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>;
	def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
	def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;

	}

	// Little-endian bitconvert patterns involving nxv8bf16: the conversion is a
	// no-op that only retypes the ZPR register.
	// NOTE(review): all three patterns below also appear in the following
	// [IsLE, HasSVE, HasBF16] block (same predicates, listed in a different
	// order), so this block looks redundant -- confirm before removing.
	let Predicates = [IsLE, HasBF16, HasSVE] in {
	def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
	def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
	def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
	}

	let Predicates = [IsLE, HasSVE, HasBF16] in {
	def : Pat<(nxv8bf16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
	def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
	def : Pat<(nxv8bf16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
	def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
	def : Pat<(nxv8bf16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
	def : Pat<(nxv8bf16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
	def : Pat<(nxv8bf16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;

	def : Pat<(nxv16i8 (bitconvert (nxv8bf16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
	def : Pat<(nxv8i16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
	def : Pat<(nxv4i32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
	def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
	def : Pat<(nxv8f16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
	def : Pat<(nxv4f32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
	def : Pat<(nxv2f64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
	}

	// reinterpret_cast between predicate value types selects a
	// COPY_TO_REGCLASS into PPR; no other instruction is emitted by these
	// patterns. Every ordered pair of distinct types among nxv16i1, nxv8i1,
	// nxv4i1 and nxv2i1 is listed, plus the nxv16i1 -> nxv16i1 identity case.
	def : Pat<(nxv16i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
	def : Pat<(nxv16i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
	def : Pat<(nxv16i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
	def : Pat<(nxv16i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
	def : Pat<(nxv8i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
	def : Pat<(nxv8i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
	def : Pat<(nxv8i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
	def : Pat<(nxv4i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
	def : Pat<(nxv4i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
	def : Pat<(nxv4i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
	def : Pat<(nxv2i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
	def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
	def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;

	// A plain `and` of two predicate values selects AND_PPzPP. Its first
	// operand is a PTRUE of the element size matching the predicate type
	// (B/H/S/D for nxv16i1/nxv8i1/nxv4i1/nxv2i1), built with pattern 31.
	def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)),
	(AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>;
	def : Pat<(nxv8i1 (and PPR:$Ps1, PPR:$Ps2)),
	(AND_PPzPP (PTRUE_H 31), PPR:$Ps1, PPR:$Ps2)>;
	def : Pat<(nxv4i1 (and PPR:$Ps1, PPR:$Ps2)),
	(AND_PPzPP (PTRUE_S 31), PPR:$Ps1, PPR:$Ps2)>;
	def : Pat<(nxv2i1 (and PPR:$Ps1, PPR:$Ps2)),
	(AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>;

	// Add more complex addressing modes here as required
	// Selection patterns for a predicated load producing Ty under a predicate
	// of type PredTy.
	//   Load        - the load operator to match.
	//   RegRegInst  - instruction used for the register + register address.
	//   RegImmInst  - instruction used for the register + immediate address
	//                 and for the plain-base fallback.
	//   AddrCP      - ComplexPattern that recognises the reg + reg address.
	// All three patterns require the third Load operand to match
	// SVEDup0Undef.
	multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load,
	Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
	// reg + reg
	let AddedComplexity = 1 in {
	def _reg_reg_z : Pat<(Ty (Load (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
	(RegRegInst PPR:$gp, GPR64:$base, GPR64:$offset)>;
	}
	// reg + imm
	// Carries the highest AddedComplexity of the three patterns.
	// NOTE(review): $base is matched as GPR64sp but emitted as GPR64 -- confirm
	// this mismatch is intended.
	let AddedComplexity = 2 in {
	def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
	(RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>;
	}
	// Fallback: bare base register, emitted with an immediate offset of 0.
	def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))),
	(RegImmInst PPR:$gp, GPR64:$base, (i64 0))>;
	}

	// 2-element contiguous loads
	defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>;
	defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>;
	defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
	defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>;
	defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
	defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>;
	defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
	defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
	defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
	defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;

	// 4-element contiguous loads
	defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>;
	defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>;
	defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
	defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>;
	defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
	defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
	defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;

	// 8-element contiguous loads
	defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
	defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
	defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
	defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;

	let Predicates = [HasBF16, HasSVE] in {
	defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
	}

	// 16-element contiguous loads
	defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B, LD1B_IMM, am_sve_regreg_lsl0>;

	// Selection patterns for a predicated store of a Ty value under a predicate
	// of type PredTy.
	//   Store       - the store operator to match.
	//   RegRegInst  - instruction used for the register + register address.
	//   RegImmInst  - instruction used for the register + immediate address
	//                 and for the plain-base fallback.
	//   AddrCP      - ComplexPattern that recognises the reg + reg address.
	multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store,
	Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
	// reg + reg
	let AddedComplexity = 1 in {
	def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)),
	(RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>;
	}
	// reg + imm
	// Carries the highest AddedComplexity of the three patterns.
	// NOTE(review): $base is matched as GPR64sp but emitted as GPR64 -- confirm
	// this mismatch is intended.
	let AddedComplexity = 2 in {
	def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)),
	(RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>;
	}
	// Fallback: bare base register, emitted with an immediate offset of 0.
	def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)),
	(RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
	}

	// 2-element contiguous stores
	defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D, ST1B_D_IMM, am_sve_regreg_lsl0>;
	defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
	defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
	defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
	defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
	defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
	defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;

	// 4-element contiguous stores
	defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S, ST1B_S_IMM, am_sve_regreg_lsl0>;
	defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
	defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
	defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
	defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;

	// 8-element contiguous stores
	defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>;
	defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
	defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;

	let Predicates = [HasBF16, HasSVE] in {
	defm : pred_store<nxv8bf16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
	}

	// 16-element contiguous stores
	defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B, ST1B_IMM, am_sve_regreg_lsl0>;

	// Non-temporal predicated loads: one pred_load instantiation per integer
	// element size, selecting the LDNT1{B,H,W,D} reg+reg (_ZRR) and reg+imm
	// (_ZRI) instructions.
	defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRR, LDNT1B_ZRI, am_sve_regreg_lsl0>;
	defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRR, LDNT1H_ZRI, am_sve_regreg_lsl1>;
	defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRR, LDNT1W_ZRI, am_sve_regreg_lsl2>;
	defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRR, LDNT1D_ZRI, am_sve_regreg_lsl3>;

	// Non-temporal predicated stores, mirroring the loads above with
	// STNT1{B,H,W,D}.
	defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRR, STNT1B_ZRI, am_sve_regreg_lsl0>;
	defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRR, STNT1H_ZRI, am_sve_regreg_lsl1>;
	defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRR, STNT1W_ZRI, am_sve_regreg_lsl2>;
	defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRR, STNT1D_ZRI, am_sve_regreg_lsl3>;

	// Selection patterns for an unpredicated store of a Ty value. Every pattern
	// emits RegImmInst with (PTrue 31) supplied as its predicate operand.
	//   Store       - the store PatFrag to match.
	//   Ty          - value type being stored.
	//   RegImmInst  - register + immediate store instruction to emit.
	//   PTrue       - PTRUE instruction used to build the predicate.
	multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegImmInst,
	Instruction PTrue> {
	// Base register plus immediate offset, matched by am_sve_indexed_s4.
	let AddedComplexity = 1 in {
	def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
	(RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
	}
	// Address matched by am_sve_fi; carries the highest AddedComplexity here.
	let AddedComplexity = 2 in {
	def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
	(RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
	}

	// Fallback: bare base register, emitted with an immediate offset of 0.
	def : Pat<(Store (Ty ZPR:$val), GPR64:$base),
	(RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
	}

	defm : unpred_store< store, nxv16i8, ST1B_IMM, PTRUE_B>;
	defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H_IMM, PTRUE_H>;
	defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S_IMM, PTRUE_S>;
	defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D_IMM, PTRUE_D>;
	defm : unpred_store< store, nxv8i16, ST1H_IMM, PTRUE_H>;
	defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S_IMM, PTRUE_S>;
	defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D_IMM, PTRUE_D>;
	defm : unpred_store< store, nxv4i32, ST1W_IMM, PTRUE_S>;
	defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D_IMM, PTRUE_D>;
	defm : unpred_store< store, nxv2i64, ST1D_IMM, PTRUE_D>;
	defm : unpred_store< store, nxv8f16, ST1H_IMM, PTRUE_H>;
	defm : unpred_store< store, nxv8bf16, ST1H_IMM, PTRUE_H>;
	defm : unpred_store< store, nxv4f16, ST1H_S_IMM, PTRUE_S>;
	defm : unpred_store< store, nxv2f16, ST1H_D_IMM, PTRUE_D>;
	defm : unpred_store< store, nxv4f32, ST1W_IMM, PTRUE_S>;
	- defm : unpred_store< store, nxv4f32, ST1W_D_IMM, PTRUE_D>;
	+ defm : unpred_store< store, nxv2f32, ST1W_D_IMM, PTRUE_D>;
	defm : unpred_store< store, nxv2f64, ST1D_IMM, PTRUE_D>;

	// Selection patterns for an unpredicated load producing Ty. Every pattern
	// emits RegImmInst with (PTrue 31) supplied as its predicate operand.
	//   Load        - the load PatFrag to match.
	//   Ty          - value type produced by the load.
	//   RegImmInst  - register + immediate load instruction to emit.
	//   PTrue       - PTRUE instruction used to build the predicate.
	multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegImmInst,
	Instruction PTrue> {
	// Base register plus immediate offset, matched by am_sve_indexed_s4.
	let AddedComplexity = 1 in {
	def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
	(RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
	}

	// Address matched by am_sve_fi; carries the highest AddedComplexity here.
	let AddedComplexity = 2 in {
	def _fi : Pat<(Ty (Load (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
	(RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
	}

	// Fallback: bare base register, emitted with an immediate offset of 0.
	def : Pat<(Ty (Load GPR64:$base)),
	(RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
	}

	// Unpredicated load patterns, one instantiation per (load fragment, result
	// type) pair. Zero-extending (zextloadvi*) and any-extending (extloadvi*)
	// loads select the same LD1B/LD1H/LD1W instructions; sign-extending
	// (sextloadvi*) loads select the LD1SB/LD1SH/LD1SW forms. The PTRUE_* argument
	// matches the element size of the result type.
	defm : unpred_load< load, nxv16i8, LD1B_IMM, PTRUE_B>;
	defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>;
	defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>;
	defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>;
	defm : unpred_load< extloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>;
	defm : unpred_load< extloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>;
	defm : unpred_load< extloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>;
	defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H_IMM, PTRUE_H>;
	defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S_IMM, PTRUE_S>;
	defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D_IMM, PTRUE_D>;
	defm : unpred_load< load, nxv8i16, LD1H_IMM, PTRUE_H>;
	defm : unpred_load<zextloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>;
	defm : unpred_load<zextloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>;
	defm : unpred_load< extloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>;
	defm : unpred_load< extloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>;
	defm : unpred_load<sextloadvi16, nxv4i32, LD1SH_S_IMM, PTRUE_S>;
	defm : unpred_load<sextloadvi16, nxv2i64, LD1SH_D_IMM, PTRUE_D>;
	defm : unpred_load< load, nxv4i32, LD1W_IMM, PTRUE_S>;
	defm : unpred_load<zextloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>;
	defm : unpred_load< extloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>;
	defm : unpred_load<sextloadvi32, nxv2i64, LD1SW_D_IMM, PTRUE_D>;
	defm : unpred_load< load, nxv2i64, LD1D_IMM, PTRUE_D>;
	defm : unpred_load< load, nxv8f16, LD1H_IMM, PTRUE_H>;
	defm : unpred_load< load, nxv8bf16, LD1H_IMM, PTRUE_H>;
	defm : unpred_load< load, nxv4f16, LD1H_S_IMM, PTRUE_S>;
	defm : unpred_load< load, nxv2f16, LD1H_D_IMM, PTRUE_D>;
	defm : unpred_load< load, nxv4f32, LD1W_IMM, PTRUE_S>;
	defm : unpred_load< load, nxv2f32, LD1W_D_IMM, PTRUE_D>;
	defm : unpred_load< load, nxv2f64, LD1D_IMM, PTRUE_D>;

	// Selection patterns for storing a predicate register value of type Ty.
	//   Ty    - predicate vector type being stored.
	//   Store - instruction to emit; takes (predicate, base, offset).
	multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
	// Address recognised by am_sve_fi, carrying a simm9 offset.
	def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),
	(Store PPR:$val, GPR64sp:$base, simm9:$offset)>;

	// Fallback: any GPR64 base address, emitted with an immediate offset of 0.
	def _default : Pat<(store (Ty PPR:$Val), GPR64:$base),
	(Store PPR:$Val, GPR64:$base, (i64 0))>;
	}

	// Every predicate type (nxv16i1 .. nxv2i1) is stored with the same STR_PXI
	// instruction; only the matched value type differs.
	defm Pat_Store_P16 : unpred_store_predicate<nxv16i1, STR_PXI>;
	defm Pat_Store_P8 : unpred_store_predicate<nxv8i1, STR_PXI>;
	defm Pat_Store_P4 : unpred_store_predicate<nxv4i1, STR_PXI>;
	defm Pat_Store_P2 : unpred_store_predicate<nxv2i1, STR_PXI>;

	// Selection patterns for loading a predicate register value of type Ty.
	//   Ty   - predicate vector type being loaded.
	//   Load - instruction to emit; takes (base, offset).
	multiclass unpred_load_predicate<ValueType Ty, Instruction Load> {
	// Address recognised by am_sve_fi, carrying a simm9 offset.
	def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))),
	(Load GPR64sp:$base, simm9:$offset)>;

	// Fallback: any GPR64 base address, emitted with an immediate offset of 0.
	def _default : Pat<(Ty (load GPR64:$base)),
	(Load GPR64:$base, (i64 0))>;
	}

	// Every predicate type (nxv16i1 .. nxv2i1) is loaded with the same LDR_PXI
	// instruction; only the result value type differs.
	defm Pat_Load_P16 : unpred_load_predicate<nxv16i1, LDR_PXI>;
	defm Pat_Load_P8 : unpred_load_predicate<nxv8i1, LDR_PXI>;
	defm Pat_Load_P4 : unpred_load_predicate<nxv4i1, LDR_PXI>;
	defm Pat_Load_P2 : unpred_load_predicate<nxv2i1, LDR_PXI>;

	// Selection patterns for predicated contiguous loads. The matched node has
	// the form (Load predicate, address, MemVT) and produces a value of type Ty.
	//   RegRegInst - instruction for the register + register address form.
	//   RegImmInst - instruction for the register + immediate address form.
	//   Ty         - result vector type.
	//   Load       - load node to match.
	//   PredTy     - type of the governing predicate operand.
	//   MemVT      - memory element type operand of the load node.
	//   AddrCP     - complex pattern recognising the register + register address.
	multiclass ld1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty,
	SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
	// reg + reg
	let AddedComplexity = 1 in {
	def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)),
	(RegRegInst PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
	}

	// scalar + immediate (mul vl)
	let AddedComplexity = 2 in {
	def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)),
	(RegImmInst PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
	}

	// base (no offset matched): use the immediate form with offset 0
	def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
	(RegImmInst PPR:$gp, GPR64sp:$base, (i64 0))>;
	}

	// Instantiations of ld1, grouped by lane count. AArch64ld1_z selects the
	// LD1B/LD1H/LD1W/LD1D forms and AArch64ld1s_z the LD1SB/LD1SH/LD1SW forms.
	// The am_sve_regreg_lslN address pattern tracks the memory element size
	// (lsl0 for i8, lsl1 for 16-bit, lsl2 for 32-bit, lsl3 for 64-bit).

	// 2-element contiguous loads
	defm : ld1<LD1B_D, LD1B_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
	defm : ld1<LD1SB_D, LD1SB_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
	defm : ld1<LD1H_D, LD1H_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
	defm : ld1<LD1SH_D, LD1SH_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
	defm : ld1<LD1W_D, LD1W_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
	defm : ld1<LD1SW_D, LD1SW_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
	defm : ld1<LD1D, LD1D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
	defm : ld1<LD1D, LD1D_IMM, nxv2f64, AArch64ld1_z, nxv2i1, nxv2f64, am_sve_regreg_lsl3>;

	// 4-element contiguous loads
	defm : ld1<LD1B_S, LD1B_S_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
	defm : ld1<LD1SB_S, LD1SB_S_IMM, nxv4i32, AArch64ld1s_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
	defm : ld1<LD1H_S, LD1H_S_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
	defm : ld1<LD1SH_S, LD1SH_S_IMM, nxv4i32, AArch64ld1s_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
	defm : ld1<LD1W, LD1W_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
	defm : ld1<LD1W, LD1W_IMM, nxv4f32, AArch64ld1_z, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;

	// 8-element contiguous loads
	defm : ld1<LD1B_H, LD1B_H_IMM, nxv8i16, AArch64ld1_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
	defm : ld1<LD1SB_H, LD1SB_H_IMM, nxv8i16, AArch64ld1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
	defm : ld1<LD1H, LD1H_IMM, nxv8i16, AArch64ld1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
	defm : ld1<LD1H, LD1H_IMM, nxv8f16, AArch64ld1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;

	// The bf16 variant is additionally gated on HasBF16.
	let Predicates = [HasBF16, HasSVE] in {
	defm : ld1<LD1H, LD1H_IMM, nxv8bf16, AArch64ld1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
	}

	// 16-element contiguous loads
	defm : ld1<LD1B, LD1B_IMM, nxv16i8, AArch64ld1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;

	// Selection patterns for predicated non-faulting contiguous loads. Only
	// immediate-offset forms are generated; there is no reg + reg pattern here.
	//   I      - instruction to emit; takes (predicate, base, offset).
	//   Ty     - result vector type.
	//   Load   - load node to match, used as (Load predicate, address, MemVT).
	//   PredTy - type of the governing predicate operand.
	//   MemVT  - memory element type operand of the load node.
	multiclass ldnf1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT> {
	// scalar + immediate (mul vl)
	let AddedComplexity = 1 in {
	def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)),
	(I PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
	}

	// base (no offset matched): emit with immediate offset 0
	def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
	(I PPR:$gp, GPR64sp:$base, (i64 0))>;
	}

	// Instantiations of ldnf1, grouped by lane count. AArch64ldnf1_z selects the
	// LDNF1B/H/W/D forms and AArch64ldnf1s_z the LDNF1SB/SH/SW forms.

	// 2-element contiguous non-faulting loads
	defm : ldnf1<LDNF1B_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i8>;
	defm : ldnf1<LDNF1SB_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i8>;
	defm : ldnf1<LDNF1H_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i16>;
	defm : ldnf1<LDNF1SH_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i16>;
	defm : ldnf1<LDNF1W_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i32>;
	defm : ldnf1<LDNF1SW_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i32>;
	defm : ldnf1<LDNF1D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i64>;
	defm : ldnf1<LDNF1D_IMM, nxv2f64, AArch64ldnf1_z, nxv2i1, nxv2f64>;

	// 4-element contiguous non-faulting loads
	defm : ldnf1<LDNF1B_S_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i8>;
	defm : ldnf1<LDNF1SB_S_IMM, nxv4i32, AArch64ldnf1s_z, nxv4i1, nxv4i8>;
	defm : ldnf1<LDNF1H_S_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i16>;
	defm : ldnf1<LDNF1SH_S_IMM, nxv4i32, AArch64ldnf1s_z, nxv4i1, nxv4i16>;
	defm : ldnf1<LDNF1W_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i32>;
	defm : ldnf1<LDNF1W_IMM, nxv4f32, AArch64ldnf1_z, nxv4i1, nxv4f32>;

	// 8-element contiguous non-faulting loads
	defm : ldnf1<LDNF1B_H_IMM, nxv8i16, AArch64ldnf1_z, nxv8i1, nxv8i8>;
	defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s_z, nxv8i1, nxv8i8>;
	defm : ldnf1<LDNF1H_IMM, nxv8i16, AArch64ldnf1_z, nxv8i1, nxv8i16>;
	defm : ldnf1<LDNF1H_IMM, nxv8f16, AArch64ldnf1_z, nxv8i1, nxv8f16>;

	// The bf16 variant is additionally gated on HasBF16.
	let Predicates = [HasBF16, HasSVE] in {
	defm : ldnf1<LDNF1H_IMM, nxv8bf16, AArch64ldnf1_z, nxv8i1, nxv8bf16>;
	}

	// 16-element contiguous non-faulting loads
	defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1_z, nxv16i1, nxv16i8>;

	// Selection patterns for predicated first-faulting contiguous loads. Only
	// a register + register instruction form is used; the base-only case is
	// expressed by passing XZR as the offset register.
	//   I      - instruction to emit; takes (predicate, base, offset register).
	//   Ty     - result vector type.
	//   Load   - load node to match, used as (Load predicate, address, MemVT).
	//   PredTy - type of the governing predicate operand.
	//   MemVT  - memory element type operand of the load node.
	//   AddrCP - complex pattern recognising the register + register address.
	multiclass ldff1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
	// reg + reg
	let AddedComplexity = 1 in {
	def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)),
	(I PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
	}

	// base (no offset matched): XZR stands in for the offset register
	def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
	(I PPR:$gp, GPR64sp:$base, XZR)>;
	}

	// Instantiations of ldff1, grouped by lane count. AArch64ldff1_z selects the
	// LDFF1B/H/W/D forms and AArch64ldff1s_z the LDFF1SB/SH/SW forms. The
	// am_sve_regreg_lslN address pattern tracks the memory element size.

	// 2-element contiguous first faulting loads
	defm : ldff1<LDFF1B_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
	defm : ldff1<LDFF1SB_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
	defm : ldff1<LDFF1H_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
	defm : ldff1<LDFF1SH_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
	defm : ldff1<LDFF1W_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
	defm : ldff1<LDFF1SW_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
	defm : ldff1<LDFF1D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
	defm : ldff1<LDFF1W_D, nxv2f32, AArch64ldff1_z, nxv2i1, nxv2f32, am_sve_regreg_lsl2>;
	defm : ldff1<LDFF1D, nxv2f64, AArch64ldff1_z, nxv2i1, nxv2f64, am_sve_regreg_lsl3>;

	// 4-element contiguous first faulting loads
	defm : ldff1<LDFF1B_S, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
	defm : ldff1<LDFF1SB_S, nxv4i32, AArch64ldff1s_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
	defm : ldff1<LDFF1H_S, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
	defm : ldff1<LDFF1SH_S, nxv4i32, AArch64ldff1s_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
	defm : ldff1<LDFF1W, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
	defm : ldff1<LDFF1W, nxv4f32, AArch64ldff1_z, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;

	// 8-element contiguous first faulting loads
	defm : ldff1<LDFF1B_H, nxv8i16, AArch64ldff1_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
	defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
	defm : ldff1<LDFF1H, nxv8i16, AArch64ldff1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
	defm : ldff1<LDFF1H, nxv8f16, AArch64ldff1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;

	// The bf16 variant is additionally gated on HasBF16.
	let Predicates = [HasBF16, HasSVE] in {
	defm : ldff1<LDFF1H, nxv8bf16, AArch64ldff1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
	}

	// 16-element contiguous first faulting loads
	defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;

	// Selection patterns for predicated contiguous stores. The matched node has
	// the form (Store value, address, predicate, MemVT).
	//   RegRegInst - instruction for the register + register address form.
	//   RegImmInst - instruction for the register + immediate address form.
	//   Ty         - type of the vector value being stored.
	//   Store      - store node to match.
	//   PredTy     - type of the governing predicate operand.
	//   MemVT      - memory element type operand of the store node.
	//   AddrCP     - complex pattern recognising the register + register address.
	multiclass st1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty,
	SDPatternOperator Store, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
	// reg + reg
	let AddedComplexity = 1 in {
	def : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), MemVT),
	(RegRegInst ZPR:$vec, PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
	}

	// scalar + immediate (mul vl)
	let AddedComplexity = 2 in {
	def : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), MemVT),
	(RegImmInst ZPR:$vec, PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
	}

	// base (no offset matched): use the immediate form with offset 0
	def : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp), MemVT),
	(RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
	}

	// Instantiations of st1, grouped by lane count. All use the AArch64st1 node;
	// only integer value types are instantiated in this list. A memory type
	// narrower than the value type selects the narrower ST1B/ST1H/ST1W form.

	// 2-element contiguous store
	defm : st1<ST1B_D, ST1B_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
	defm : st1<ST1H_D, ST1H_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
	defm : st1<ST1W_D, ST1W_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
	defm : st1<ST1D, ST1D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;

	// 4-element contiguous store
	defm : st1<ST1B_S, ST1B_S_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
	defm : st1<ST1H_S, ST1H_S_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
	defm : st1<ST1W, ST1W_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;

	// 8-element contiguous store
	defm : st1<ST1B_H, ST1B_H_IMM, nxv8i16, AArch64st1, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
	defm : st1<ST1H, ST1H_IMM, nxv8i16, AArch64st1, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;

	// 16-element contiguous store
	defm : st1<ST1B, ST1B_IMM, nxv16i8, AArch64st1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;

	// Insert an integer scalar held in an FP register (FPR32/FPR64) into lane 0
	// of an undef vector: place the register into the ssub/dsub subregister of
	// an IMPLICIT_DEF vector with INSERT_SUBREG.
	def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)),
	(INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
	def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)),
	(INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
	def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)),
	(INSERT_SUBREG (nxv4i32 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
	def : Pat<(nxv2i64 (vector_insert (nxv2i64 (undef)), (i64 FPR64:$src), 0)),
	(INSERT_SUBREG (nxv2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;

	// Insert scalar into vector[0]
	// Integer case: CPY_ZPmR_* copies the GPR into $vec under (PTRUE_* 1).
	// NOTE(review): pattern value 1 presumably activates only the first lane
	// -- confirm against the PTRUE operand encoding.
	def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), (i32 GPR32:$src), 0)),
	(CPY_ZPmR_B ZPR:$vec, (PTRUE_B 1), GPR32:$src)>;
	def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), (i32 GPR32:$src), 0)),
	(CPY_ZPmR_H ZPR:$vec, (PTRUE_H 1), GPR32:$src)>;
	def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), (i32 GPR32:$src), 0)),
	(CPY_ZPmR_S ZPR:$vec, (PTRUE_S 1), GPR32:$src)>;
	def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), (i64 GPR64:$src), 0)),
	(CPY_ZPmR_D ZPR:$vec, (PTRUE_D 1), GPR64:$src)>;

	// FP case: the scalar is first placed in the hsub/ssub/dsub subregister of
	// an IMPLICIT_DEF vector, then SEL_ZPZZ_* chooses between that vector and
	// $vec under (PTRUE_* 1).
	def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), 0)),
	(SEL_ZPZZ_H (PTRUE_H 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), ZPR:$vec)>;
	def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), 0)),
	(SEL_ZPZZ_S (PTRUE_S 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), ZPR:$vec)>;
	def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), 0)),
	(SEL_ZPZZ_D (PTRUE_D 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), ZPR:$vec)>;

	// Insert scalar into vector with scalar index
	// The governing predicate is built by comparing (CMPEQ_PPzZZ_*) an index
	// vector (INDEX_II_* 0, 1) against $index broadcast with DUP_ZR_*; the scalar
	// is then copied into $vec under that predicate with CPY_ZPmR_*.
	// For B/H/S elements the 64-bit index is narrowed to its sub_32 half before
	// the broadcast; the D form broadcasts the GPR64 directly.
	def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), GPR32:$src, GPR64:$index)),
	(CPY_ZPmR_B ZPR:$vec,
	(CMPEQ_PPzZZ_B (PTRUE_B 31),
	(INDEX_II_B 0, 1),
	(DUP_ZR_B (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
	GPR32:$src)>;
	def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), GPR32:$src, GPR64:$index)),
	(CPY_ZPmR_H ZPR:$vec,
	(CMPEQ_PPzZZ_H (PTRUE_H 31),
	(INDEX_II_H 0, 1),
	(DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
	GPR32:$src)>;
	def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), GPR32:$src, GPR64:$index)),
	(CPY_ZPmR_S ZPR:$vec,
	(CMPEQ_PPzZZ_S (PTRUE_S 31),
	(INDEX_II_S 0, 1),
	(DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
	GPR32:$src)>;
	def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), GPR64:$src, GPR64:$index)),
	(CPY_ZPmR_D ZPR:$vec,
	(CMPEQ_PPzZZ_D (PTRUE_D 31),
	(INDEX_II_D 0, 1),
	(DUP_ZR_D GPR64:$index)),
	GPR64:$src)>;

	// Insert FP scalar into vector with scalar index
	// Same predicate construction as the integer patterns, but the copy uses
	// CPY_ZPmV_* because the scalar source is an FP register.
	def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)),
	(CPY_ZPmV_H ZPR:$vec,
	(CMPEQ_PPzZZ_H (PTRUE_H 31),
	(INDEX_II_H 0, 1),
	(DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
	$src)>;
	def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)),
	(CPY_ZPmV_S ZPR:$vec,
	(CMPEQ_PPzZZ_S (PTRUE_S 31),
	(INDEX_II_S 0, 1),
	(DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
	$src)>;
	def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), GPR64:$index)),
	(CPY_ZPmV_D ZPR:$vec,
	(CMPEQ_PPzZZ_D (PTRUE_D 31),
	(INDEX_II_D 0, 1),
	(DUP_ZR_D $index)),
	$src)>;

	// Extract element from vector with immediate index
	// DUP_ZZI_* is applied to $vec with the immediate index, and the scalar is
	// then read from the low subregister (hsub/ssub/dsub) of that result. Both
	// integer and FP results are read via an FP subregister index.
	def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)),
	(EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>;
	def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
	(EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), ssub)>;
	def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
	(EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
	def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
	(EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
	def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
	(EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
	def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
	(EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
	def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
	(EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;

	// Extract element from vector with scalar index
	// A predicate is produced by WHILELS_PXX_* from XZR and $index, and LASTB
	// then reads an element of $vec under that predicate. Integer results use
	// LASTB_RPZ_*; FP results (below) use LASTB_VPZ_*.
	// NOTE(review): this presumably relies on WHILELS making lanes 0..$index
	// active so that LASTB returns lane $index -- confirm.
	def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
	(LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index),
	ZPR:$vec)>;
	def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
	(LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
	ZPR:$vec)>;
	def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
	(LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
	ZPR:$vec)>;
	def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
	(LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
	ZPR:$vec)>;

	// FP element types: same predicate construction, LASTB_VPZ_* result.
	def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
	(LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
	ZPR:$vec)>;
	def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
	(LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
	ZPR:$vec)>;
	def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
	(LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
	ZPR:$vec)>;
	}

	// Integer matrix multiply-accumulate (smmla/ummla/usmmla) and mixed-sign dot
	// product (usdot, plus indexed usdot/sudot) definitions. Available only
	// when both HasSVE and HasMatMulInt8 hold; each is bound to its
	// int_aarch64_sve_* intrinsic.
	let Predicates = [HasSVE, HasMatMulInt8] in {
	defm SMMLA_ZZZ : sve_int_matmul<0b00, "smmla", int_aarch64_sve_smmla>;
	defm UMMLA_ZZZ : sve_int_matmul<0b11, "ummla", int_aarch64_sve_ummla>;
	defm USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla", int_aarch64_sve_usmmla>;
	defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", int_aarch64_sve_usdot>;
	defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>;
	defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>;
	}

	// Single-precision fmmla (nxv4f32 on ZPR32); requires HasSVE and
	// HasMatMulFP32.
	let Predicates = [HasSVE, HasMatMulFP32] in {
	defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0, "fmmla", ZPR32, int_aarch64_sve_fmmla, nxv4f32>;
	}

	// Definitions gated on HasSVE and HasMatMulFP64: double-precision fmmla
	// (nxv2f64 on ZPR64), the ld1ro{b,h,w,d} loads in scalar+immediate (_IMM)
	// and scalar+scalar forms, and the "_Q" variants of zip1/zip2/uzp1/uzp2/
	// trn1/trn2 bound to the int_aarch64_sve_*q intrinsics.
	let Predicates = [HasSVE, HasMatMulFP64] in {
	defm FMMLA_ZZZ_D : sve_fp_matrix_mla<1, "fmmla", ZPR64, int_aarch64_sve_fmmla, nxv2f64>;
	defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8, nxv16i8, nxv16i1, AArch64ld1ro_z>;
	defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16, nxv8i16, nxv8i1, AArch64ld1ro_z>;
	defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32, nxv4i32, nxv4i1, AArch64ld1ro_z>;
	defm LD1RO_D_IMM : sve_mem_ldor_si<0b11, "ld1rod", Z_d, ZPR64, nxv2i64, nxv2i1, AArch64ld1ro_z>;
	defm LD1RO_B : sve_mem_ldor_ss<0b00, "ld1rob", Z_b, ZPR8, GPR64NoXZRshifted8, nxv16i8, nxv16i1, AArch64ld1ro_z, am_sve_regreg_lsl0>;
	defm LD1RO_H : sve_mem_ldor_ss<0b01, "ld1roh", Z_h, ZPR16, GPR64NoXZRshifted16, nxv8i16, nxv8i1, AArch64ld1ro_z, am_sve_regreg_lsl1>;
	defm LD1RO_W : sve_mem_ldor_ss<0b10, "ld1row", Z_s, ZPR32, GPR64NoXZRshifted32, nxv4i32, nxv4i1, AArch64ld1ro_z, am_sve_regreg_lsl2>;
	defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64, nxv2i64, nxv2i1, AArch64ld1ro_z, am_sve_regreg_lsl3>;
	defm ZIP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 0, "zip1", int_aarch64_sve_zip1q>;
	defm ZIP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2", int_aarch64_sve_zip2q>;
	defm UZP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1", int_aarch64_sve_uzp1q>;
	defm UZP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2", int_aarch64_sve_uzp2q>;
	defm TRN1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1", int_aarch64_sve_trn1q>;
	defm TRN2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2", int_aarch64_sve_trn2q>;
	}

	// nxv8bf16 patterns for the *q permute intrinsics. They select the existing
	// *_ZZZ_Q instructions and additionally require HasBF16.
	let Predicates = [HasSVE, HasMatMulFP64, HasBF16] in {
	def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_zip1q, nxv8bf16, nxv8bf16, ZIP1_ZZZ_Q>;
	def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_zip2q, nxv8bf16, nxv8bf16, ZIP2_ZZZ_Q>;
	def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_uzp1q, nxv8bf16, nxv8bf16, UZP1_ZZZ_Q>;
	def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_uzp2q, nxv8bf16, nxv8bf16, UZP2_ZZZ_Q>;
	def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_trn1q, nxv8bf16, nxv8bf16, TRN1_ZZZ_Q>;
	def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_trn2q, nxv8bf16, nxv8bf16, TRN2_ZZZ_Q>;
	}

	let Predicates = [HasSVE2] in {
	// SVE2 integer multiply-add (indexed)
	defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>;
	defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", int_aarch64_sve_mls_lane>;

	// SVE2 saturating multiply-add high (indexed)
	defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah_lane>;
	defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh_lane>;

	// SVE2 saturating multiply-add high (vectors, unpredicated)
	defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah>;
	defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh>;

	// SVE2 integer multiply (indexed)
	defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul", int_aarch64_sve_mul_lane>;

	// SVE2 saturating multiply high (indexed)
	defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh", int_aarch64_sve_sqdmulh_lane>;
	defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh", int_aarch64_sve_sqrdmulh_lane>;

	// SVE2 signed saturating doubling multiply high (unpredicated)
	defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh", int_aarch64_sve_sqdmulh>;
	defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh", int_aarch64_sve_sqrdmulh>;

	// SVE2 integer multiply vectors (unpredicated)
	defm MUL_ZZZ : sve2_int_mul<0b000, "mul", mul>;
	defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh", null_frag>;
	defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag>;
	defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>;

	// Add patterns for unpredicated version of smulh and umulh.
	// When the predicate operand of the int_aarch64_sve_smulh/umulh intrinsic is
	// (AArch64ptrue 31), select the SMULH_ZZZ_*/UMULH_ZZZ_* instruction, which is
	// given only the two vector operands.
	def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
	(SMULH_ZZZ_B $Op1, $Op2)>;
	def : Pat<(nxv8i16 (int_aarch64_sve_smulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
	(SMULH_ZZZ_H $Op1, $Op2)>;
	def : Pat<(nxv4i32 (int_aarch64_sve_smulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
	(SMULH_ZZZ_S $Op1, $Op2)>;
	def : Pat<(nxv2i64 (int_aarch64_sve_smulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
	(SMULH_ZZZ_D $Op1, $Op2)>;
	def : Pat<(nxv16i8 (int_aarch64_sve_umulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
	(UMULH_ZZZ_B $Op1, $Op2)>;
	def : Pat<(nxv8i16 (int_aarch64_sve_umulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
	(UMULH_ZZZ_H $Op1, $Op2)>;
	def : Pat<(nxv4i32 (int_aarch64_sve_umulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
	(UMULH_ZZZ_S $Op1, $Op2)>;
	def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
	(UMULH_ZZZ_D $Op1, $Op2)>;
	// SVE2 complex integer dot product (indexed)
	defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>;

	// SVE2 complex integer dot product
	defm CDOT_ZZZ : sve2_cintx_dot<"cdot", int_aarch64_sve_cdot>;

	// SVE2 complex integer multiply-add (indexed)
	defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla", int_aarch64_sve_cmla_lane_x>;
	// SVE2 complex saturating multiply-add (indexed)
	defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_lane_x>;

	// SVE2 complex integer multiply-add
	defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla", int_aarch64_sve_cmla_x>;
	defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_x>;

	// SVE2 integer multiply long (indexed)
	defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb", int_aarch64_sve_smullb_lane>;
	defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt", int_aarch64_sve_smullt_lane>;
	defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb", int_aarch64_sve_umullb_lane>;
	defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt", int_aarch64_sve_umullt_lane>;

	// SVE2 saturating multiply (indexed)
	defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb", int_aarch64_sve_sqdmullb_lane>;
	defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt", int_aarch64_sve_sqdmullt_lane>;

	// SVE2 integer multiply-add long (indexed)
	defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb", int_aarch64_sve_smlalb_lane>;
	defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt", int_aarch64_sve_smlalt_lane>;
	defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb", int_aarch64_sve_umlalb_lane>;
	defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt", int_aarch64_sve_umlalt_lane>;
	defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb", int_aarch64_sve_smlslb_lane>;
	defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt", int_aarch64_sve_smlslt_lane>;
	defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb", int_aarch64_sve_umlslb_lane>;
	defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt", int_aarch64_sve_umlslt_lane>;

	// SVE2 integer multiply-add long (vectors, unpredicated)
	defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb", int_aarch64_sve_smlalb>;
	defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt", int_aarch64_sve_smlalt>;
	defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb", int_aarch64_sve_umlalb>;
	defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt", int_aarch64_sve_umlalt>;
	defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb", int_aarch64_sve_smlslb>;
	defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt", int_aarch64_sve_smlslt>;
	defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb", int_aarch64_sve_umlslb>;
	defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt", int_aarch64_sve_umlslt>;

	// SVE2 saturating multiply-add long (indexed)
	defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb", int_aarch64_sve_sqdmlalb_lane>;
	defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt", int_aarch64_sve_sqdmlalt_lane>;
	defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb", int_aarch64_sve_sqdmlslb_lane>;
	defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt", int_aarch64_sve_sqdmlslt_lane>;

	// SVE2 saturating multiply-add long (vectors, unpredicated)
	defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb", int_aarch64_sve_sqdmlalb>;
	defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt", int_aarch64_sve_sqdmlalt>;
	defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb", int_aarch64_sve_sqdmlslb>;
	defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt", int_aarch64_sve_sqdmlslt>;

	// SVE2 saturating multiply-add interleaved long
	defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt", int_aarch64_sve_sqdmlalbt>;
	defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt", int_aarch64_sve_sqdmlslbt>;

	// SVE2 integer halving add/subtract (predicated)
	defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd", int_aarch64_sve_shadd>;
	defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd", int_aarch64_sve_uhadd>;
	defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub", int_aarch64_sve_shsub>;
	defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub", int_aarch64_sve_uhsub>;
	defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", int_aarch64_sve_srhadd>;
	defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", int_aarch64_sve_urhadd>;
	defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr", int_aarch64_sve_shsubr>;
	defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr", int_aarch64_sve_uhsubr>;

	// SVE2 integer pairwise add and accumulate long
	defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp", int_aarch64_sve_sadalp>;
	defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp", int_aarch64_sve_uadalp>;

	// SVE2 integer pairwise arithmetic
	defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp", int_aarch64_sve_addp>;
	defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp", int_aarch64_sve_smaxp>;
	defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp", int_aarch64_sve_umaxp>;
	defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp", int_aarch64_sve_sminp>;
	defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp", int_aarch64_sve_uminp>;

	// SVE2 integer unary operations (predicated)
	defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe", int_aarch64_sve_urecpe>;
	defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte", int_aarch64_sve_ursqrte>;
	defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs", int_aarch64_sve_sqabs>;
	defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg", int_aarch64_sve_sqneg>;

	// SVE2 saturating add/subtract
	defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd", int_aarch64_sve_sqadd>;
	defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd", int_aarch64_sve_uqadd>;
	defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub", int_aarch64_sve_sqsub>;
	defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub", int_aarch64_sve_uqsub>;
	defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd", int_aarch64_sve_suqadd>;
	defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd", int_aarch64_sve_usqadd>;
	defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr", int_aarch64_sve_sqsubr>;
	defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr", int_aarch64_sve_uqsubr>;

	// SVE2 saturating/rounding bitwise shift left (predicated)
	defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl", int_aarch64_sve_srshl>;
	defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl", int_aarch64_sve_urshl>;
	defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr", null_frag>;
	defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr", null_frag>;
	defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl", int_aarch64_sve_sqshl>;
	defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl", int_aarch64_sve_uqshl>;
	defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl", int_aarch64_sve_sqrshl>;
	defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl", int_aarch64_sve_uqrshl>;
	defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr", null_frag>;
	defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr", null_frag>;
	defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr", null_frag>;
	defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag>;

	// *_ZPZI definitions for the SVE2 predicated immediate shifts, defined only
	// when UseExperimentalZeroingPseudos is also set. SQSHL and UQSHL are
	// instantiated with null_frag rather than an intrinsic; SRSHR, URSHR and
	// SQSHLU are bound to their int_aarch64_sve_* intrinsics.
	let Predicates = [HasSVE2, UseExperimentalZeroingPseudos] in {
	defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
	defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
	defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_srshr>;
	defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_urshr>;
	defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<int_aarch64_sve_sqshlu>;
	}

	// SVE2 predicated shifts
	defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl", "SQSHL_ZPZI">;
	defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl", "UQSHL_ZPZI">;
	defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>;
	defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>;
	defm SQSHLU_ZPmI : sve2_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>;

	// SVE2 integer add/subtract long
	defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb", int_aarch64_sve_saddlb>;
	defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt", int_aarch64_sve_saddlt>;
	defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb", int_aarch64_sve_uaddlb>;
	defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt", int_aarch64_sve_uaddlt>;
	defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb", int_aarch64_sve_ssublb>;
	defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt", int_aarch64_sve_ssublt>;
	defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb", int_aarch64_sve_usublb>;
	defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt", int_aarch64_sve_usublt>;
	defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb", int_aarch64_sve_sabdlb>;
	defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt", int_aarch64_sve_sabdlt>;
	defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb", int_aarch64_sve_uabdlb>;
	defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt", int_aarch64_sve_uabdlt>;

	// SVE2 integer add/subtract wide
	defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb", int_aarch64_sve_saddwb>;
	defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt", int_aarch64_sve_saddwt>;
	defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb", int_aarch64_sve_uaddwb>;
	defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt", int_aarch64_sve_uaddwt>;
	defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb", int_aarch64_sve_ssubwb>;
	defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt", int_aarch64_sve_ssubwt>;
	defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb", int_aarch64_sve_usubwb>;
	defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt", int_aarch64_sve_usubwt>;

	// SVE2 integer multiply long
	defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>;
	defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt", int_aarch64_sve_sqdmullt>;
	defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb", int_aarch64_sve_smullb>;
	defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt", int_aarch64_sve_smullt>;
	defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb", int_aarch64_sve_umullb>;
	defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt", int_aarch64_sve_umullt>;
	defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb", int_aarch64_sve_pmullb_pair>;
	defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt", int_aarch64_sve_pmullt_pair>;

	// SVE2 bitwise shift and insert
	defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri", int_aarch64_sve_sri>;
	defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli", int_aarch64_sve_sli>;

	// SVE2 bitwise shift right and accumulate
	defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", int_aarch64_sve_ssra>;
	defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", int_aarch64_sve_usra>;
	defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra>;
	defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra>;

	// SVE2 complex integer add
	defm CADD_ZZI : sve2_int_cadd<0b0, "cadd", int_aarch64_sve_cadd_x>;
	defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd", int_aarch64_sve_sqcadd_x>;

	// SVE2 integer absolute difference and accumulate
	defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", int_aarch64_sve_saba>;
	defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", int_aarch64_sve_uaba>;

	// SVE2 integer absolute difference and accumulate long
	defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb", int_aarch64_sve_sabalb>;
	defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt", int_aarch64_sve_sabalt>;
	defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb", int_aarch64_sve_uabalb>;
	defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt", int_aarch64_sve_uabalt>;

	// SVE2 integer add/subtract long with carry
	defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb", int_aarch64_sve_adclb>;
	defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt", int_aarch64_sve_adclt>;
	defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb", int_aarch64_sve_sbclb>;
	defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt", int_aarch64_sve_sbclt>;

	// SVE2 bitwise shift right narrow (bottom)
	defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb", int_aarch64_sve_sqshrunb>;
	defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb", int_aarch64_sve_sqrshrunb>;
	defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb", int_aarch64_sve_shrnb>;
	defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb", int_aarch64_sve_rshrnb>;
	defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb", int_aarch64_sve_sqshrnb>;
	defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb", int_aarch64_sve_sqrshrnb>;
	defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb", int_aarch64_sve_uqshrnb>;
	defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb", int_aarch64_sve_uqrshrnb>;

	// SVE2 bitwise shift right narrow (top)
	defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt", int_aarch64_sve_sqshrunt>;
	defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt", int_aarch64_sve_sqrshrunt>;
	defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt", int_aarch64_sve_shrnt>;
	defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt", int_aarch64_sve_rshrnt>;
	defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt", int_aarch64_sve_sqshrnt>;
	defm SQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt", int_aarch64_sve_sqrshrnt>;
	defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt", int_aarch64_sve_uqshrnt>;
	defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt", int_aarch64_sve_uqrshrnt>;

	// SVE2 integer add/subtract narrow high part (bottom)
	defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb", int_aarch64_sve_addhnb>;
	defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb", int_aarch64_sve_raddhnb>;
	defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb", int_aarch64_sve_subhnb>;
	defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb", int_aarch64_sve_rsubhnb>;

	// SVE2 integer add/subtract narrow high part (top)
	defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt", int_aarch64_sve_addhnt>;
	defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt", int_aarch64_sve_raddhnt>;
	defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt", int_aarch64_sve_subhnt>;
	defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt", int_aarch64_sve_rsubhnt>;

	// SVE2 saturating extract narrow (bottom)
	defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb", int_aarch64_sve_sqxtnb>;
	defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb", int_aarch64_sve_uqxtnb>;
	defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb", int_aarch64_sve_sqxtunb>;

	// SVE2 saturating extract narrow (top)
	defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt", int_aarch64_sve_sqxtnt>;
	defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt", int_aarch64_sve_uqxtnt>;
	defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>;

	// SVE2 character match
	defm MATCH_PPzZZ : sve2_char_match<0b0, "match", int_aarch64_sve_match>;
	defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch", int_aarch64_sve_nmatch>;

	// SVE2 bitwise exclusive-or interleaved
	defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt", int_aarch64_sve_eorbt>;
	defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb", int_aarch64_sve_eortb>;

	// SVE2 bitwise shift left long
	defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb", int_aarch64_sve_sshllb>;
	defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt", int_aarch64_sve_sshllt>;
	defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb", int_aarch64_sve_ushllb>;
	defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt", int_aarch64_sve_ushllt>;

	// SVE2 integer add/subtract interleaved long
	defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>;
	defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt", int_aarch64_sve_ssublbt>;
	defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb", int_aarch64_sve_ssubltb>;

	// SVE2 histogram generation (segment)
	def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg", int_aarch64_sve_histseg>;

	// SVE2 histogram generation (vector)
	defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt", int_aarch64_sve_histcnt>;

	// SVE2 floating-point base 2 logarithm as integer
	defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>;

	// SVE2 floating-point convert precision
	defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding_top<"fcvtxnt", "int_aarch64_sve_fcvtxnt">;
	defm FCVTX_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtx", "int_aarch64_sve_fcvtx">;
	defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt", "int_aarch64_sve_fcvtnt">;
	defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt", "int_aarch64_sve_fcvtlt">;

	// SVE2 floating-point pairwise operations
	defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp", int_aarch64_sve_faddp>;
	defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp", int_aarch64_sve_fmaxnmp>;
	defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp", int_aarch64_sve_fminnmp>;
	defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp", int_aarch64_sve_fmaxp>;
	defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp", int_aarch64_sve_fminp>;

	// SVE2 floating-point multiply-add long (indexed)
	defm FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb", int_aarch64_sve_fmlalb_lane>;
	defm FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt", int_aarch64_sve_fmlalt_lane>;
	defm FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb", int_aarch64_sve_fmlslb_lane>;
	defm FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt", int_aarch64_sve_fmlslt_lane>;

	// SVE2 floating-point multiply-add long
	defm FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb", int_aarch64_sve_fmlalb>;
	defm FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt", int_aarch64_sve_fmlalt>;
	defm FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb", int_aarch64_sve_fmlslb>;
	defm FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt", int_aarch64_sve_fmlslt>;

	// SVE2 bitwise ternary operations
	defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", int_aarch64_sve_eor3>;
	defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", int_aarch64_sve_bcax>;
	defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl>;
	defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>;
	defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>;
	defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>;

	// SVE2 bitwise xor and rotate right by immediate
	defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>;

	// SVE2 extract vector (immediate offset, constructive)
	def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;

	// SVE2 non-temporal gather loads
	defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv4i8>;
	defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather_z, nxv4i8>;
	defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv4i16>;
	defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather_z, nxv4i16>;
	defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather_z, nxv4i32>;

	defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv2i8>;
	defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather_z, nxv2i8>;
	defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv2i16>;
	defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather_z, nxv2i16>;
	defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather_z, nxv2i32>;
	defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather_z, nxv2i32>;
	defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather_z, nxv2i64>;

	// SVE2 vector splice (constructive)
	defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;

	// SVE2 non-temporal scatter stores
	defm STNT1B_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>;
	defm STNT1H_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>;
	defm STNT1W_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>;

	defm STNT1B_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>;
	defm STNT1H_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>;
	defm STNT1W_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>;
	defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>;

	// SVE2 table lookup (three sources)
	defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>;
	defm TBX_ZZZ : sve2_int_perm_tbx<"tbx", int_aarch64_sve_tbx>;

	let Predicates = [HasSVE, HasBF16] in {
	def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_tbx, nxv8bf16, nxv8bf16, nxv8i16, TBX_ZZZ_H>;
	def : Pat<(nxv8bf16 (int_aarch64_sve_tbl2 nxv8bf16:$Op1, nxv8bf16:$Op2, nxv8i16:$Op3)),
	(nxv8bf16 (TBL_ZZZZ_H (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, nxv8bf16:$Op2, zsub1),
	nxv8i16:$Op3))>;
	}

	// SVE2 integer compare scalar count and limit
	defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege", int_aarch64_sve_whilege>;
	defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt", int_aarch64_sve_whilegt>;
	defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs", int_aarch64_sve_whilehs>;
	defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi", int_aarch64_sve_whilehi>;

	defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege", int_aarch64_sve_whilege>;
	defm WHILEGT_PXX : sve_int_while8_rr<0b001, "whilegt", int_aarch64_sve_whilegt>;
	defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs", int_aarch64_sve_whilehs>;
	defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi>;

	// SVE2 pointer conflict compare
	defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">;
	defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">;
	}

	// Everything in this block is gated on the HasSVE2AES predicate.
	let Predicates = [HasSVE2AES] in {
	// SVE2 crypto destructive binary operations
	defm AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8, int_aarch64_sve_aese, nxv16i8>;
	defm AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8, int_aarch64_sve_aesd, nxv16i8>;

	// SVE2 crypto unary operations
	defm AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc", int_aarch64_sve_aesmc>;
	defm AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc", int_aarch64_sve_aesimc>;

	// PMULLB and PMULLT instructions which operate with 64-bit source and
	// 128-bit destination elements are enabled with crypto extensions, similar
	// to NEON PMULL2 instruction.
	defm PMULLB_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11010, "pmullb", int_aarch64_sve_pmullb_pair>;
	defm PMULLT_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11011, "pmullt", int_aarch64_sve_pmullt_pair>;
	}

	// Everything in this block is gated on the HasSVE2SM4 predicate.
	let Predicates = [HasSVE2SM4] in {
	// SVE2 crypto constructive binary operations
	defm SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32, int_aarch64_sve_sm4ekey, nxv4i32>;
	// SVE2 crypto destructive binary operations
	defm SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32, int_aarch64_sve_sm4e, nxv4i32>;
	}

	// Everything in this block is gated on the HasSVE2SHA3 predicate.
	let Predicates = [HasSVE2SHA3] in {
	// SVE2 crypto constructive binary operations
	defm RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64, int_aarch64_sve_rax1, nxv2i64>;
	}

	// Everything in this block is gated on the HasSVE2BitPerm predicate.
	let Predicates = [HasSVE2BitPerm] in {
	// SVE2 bitwise permute
	defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext", int_aarch64_sve_bext_x>;
	defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep", int_aarch64_sve_bdep_x>;
	defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp", int_aarch64_sve_bgrp_x>;
	}
	diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackOffset.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackOffset.h
	index 6fa1c744f77e..24751a81797d 100644
	--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackOffset.h
	+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackOffset.h
	@@ -1,139 +1,151 @@
	//==--AArch64StackOffset.h ---------------------------------------*- C++ -*-==//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the declaration of the StackOffset class, which is used to
	// describe scalable and non-scalable offsets during frame lowering.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64STACKOFFSET_H
	#define LLVM_LIB_TARGET_AARCH64_AARCH64STACKOFFSET_H

	#include "llvm/Support/MachineValueType.h"
	#include "llvm/Support/TypeSize.h"
	#include <cassert>

	namespace llvm {

	/// StackOffset is a wrapper around scalable and non-scalable offsets and is
	/// used in several functions such as 'isAArch64FrameOffsetLegal' and
	/// 'emitFrameOffset()'. StackOffsets are described by MVTs, e.g.
	//
	/// StackOffset(1, MVT::nxv16i8)
	//
	/// would describe an offset as being the size of a single SVE vector.
	///
	/// The class also implements simple arithmetic (addition/subtraction) on these
	/// offsets, e.g.
	//
	/// StackOffset(1, MVT::nxv16i8) + StackOffset(1, MVT::i64)
	//
	/// describes an offset that spans the combined storage required for an SVE
	/// vector and a 64bit GPR.
	class StackOffset {
	  /// Fixed (non-scalable) part of the offset, in bytes.
	  int64_t Bytes;
	  /// Scalable part of the offset, in bytes of the known-minimum type size.
	  int64_t ScalableBytes;

	  // Declared private and not defined in this class body.
	  // NOTE(review): presumably this blocks accidental use of a StackOffset as
	  // a plain integer -- confirm there is no out-of-line definition.
	  explicit operator int() const;

	public:
	  /// A (count, type) pair: \c first elements of the MVT in \c second.
	  using Part = std::pair<int64_t, MVT>;

	  /// Creates a zero offset.
	  StackOffset() : Bytes(0), ScalableBytes(0) {}

	  /// Creates an offset of \p Offset elements of type \p T. \p T must be a
	  /// whole number of bytes wide.
	  StackOffset(int64_t Offset, MVT::SimpleValueType T) : StackOffset() {
	    assert(MVT(T).isByteSized() && "Offset type is not a multiple of bytes");
	    *this += Part(Offset, T);
	  }

	  StackOffset(const StackOffset &Other)
	      : Bytes(Other.Bytes), ScalableBytes(Other.ScalableBytes) {}

	  StackOffset &operator=(const StackOffset &) = default;

	  /// Adds \c Other.first elements of type \c Other.second. Scalable types
	  /// contribute to the scalable part, all other types to the fixed part.
	  StackOffset &operator+=(const StackOffset::Part &Other) {
	    const TypeSize Size = Other.second.getSizeInBits();
	    if (Size.isScalable())
	      ScalableBytes += Other.first * ((int64_t)Size.getKnownMinSize() / 8);
	    else
	      Bytes += Other.first * ((int64_t)Size.getFixedSize() / 8);
	    return *this;
	  }

	  /// Component-wise addition of the fixed and scalable parts.
	  StackOffset &operator+=(const StackOffset &Other) {
	    Bytes += Other.Bytes;
	    ScalableBytes += Other.ScalableBytes;
	    return *this;
	  }

	  StackOffset operator+(const StackOffset &Other) const {
	    StackOffset Res(*this);
	    Res += Other;
	    return Res;
	  }

	  /// Component-wise subtraction of the fixed and scalable parts.
	  StackOffset &operator-=(const StackOffset &Other) {
	    Bytes -= Other.Bytes;
	    ScalableBytes -= Other.ScalableBytes;
	    return *this;
	  }

	  StackOffset operator-(const StackOffset &Other) const {
	    StackOffset Res(*this);
	    Res -= Other;
	    return Res;
	  }

	  /// Negation, computed as (zero offset) - *this.
	  StackOffset operator-() const {
	    StackOffset Res = {};
	    const StackOffset Other(*this);
	    Res -= Other;
	    return Res;
	  }

	  /// Returns the scalable part of the offset in bytes.
	  int64_t getScalableBytes() const { return ScalableBytes; }

	  /// Returns the non-scalable part of the offset in bytes.
	  int64_t getBytes() const { return Bytes; }

	  /// Returns the offset in parts to which this frame offset can be
	  /// decomposed for the purpose of describing a frame offset.
	  /// For non-scalable offsets this is simply its byte size.
	  void getForFrameOffset(int64_t &NumBytes, int64_t &NumPredicateVectors,
	                         int64_t &NumDataVectors) const {
	    assert(isValid() && "Invalid frame offset");

	    NumBytes = Bytes;
	    NumDataVectors = 0;
	    NumPredicateVectors = ScalableBytes / 2;
	    // This method is used to get the offsets to adjust the frame offset.
	    // If the function requires ADDPL to be used and needs more than two ADDPL
	    // instructions, part of the offset is folded into NumDataVectors so that it
	    // uses ADDVL for part of it, reducing the number of ADDPL instructions.
	    if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
	        NumPredicateVectors > 62) {
	      NumDataVectors = NumPredicateVectors / 8;
	      NumPredicateVectors -= NumDataVectors * 8;
	    }
	  }

	+  void getForDwarfOffset(int64_t &ByteSized, int64_t &VGSized) const {
	+    assert(isValid() && "Invalid frame offset");
	+
	+    // VGSized offsets are divided by '2', because the VG register is the
	+    // number of 64bit granules as opposed to 128bit vector chunks,
	+    // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
	+    // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
	+    // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
	+    ByteSized = Bytes;
	+    VGSized = ScalableBytes / 2;
	+  }
	+
	  /// Returns whether the offset is non-zero, i.e. whether either the fixed
	  /// or the scalable part is non-zero.
	  explicit operator bool() const { return Bytes || ScalableBytes; }

	  bool isValid() const {
	    // The smallest scalable element supported by scaled SVE addressing
	    // modes are predicates, which are 2 scalable bytes in size. So the scalable
	    // byte offset must always be a multiple of 2.
	    return ScalableBytes % 2 == 0;
	  }
	};

	} // end namespace llvm

	#endif
	diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
	index 74fe0cdd1ea7..0245dd1d611a 100644
	--- a/contrib/llvm-project/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
	+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
	@@ -1,265 +1,265 @@
	//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// Performs general IR level optimizations on SVE intrinsics.
	//
	// The main goal of this pass is to remove unnecessary reinterpret
	// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
	//
	// %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
	// %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
	//
	// This pass also looks for ptest intrinsics & phi instructions where the
	// operands are being needlessly converted to and from svbool_t.
	//
	//===----------------------------------------------------------------------===//

	#include "Utils/AArch64BaseInfo.h"
	#include "llvm/ADT/PostOrderIterator.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/IntrinsicsAArch64.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/InitializePasses.h"
	#include "llvm/Support/Debug.h"

	using namespace llvm;
	using namespace llvm::PatternMatch;

	#define DEBUG_TYPE "sve-intrinsic-opts"

	namespace llvm {
	// Forward declaration of the pass initializer; it is called from the
	// SVEIntrinsicOpts constructor in this file.
	void initializeSVEIntrinsicOptsPass(PassRegistry &);
	}

	namespace {
	/// Module pass performing IR-level clean-ups on SVE intrinsics: redundant
	/// svbool reinterprets, and ptest/phi operands converted to svbool.
	struct SVEIntrinsicOpts : public ModulePass {
	  static char ID; // Pass identification, replacement for typeid
	  SVEIntrinsicOpts() : ModulePass(ID) {
	    initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry());
	  }

	  bool runOnModule(Module &M) override;
	  void getAnalysisUsage(AnalysisUsage &AU) const override;

	private:
	  /// Returns \p V as an IntrinsicInst when it is a convert.to.svbool call,
	  /// nullptr otherwise.
	  static IntrinsicInst *isReinterpretToSVBool(Value *V);

	  /// Dispatches \p I to the matching optimization; returns true on change.
	  static bool optimizeIntrinsic(Instruction *I);

	  /// Runs optimizeIntrinsic over every instruction of \p Functions.
	  bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);

	  static bool optimizeConvertFromSVBool(IntrinsicInst *I);
	  static bool optimizePTest(IntrinsicInst *I);

	  static bool processPhiNode(IntrinsicInst *I);
	};
	} // end anonymous namespace

	/// Declares what this pass preserves and which analyses it requires.
	void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
	  // The CFG is declared as preserved.
	  AU.setPreservesCFG();
	  // optimizeFunctions() starts its walk from the dominator tree root.
	  AU.addRequired<DominatorTreeWrapperPass>();
	}

	// Definition of the pass's static ID member.
	char SVEIntrinsicOpts::ID = 0;
	// Descriptive pass name handed to the INITIALIZE_PASS_* macros below.
	static const char *name = "SVE intrinsics optimizations";
	// Pass registration; DominatorTreeWrapperPass is listed as a dependency,
	// matching the addRequired<> call in getAnalysisUsage().
	INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
	INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)

	namespace llvm {
	// Factory: returns a newly allocated SVEIntrinsicOpts pass instance.
	ModulePass *createSVEIntrinsicOptsPass() { return new SVEIntrinsicOpts(); }
	} // namespace llvm

	/// Returns \p V as an IntrinsicInst if it is a call to the
	/// aarch64_sve_convert_to_svbool intrinsic (a reinterpret to svbool_t,
	/// i.e. <n x 16 x i1>), nullptr otherwise.
	IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
	  auto *I = dyn_cast<IntrinsicInst>(V);
	  if (!I || I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
	    return nullptr;

	  return I;
	}

	/// Removes redundant reinterpret casts in the presence of control flow.
	///
	/// \p X is a reinterpret whose operand is a PHI node. When that PHI has X as
	/// its only use and every incoming value is a convert.to.svbool of a value
	/// that already has X's type, a new PHI over the unconverted values replaces
	/// X, and the old PHI and the reinterprets that became dead are erased.
	///
	/// \returns true if the IR was changed.
	bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {

	  // A set is used so that a reinterpret feeding the PHI from more than one
	  // predecessor is queued, and therefore erased, only once.
	  SmallSetVector<Instruction *, 32> Worklist;
	  auto RequiredType = X->getType();

	  auto *PN = dyn_cast<PHINode>(X->getArgOperand(0));
	  assert(PN && "Expected Phi Node!");

	  // Don't create a new Phi unless we can remove the old one.
	  if (!PN->hasOneUse())
	    return false;

	  for (Value *IncValPhi : PN->incoming_values()) {
	    auto *Reinterpret = isReinterpretToSVBool(IncValPhi);
	    if (!Reinterpret ||
	        RequiredType != Reinterpret->getArgOperand(0)->getType())
	      return false;
	  }

	  // Create the new Phi
	  LLVMContext &Ctx = PN->getContext();
	  IRBuilder<> Builder(Ctx);
	  Builder.SetInsertPoint(PN);
	  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
	  Worklist.insert(PN);

	  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
	    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
	    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
	    Worklist.insert(Reinterpret);
	  }

	  // Cleanup Phi Node and reinterprets
	  X->replaceAllUsesWith(NPN);
	  X->eraseFromParent();

	  // The old PHI was inserted first, so it is erased before the reinterprets
	  // it used; that is what lets them become use-free.
	  for (Instruction *I : Worklist)
	    if (I->use_empty())
	      I->eraseFromParent();

	  return true;
	}

	// Rewrites a ptest intrinsic call \p I whose two operands are both
	// convert.to.svbool calls with the same source type into a call of the same
	// intrinsic on the unconverted operands. Returns true if \p I was replaced.
	bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
	IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
	IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));

	// Both operands must be convert.to.svbool calls whose sources share a type.
	if (Op1 && Op2 &&
	Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
	Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
	Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {

	Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
	Type *Tys[] = {Op1->getArgOperand(0)->getType()};
	Module *M = I->getParent()->getParent()->getParent();

	// Same intrinsic ID as I, declared for the unconverted operand type; the new
	// call is created with I as its insertion point and takes over I's name.
	auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys);
	auto CI = CallInst::Create(Fn, Ops, I->getName(), I);

	I->replaceAllUsesWith(CI);
	I->eraseFromParent();
	// Erase the conversions if nothing uses them any more. Op1 and Op2 can be
	// the same instruction, in which case it must be erased only once.
	if (Op1->use_empty())
	Op1->eraseFromParent();
	- if (Op2->use_empty())
	+ if (Op1 != Op2 && Op2->use_empty())
	Op2->eraseFromParent();

	return true;
	}

	return false;
	}

	/// Tries to remove the convert.from.svbool call \p I.
	///
	/// A PHI operand is delegated to processPhiNode(). Otherwise, when the
	/// operand is a convert.to.svbool whose source already has I's type, I is
	/// replaced by that source and the inner conversion is erased if unused.
	///
	/// \returns true if the IR was changed.
	bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
	  assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_convert_from_svbool &&
	         "Unexpected opcode");

	  Value *Operand = I->getArgOperand(0);

	  // Reinterprets of a PHI node take the control-flow aware path.
	  if (isa<PHINode>(Operand))
	    return processPhiNode(I);

	  // I (of type A) converts from another reinterpret of type B; the pair can
	  // only be elided when the inner reinterpret's source is of type A as well.
	  auto *Inner = isReinterpretToSVBool(Operand);
	  if (!Inner)
	    return false;

	  Value *Original = Inner->getArgOperand(0);
	  if (Original->getType() != I->getType())
	    return false;

	  I->replaceAllUsesWith(Original);
	  I->eraseFromParent();

	  // The inner reinterpret is only removed once nothing else uses it.
	  if (Inner->use_empty())
	    Inner->eraseFromParent();

	  return true;
	}

	/// Dispatches \p I to the optimization matching its intrinsic ID.
	///
	/// \returns true if the IR was changed; false if \p I is not an intrinsic
	/// call, is not one of the handled intrinsics, or could not be optimized.
	bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
	  auto *IntrI = dyn_cast<IntrinsicInst>(I);
	  if (!IntrI)
	    return false;

	  switch (IntrI->getIntrinsicID()) {
	  case Intrinsic::aarch64_sve_convert_from_svbool:
	    return optimizeConvertFromSVBool(IntrI);
	  case Intrinsic::aarch64_sve_ptest_any:
	  case Intrinsic::aarch64_sve_ptest_first:
	  case Intrinsic::aarch64_sve_ptest_last:
	    return optimizePTest(IntrI);
	  default:
	    break;
	  }

	  // Any other intrinsic is left untouched.
	  return false;
	}

	/// Runs optimizeIntrinsic() over every instruction of each function in
	/// \p Functions.
	///
	/// \returns true if any instruction was changed.
	bool SVEIntrinsicOpts::optimizeFunctions(
	    SmallSetVector<Function *, 4> &Functions) {
	  bool Changed = false;
	  for (auto *F : Functions) {
	    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();

	    // Traverse the DT with an rpo walk so we see defs before uses, allowing
	    // simplification to be done incrementally.
	    BasicBlock *Root = DT->getRoot();
	    ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
	    for (auto *BB : RPOT)
	      // Early-increment iteration: optimizeIntrinsic() may erase I.
	      for (Instruction &I : make_early_inc_range(*BB))
	        Changed |= optimizeIntrinsic(&I);
	  }
	  return Changed;
	}

	/// Pass entry point: collects the functions that use one of the handled SVE
	/// intrinsics and optimizes only those.
	///
	/// \returns true if the module was changed.
	bool SVEIntrinsicOpts::runOnModule(Module &M) {
	  bool Changed = false;
	  SmallSetVector<Function *, 4> Functions;

	  // Check for SVE intrinsic declarations first so that we only iterate over
	  // relevant functions. Where an appropriate declaration is found, store the
	  // function(s) where it is used so we can target these only.
	  for (auto &F : M.getFunctionList()) {
	    if (!F.isDeclaration())
	      continue;

	    switch (F.getIntrinsicID()) {
	    case Intrinsic::aarch64_sve_convert_from_svbool:
	    case Intrinsic::aarch64_sve_ptest_any:
	    case Intrinsic::aarch64_sve_ptest_first:
	    case Intrinsic::aarch64_sve_ptest_last:
	      for (User *U : F.users()) {
	        // Only instruction users have an enclosing function; any other
	        // kind of user is skipped rather than dereferenced as null.
	        if (auto *Inst = dyn_cast<Instruction>(U))
	          Functions.insert(Inst->getFunction());
	      }
	      break;
	    default:
	      break;
	    }
	  }

	  if (!Functions.empty())
	    Changed |= optimizeFunctions(Functions);

	  return Changed;
	}
	diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
	index d39ec505127c..7b6ea002c7b7 100644
	--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
	+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
	@@ -1,823 +1,820 @@
	//===-- RISCVInstrInfo.cpp - RISCV Instruction Information ------*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the RISCV implementation of the TargetInstrInfo class.
	//
	//===----------------------------------------------------------------------===//

	#include "RISCVInstrInfo.h"
	#include "RISCV.h"
	#include "RISCVSubtarget.h"
	#include "RISCVTargetMachine.h"
	#include "Utils/RISCVMatInt.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/RegisterScavenging.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/TargetRegistry.h"

	using namespace llvm;

	#define GEN_CHECK_COMPRESS_INSTR
	#include "RISCVGenCompressInstEmitter.inc"

	#define GET_INSTRINFO_CTOR_DTOR
	#include "RISCVGenInstrInfo.inc"

	// Constructs the RISCV instruction info: passes the ADJCALLSTACKDOWN and
	// ADJCALLSTACKUP opcodes to the generated base class and initializes the
	// STI member from the given subtarget.
	RISCVInstrInfo::RISCVInstrInfo(RISCVSubtarget &STI)
	: RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP),
	STI(STI) {}

	unsigned RISCVInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
	int &FrameIndex) const {
	switch (MI.getOpcode()) {
	default:
	return 0;
	case RISCV::LB:
	case RISCV::LBU:
	case RISCV::LH:
	case RISCV::LHU:
	case RISCV::LW:
	case RISCV::FLW:
	case RISCV::LWU:
	case RISCV::LD:
	case RISCV::FLD:
	break;
	}

	if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
	MI.getOperand(2).getImm() == 0) {
	FrameIndex = MI.getOperand(1).getIndex();
	return MI.getOperand(0).getReg();
	}

	return 0;
	}

	unsigned RISCVInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
	int &FrameIndex) const {
	switch (MI.getOpcode()) {
	default:
	return 0;
	case RISCV::SB:
	case RISCV::SH:
	case RISCV::SW:
	case RISCV::FSW:
	case RISCV::SD:
	case RISCV::FSD:
	break;
	}

	if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
	MI.getOperand(2).getImm() == 0) {
	FrameIndex = MI.getOperand(1).getIndex();
	return MI.getOperand(0).getReg();
	}

	return 0;
	}

	// Emits a register-to-register copy before MBBI. GPR pairs are copied with
	// "ADDI DstReg, SrcReg, 0"; FPR32 and FPR64 pairs use FSGNJ_S / FSGNJ_D with
	// SrcReg as both sources. Any other register combination is unreachable.
	void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
	                                 MachineBasicBlock::iterator MBBI,
	                                 const DebugLoc &DL, MCRegister DstReg,
	                                 MCRegister SrcReg, bool KillSrc) const {
	  // GPR->GPR copies
	  if (RISCV::GPRRegClass.contains(DstReg, SrcReg)) {
	    auto Copy = BuildMI(MBB, MBBI, DL, get(RISCV::ADDI), DstReg);
	    Copy.addReg(SrcReg, getKillRegState(KillSrc));
	    Copy.addImm(0);
	    return;
	  }

	  // FPR->FPR copies
	  const bool IsSingle = RISCV::FPR32RegClass.contains(DstReg, SrcReg);
	  if (!IsSingle && !RISCV::FPR64RegClass.contains(DstReg, SrcReg))
	    llvm_unreachable("Impossible reg-to-reg copy");
	  const unsigned Opc = IsSingle ? RISCV::FSGNJ_S : RISCV::FSGNJ_D;

	  auto Copy = BuildMI(MBB, MBBI, DL, get(Opc), DstReg);
	  Copy.addReg(SrcReg, getKillRegState(KillSrc));
	  Copy.addReg(SrcReg, getKillRegState(KillSrc));
	}

	// Inserts, before I, a store of SrcReg to frame index FI at offset 0. The
	// opcode is chosen from the register class RC: SW or SD for GPRs (depending
	// on whether the GPR size reported by TRI is 32 bits), FSW for FPR32 and FSD
	// for FPR64. Other register classes are unreachable. The debug location is
	// taken from I unless I is the end of the block.
	void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
	                                         MachineBasicBlock::iterator I,
	                                         Register SrcReg, bool IsKill, int FI,
	                                         const TargetRegisterClass *RC,
	                                         const TargetRegisterInfo *TRI) const {
	  DebugLoc Loc;
	  if (I != MBB.end())
	    Loc = I->getDebugLoc();

	  unsigned StoreOpc;
	  if (RISCV::GPRRegClass.hasSubClassEq(RC)) {
	    if (TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32)
	      StoreOpc = RISCV::SW;
	    else
	      StoreOpc = RISCV::SD;
	  } else if (RISCV::FPR32RegClass.hasSubClassEq(RC)) {
	    StoreOpc = RISCV::FSW;
	  } else if (RISCV::FPR64RegClass.hasSubClassEq(RC)) {
	    StoreOpc = RISCV::FSD;
	  } else {
	    llvm_unreachable("Can't store this register to stack slot");
	  }

	  auto Store = BuildMI(MBB, I, Loc, get(StoreOpc));
	  Store.addReg(SrcReg, getKillRegState(IsKill));
	  Store.addFrameIndex(FI);
	  Store.addImm(0);
	}

	// Inserts, before I, a load of DstReg from frame index FI at offset 0. The
	// opcode is chosen from the register class RC: LW or LD for GPRs (depending
	// on whether the GPR size reported by TRI is 32 bits), FLW for FPR32 and FLD
	// for FPR64. Other register classes are unreachable. The debug location is
	// taken from I unless I is the end of the block.
	void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
	                                          MachineBasicBlock::iterator I,
	                                          Register DstReg, int FI,
	                                          const TargetRegisterClass *RC,
	                                          const TargetRegisterInfo *TRI) const {
	  DebugLoc Loc;
	  if (I != MBB.end())
	    Loc = I->getDebugLoc();

	  unsigned LoadOpc;
	  if (RISCV::GPRRegClass.hasSubClassEq(RC)) {
	    if (TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32)
	      LoadOpc = RISCV::LW;
	    else
	      LoadOpc = RISCV::LD;
	  } else if (RISCV::FPR32RegClass.hasSubClassEq(RC)) {
	    LoadOpc = RISCV::FLW;
	  } else if (RISCV::FPR64RegClass.hasSubClassEq(RC)) {
	    LoadOpc = RISCV::FLD;
	  } else {
	    llvm_unreachable("Can't load this register from stack slot");
	  }

	  auto Load = BuildMI(MBB, I, Loc, get(LoadOpc), DstReg);
	  Load.addFrameIndex(FI);
	  Load.addImm(0);
	}

	// Materialises the constant Val into DstReg by emitting, before MBBI, the
	// instruction sequence produced by RISCVMatInt::generateInstSeq. Intermediate
	// results go through a freshly created virtual GPR; only the last instruction
	// writes DstReg. Every emitted instruction is tagged with Flag. On a non-RV64
	// subtarget a value that is not a signed 32-bit integer is a fatal error.
	void RISCVInstrInfo::movImm(MachineBasicBlock &MBB,
	                            MachineBasicBlock::iterator MBBI,
	                            const DebugLoc &DL, Register DstReg, uint64_t Val,
	                            MachineInstr::MIFlag Flag) const {
	  MachineFunction *Func = MBB.getParent();
	  MachineRegisterInfo &RegInfo = Func->getRegInfo();
	  const bool IsRV64 = Func->getSubtarget<RISCVSubtarget>().is64Bit();
	  Register Src = RISCV::X0;
	  Register Dest = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);

	  if (!IsRV64 && !isInt<32>(Val))
	    report_fatal_error("Should only materialize 32-bit constants for RV32");

	  RISCVMatInt::InstSeq Seq;
	  RISCVMatInt::generateInstSeq(Val, IsRV64, Seq);
	  assert(Seq.size() > 0);

	  const unsigned Count = Seq.size();
	  for (unsigned Idx = 0; Idx != Count; ++Idx) {
	    RISCVMatInt::Inst &Step = Seq[Idx];

	    // The last instruction of the sequence targets DstReg; all earlier ones
	    // write the temporary register.
	    if (Idx + 1 == Count)
	      Dest = DstReg;

	    if (Step.Opc == RISCV::LUI) {
	      auto Lui = BuildMI(MBB, MBBI, DL, get(RISCV::LUI), Dest);
	      Lui.addImm(Step.Imm);
	      Lui.setMIFlag(Flag);
	    } else {
	      auto Op = BuildMI(MBB, MBBI, DL, get(Step.Opc), Dest);
	      Op.addReg(Src, RegState::Kill);
	      Op.addImm(Step.Imm);
	      Op.setMIFlag(Flag);
	    }

	    // X0 is the source of the first instruction only; afterwards each step
	    // consumes the previous result.
	    Src = Dest;
	  }
	}

	// Nothing outside RISCVInstrInfo inspects what is stored in Cond, so the
	// encoding is ours to choose. For RISCV the layout is: branch opcode (as an
	// immediate), then the branch's operand 0, then its operand 1. Target is set
	// to the basic block held in operand 2.
	static void parseCondBranch(MachineInstr &LastInst, MachineBasicBlock *&Target,
	                            SmallVectorImpl<MachineOperand> &Cond) {
	  // Block ends with fall-through condbranch.
	  assert(LastInst.getDesc().isConditionalBranch() &&
	         "Unknown conditional branch");

	  Target = LastInst.getOperand(2).getMBB();

	  const MachineOperand OpcodeOp =
	      MachineOperand::CreateImm(LastInst.getOpcode());
	  Cond.push_back(OpcodeOp);
	  for (unsigned Idx = 0; Idx != 2; ++Idx)
	    Cond.push_back(LastInst.getOperand(Idx));
	}

	// Maps a conditional branch opcode to the opcode testing the inverse
	// condition (BEQ<->BNE, BLT<->BGE, BLTU<->BGEU). Any other opcode is
	// unreachable.
	static unsigned getOppositeBranchOpcode(int Opc) {
	  switch (Opc) {
	  case RISCV::BEQ:  return RISCV::BNE;
	  case RISCV::BNE:  return RISCV::BEQ;
	  case RISCV::BLT:  return RISCV::BGE;
	  case RISCV::BGE:  return RISCV::BLT;
	  case RISCV::BLTU: return RISCV::BGEU;
	  case RISCV::BGEU: return RISCV::BLTU;
	  default:
	    break;
	  }
	  llvm_unreachable("Unrecognized conditional branch");
	}

	// Analyzes the terminators at the end of MBB. Returns false when the branch
	// structure was understood and true when it cannot be handled. On a false
	// return, TBB/FBB/Cond describe the branches: all stay null/empty for a pure
	// fall-through, TBB alone is set for a single unconditional branch, TBB and
	// Cond for a single conditional branch, and TBB, FBB and Cond for a
	// conditional branch followed by an unconditional one. When AllowModify is
	// set, terminators following the first unconditional or indirect branch are
	// erased from the block.
	bool RISCVInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
	MachineBasicBlock *&TBB,
	MachineBasicBlock *&FBB,
	SmallVectorImpl<MachineOperand> &Cond,
	bool AllowModify) const {
	TBB = FBB = nullptr;
	Cond.clear();

	// If the block has no terminators, it just falls into the block after it.
	MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
	if (I == MBB.end() \|\| !isUnpredicatedTerminator(*I))
	return false;

	// Count the number of terminators and find the first unconditional or
	// indirect branch.
	// The walk goes backwards from the last instruction, so the final value
	// stored is the earliest such branch in the block.
	MachineBasicBlock::iterator FirstUncondOrIndirectBr = MBB.end();
	int NumTerminators = 0;
	for (auto J = I.getReverse(); J != MBB.rend() && isUnpredicatedTerminator(*J);
	J++) {
	NumTerminators++;
	if (J->getDesc().isUnconditionalBranch() \|\|
	J->getDesc().isIndirectBranch()) {
	FirstUncondOrIndirectBr = J.getReverse();
	}
	}

	// If AllowModify is true, we can erase any terminators after
	// FirstUncondOrIndirectBR.
	if (AllowModify && FirstUncondOrIndirectBr != MBB.end()) {
	while (std::next(FirstUncondOrIndirectBr) != MBB.end()) {
	std::next(FirstUncondOrIndirectBr)->eraseFromParent();
	NumTerminators--;
	}
	I = FirstUncondOrIndirectBr;
	}

	// We can't handle blocks that end in an indirect branch.
	if (I->getDesc().isIndirectBranch())
	return true;

	// We can't handle blocks with more than 2 terminators.
	if (NumTerminators > 2)
	return true;

	// Handle a single unconditional branch.
	if (NumTerminators == 1 && I->getDesc().isUnconditionalBranch()) {
	- TBB = I->getOperand(0).getMBB();
	+ TBB = getBranchDestBlock(*I);
	return false;
	}

	// Handle a single conditional branch.
	if (NumTerminators == 1 && I->getDesc().isConditionalBranch()) {
	parseCondBranch(*I, TBB, Cond);
	return false;
	}

	// Handle a conditional branch followed by an unconditional branch.
	if (NumTerminators == 2 && std::prev(I)->getDesc().isConditionalBranch() &&
	I->getDesc().isUnconditionalBranch()) {
	parseCondBranch(*std::prev(I), TBB, Cond);
	- FBB = I->getOperand(0).getMBB();
	+ FBB = getBranchDestBlock(*I);
	return false;
	}

	// Otherwise, we can't handle this.
	return true;
	}

	// Removes up to two branch instructions from the end of MBB: the last
	// non-debug instruction if it is a conditional or unconditional branch, and
	// then the new last instruction if it is a conditional branch. Returns the
	// number of instructions removed (0, 1 or 2). When BytesRemoved is non-null
	// it is reset to 0 and then receives the summed size of the removed branches.
	unsigned RISCVInstrInfo::removeBranch(MachineBasicBlock &MBB,
	                                      int *BytesRemoved) const {
	  if (BytesRemoved)
	    *BytesRemoved = 0;
	  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
	  if (I == MBB.end())
	    return 0;

	  if (!I->getDesc().isUnconditionalBranch() &&
	      !I->getDesc().isConditionalBranch())
	    return 0;

	  // Remove the branch. The size must be added to the pointee (not to the
	  // pointer itself) and must be read before the instruction is erased.
	  if (BytesRemoved)
	    *BytesRemoved += getInstSizeInBytes(*I);
	  I->eraseFromParent();

	  I = MBB.end();

	  if (I == MBB.begin())
	    return 1;
	  --I;
	  if (!I->getDesc().isConditionalBranch())
	    return 1;

	  // Remove the preceding conditional branch as well.
	  if (BytesRemoved)
	    *BytesRemoved += getInstSizeInBytes(*I);
	  I->eraseFromParent();
	  return 2;
	}

	// Inserts a branch into the end of the specific MachineBasicBlock, returning
	// the number of instructions inserted.
	unsigned RISCVInstrInfo::insertBranch(
	MachineBasicBlock &MBB, MachineBasicBlock TBB, MachineBasicBlock FBB,
	ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
	if (BytesAdded)
	*BytesAdded = 0;

	// Shouldn't be a fall through.
	assert(TBB && "insertBranch must not be told to insert a fallthrough");
	assert((Cond.size() == 3 \|\| Cond.size() == 0) &&
	"RISCV branch conditions have two components!");

	// Unconditional branch.
	if (Cond.empty()) {
	MachineInstr &MI = *BuildMI(&MBB, DL, get(RISCV::PseudoBR)).addMBB(TBB);
	if (BytesAdded)
	*BytesAdded += getInstSizeInBytes(MI);
	return 1;
	}

	// Either a one or two-way conditional branch.
	unsigned Opc = Cond[0].getImm();
	MachineInstr &CondMI =
	*BuildMI(&MBB, DL, get(Opc)).add(Cond[1]).add(Cond[2]).addMBB(TBB);
	if (BytesAdded)
	*BytesAdded += getInstSizeInBytes(CondMI);

	// One-way conditional branch.
	if (!FBB)
	return 1;

	// Two-way conditional branch.
	MachineInstr &MI = *BuildMI(&MBB, DL, get(RISCV::PseudoBR)).addMBB(FBB);
	if (BytesAdded)
	*BytesAdded += getInstSizeInBytes(MI);
	return 2;
	}

	// Emits a long jump to DestBB into MBB, which must be empty and have exactly
	// one predecessor; RS must be non-null. A BrOffset outside the signed 32-bit
	// range is a fatal error. The jump is built with a virtual scratch register
	// that is then replaced by a physical GPR obtained from the register
	// scavenger. Returns the constant 8.
	unsigned RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
	MachineBasicBlock &DestBB,
	const DebugLoc &DL,
	int64_t BrOffset,
	RegScavenger *RS) const {
	assert(RS && "RegScavenger required for long branching");
	assert(MBB.empty() &&
	"new block should be inserted for expanding unconditional branch");
	assert(MBB.pred_size() == 1);

	MachineFunction *MF = MBB.getParent();
	MachineRegisterInfo &MRI = MF->getRegInfo();
	- const auto &TM = static_cast<const RISCVTargetMachine &>(MF->getTarget());
	-
	- if (TM.isPositionIndependent())
	- report_fatal_error("Unable to insert indirect branch");

	if (!isInt<32>(BrOffset))
	report_fatal_error(
	"Branch offsets outside of the signed 32-bit range not supported");

	// FIXME: A virtual register must be used initially, as the register
	// scavenger won't work with empty blocks (SIInstrInfo::insertIndirectBranch
	// uses the same workaround).
	Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
	auto II = MBB.end();

	- MachineInstr &LuiMI = *BuildMI(MBB, II, DL, get(RISCV::LUI), ScratchReg)
	- .addMBB(&DestBB, RISCVII::MO_HI);
	- BuildMI(MBB, II, DL, get(RISCV::PseudoBRIND))
	- .addReg(ScratchReg, RegState::Kill)
	- .addMBB(&DestBB, RISCVII::MO_LO);
	+ MachineInstr &MI = *BuildMI(MBB, II, DL, get(RISCV::PseudoJump))
	+ .addReg(ScratchReg, RegState::Define \| RegState::Dead)
	+ .addMBB(&DestBB, RISCVII::MO_CALL);

	// Scavenge a physical GPR at the new instruction, substitute it for the
	// virtual scratch register, and record it as used.
	RS->enterBasicBlockEnd(MBB);
	unsigned Scav = RS->scavengeRegisterBackwards(RISCV::GPRRegClass,
	- LuiMI.getIterator(), false, 0);
	+ MI.getIterator(), false, 0);
	MRI.replaceRegWith(ScratchReg, Scav);
	MRI.clearVirtRegs();
	RS->setRegUsed(Scav);
	return 8;
	}

	// Inverts the condition stored in Cond in place by replacing the opcode in
	// Cond[0] with its opposite. Cond must hold exactly three operands. Always
	// returns false.
	bool RISCVInstrInfo::reverseBranchCondition(
	    SmallVectorImpl<MachineOperand> &Cond) const {
	  assert((Cond.size() == 3) && "Invalid branch condition!");
	  MachineOperand &OpcodeOp = Cond[0];
	  const unsigned Inverted = getOppositeBranchOpcode(OpcodeOp.getImm());
	  OpcodeOp.setImm(Inverted);
	  return false;
	}

	// Returns the basic block a branch instruction jumps to, read from the last
	// explicit operand of MI. MI must be a branch.
	MachineBasicBlock *
	RISCVInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
	  assert(MI.getDesc().isBranch() && "Unexpected opcode!");
	  // The branch target is always the last operand.
	  const int LastIdx = MI.getNumExplicitOperands();
	  const MachineOperand &TargetOp = MI.getOperand(LastIdx - 1);
	  return TargetOp.getMBB();
	}

	// Returns true if BrOffset is representable for the branch opcode BranchOp:
	// a signed 13-bit value for the conditional branches, a signed 21-bit value
	// for JAL and PseudoBR, and for PseudoJump a signed 32-bit value after adding
	// 0x800 and sign-extending from XLen bits. Any other opcode is unreachable.
	bool RISCVInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
	int64_t BrOffset) const {
	+ unsigned XLen = STI.getXLen();
	// Ideally we could determine the supported branch offset from the
	// RISCVII::FormMask, but this can't be used for Pseudo instructions like
	// PseudoBR.
	switch (BranchOp) {
	default:
	llvm_unreachable("Unexpected opcode!");
	case RISCV::BEQ:
	case RISCV::BNE:
	case RISCV::BLT:
	case RISCV::BGE:
	case RISCV::BLTU:
	case RISCV::BGEU:
	return isIntN(13, BrOffset);
	case RISCV::JAL:
	case RISCV::PseudoBR:
	return isIntN(21, BrOffset);
	+ case RISCV::PseudoJump:
	+ return isIntN(32, SignExtend64(BrOffset + 0x800, XLen));
	}
	}

	unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
	unsigned Opcode = MI.getOpcode();

	switch (Opcode) {
	default: {
	if (MI.getParent() && MI.getParent()->getParent()) {
	const auto MF = MI.getMF();
	const auto &TM = static_cast<const RISCVTargetMachine &>(MF->getTarget());
	const MCRegisterInfo &MRI = *TM.getMCRegisterInfo();
	const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
	const RISCVSubtarget &ST = MF->getSubtarget<RISCVSubtarget>();
	if (isCompressibleInst(MI, &ST, MRI, STI))
	return 2;
	}
	return get(Opcode).getSize();
	}
	case TargetOpcode::EH_LABEL:
	case TargetOpcode::IMPLICIT_DEF:
	case TargetOpcode::KILL:
	case TargetOpcode::DBG_VALUE:
	return 0;
	// These values are determined based on RISCVExpandAtomicPseudoInsts,
	// RISCVExpandPseudoInsts and RISCVMCCodeEmitter, depending on where the
	// pseudos are expanded.
	case RISCV::PseudoCALLReg:
	case RISCV::PseudoCALL:
	case RISCV::PseudoJump:
	case RISCV::PseudoTAIL:
	case RISCV::PseudoLLA:
	case RISCV::PseudoLA:
	case RISCV::PseudoLA_TLS_IE:
	case RISCV::PseudoLA_TLS_GD:
	return 8;
	case RISCV::PseudoAtomicLoadNand32:
	case RISCV::PseudoAtomicLoadNand64:
	return 20;
	case RISCV::PseudoMaskedAtomicSwap32:
	case RISCV::PseudoMaskedAtomicLoadAdd32:
	case RISCV::PseudoMaskedAtomicLoadSub32:
	return 28;
	case RISCV::PseudoMaskedAtomicLoadNand32:
	return 32;
	case RISCV::PseudoMaskedAtomicLoadMax32:
	case RISCV::PseudoMaskedAtomicLoadMin32:
	return 44;
	case RISCV::PseudoMaskedAtomicLoadUMax32:
	case RISCV::PseudoMaskedAtomicLoadUMin32:
	return 36;
	case RISCV::PseudoCmpXchg32:
	case RISCV::PseudoCmpXchg64:
	return 16;
	case RISCV::PseudoMaskedCmpXchg32:
	return 32;
	case TargetOpcode::INLINEASM:
	case TargetOpcode::INLINEASM_BR: {
	const MachineFunction &MF = *MI.getParent()->getParent();
	const auto &TM = static_cast<const RISCVTargetMachine &>(MF.getTarget());
	return getInlineAsmLength(MI.getOperand(0).getSymbolName(),
	*TM.getMCAsmInfo());
	}
	}
	}

	bool RISCVInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
	const unsigned Opcode = MI.getOpcode();
	switch(Opcode) {
	default:
	break;
	case RISCV::ADDI:
	case RISCV::ORI:
	case RISCV::XORI:
	return (MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == RISCV::X0);
	}
	return MI.isAsCheapAsAMove();
	}

	// Checks the immediate operands of MI against the range implied by their
	// RISC-V operand type. Operands whose type lies outside
	// [OPERAND_FIRST_RISCV_IMM, OPERAND_LAST_RISCV_IMM], or that are not
	// immediates, are skipped. On the first out-of-range immediate ErrInfo is set
	// to "Invalid immediate" and false is returned; otherwise the result is true.
	bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
	                                       StringRef &ErrInfo) const {
	  const MCInstrInfo *MCII = STI.getInstrInfo();
	  MCInstrDesc const &Desc = MCII->get(MI.getOpcode());

	  for (auto &Entry : enumerate(Desc.operands())) {
	    const unsigned Kind = Entry.value().OperandType;
	    if (Kind < RISCVOp::OPERAND_FIRST_RISCV_IMM ||
	        Kind > RISCVOp::OPERAND_LAST_RISCV_IMM)
	      continue;

	    const MachineOperand &MO = MI.getOperand(Entry.index());
	    if (!MO.isImm())
	      continue;

	    const int64_t Imm = MO.getImm();
	    bool InRange;
	    switch (Kind) {
	    case RISCVOp::OPERAND_UIMM4:
	      InRange = isUInt<4>(Imm);
	      break;
	    case RISCVOp::OPERAND_UIMM5:
	      InRange = isUInt<5>(Imm);
	      break;
	    case RISCVOp::OPERAND_UIMM12:
	      InRange = isUInt<12>(Imm);
	      break;
	    case RISCVOp::OPERAND_SIMM12:
	      InRange = isInt<12>(Imm);
	      break;
	    case RISCVOp::OPERAND_SIMM13_LSB0:
	      InRange = isShiftedInt<12, 1>(Imm);
	      break;
	    case RISCVOp::OPERAND_UIMM20:
	      InRange = isUInt<20>(Imm);
	      break;
	    case RISCVOp::OPERAND_SIMM21_LSB0:
	      InRange = isShiftedInt<20, 1>(Imm);
	      break;
	    case RISCVOp::OPERAND_UIMMLOG2XLEN:
	      // 6 bits on a 64-bit target triple, 5 bits otherwise.
	      InRange = STI.getTargetTriple().isArch64Bit() ? isUInt<6>(Imm)
	                                                    : isUInt<5>(Imm);
	      break;
	    default:
	      llvm_unreachable("Unexpected operand type");
	    }

	    if (!InRange) {
	      ErrInfo = "Invalid immediate";
	      return false;
	    }
	  }

	  return true;
	}

	// Return true if get the base operand, byte offset of an instruction and the
	// memory width. Width is the size of memory that is being loaded/stored.
	bool RISCVInstrInfo::getMemOperandWithOffsetWidth(
	const MachineInstr &LdSt, const MachineOperand *&BaseReg, int64_t &Offset,
	unsigned &Width, const TargetRegisterInfo *TRI) const {
	if (!LdSt.mayLoadOrStore())
	return false;

	// Here we assume the standard RISC-V ISA, which uses a base+offset
	// addressing mode. You'll need to relax these conditions to support custom
	// load/stores instructions.
	if (LdSt.getNumExplicitOperands() != 3)
	return false;
	if (!LdSt.getOperand(1).isReg() \|\| !LdSt.getOperand(2).isImm())
	return false;

	if (!LdSt.hasOneMemOperand())
	return false;

	Width = (*LdSt.memoperands_begin())->getSize();
	BaseReg = &LdSt.getOperand(1);
	Offset = LdSt.getOperand(2).getImm();
	return true;
	}

	// Returns true when the two memory accesses can be shown not to overlap from
	// their operands alone: both decompose into base + offset + width via
	// getMemOperandWithOffsetWidth, the base operands are identical, and the
	// lower access ends at or before the higher one starts. Returns false
	// whenever either instruction has unmodeled side effects or an ordered
	// memory reference, or when disjointness cannot be established.
	bool RISCVInstrInfo::areMemAccessesTriviallyDisjoint(
	    const MachineInstr &MIa, const MachineInstr &MIb) const {
	  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
	  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

	  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
	      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
	    return false;

	  // Retrieve the base register, offset from the base register and width. Width
	  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4). If
	  // base registers are identical, and the offset of a lower memory access +
	  // the width doesn't overlap the offset of a higher memory access,
	  // then the memory accesses are different.
	  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
	  // These are out-parameters of pointer type for getMemOperandWithOffsetWidth.
	  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
	  int64_t OffsetA = 0, OffsetB = 0;
	  unsigned int WidthA = 0, WidthB = 0;
	  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
	      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
	    if (BaseOpA->isIdenticalTo(*BaseOpB)) {
	      // Stay in int64_t so the offsets are not narrowed before comparing.
	      const int64_t LowOffset = std::min(OffsetA, OffsetB);
	      const int64_t HighOffset = std::max(OffsetA, OffsetB);
	      const int64_t LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
	      if (LowOffset + LowWidth <= HighOffset)
	        return true;
	    }
	  }
	  return false;
	}

	// Splits the target-flag word TF into two parts: the bits selected by
	// RISCVII::MO_DIRECT_FLAG_MASK (first) and all remaining bits (second).
	std::pair<unsigned, unsigned>
	RISCVInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
	  const unsigned DirectMask = RISCVII::MO_DIRECT_FLAG_MASK;
	  const unsigned InsideMask = TF & DirectMask;
	  const unsigned OutsideMask = TF & ~DirectMask;
	  return std::make_pair(InsideMask, OutsideMask);
	}

	// Returns the table pairing each direct machine-operand target flag with the
	// name used when it is serialized. The table has static storage, so the
	// returned ArrayRef stays valid after the call.
	ArrayRef<std::pair<unsigned, const char *>>
	RISCVInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
	  static const std::pair<unsigned, const char *> TargetFlags[] = {
	      {RISCVII::MO_CALL, "riscv-call"},
	      {RISCVII::MO_PLT, "riscv-plt"},
	      {RISCVII::MO_LO, "riscv-lo"},
	      {RISCVII::MO_HI, "riscv-hi"},
	      {RISCVII::MO_PCREL_LO, "riscv-pcrel-lo"},
	      {RISCVII::MO_PCREL_HI, "riscv-pcrel-hi"},
	      {RISCVII::MO_GOT_HI, "riscv-got-hi"},
	      {RISCVII::MO_TPREL_LO, "riscv-tprel-lo"},
	      {RISCVII::MO_TPREL_HI, "riscv-tprel-hi"},
	      {RISCVII::MO_TPREL_ADD, "riscv-tprel-add"},
	      {RISCVII::MO_TLS_GOT_HI, "riscv-tls-got-hi"},
	      {RISCVII::MO_TLS_GD_HI, "riscv-tls-gd-hi"}};
	  return makeArrayRef(TargetFlags);
	}
	// Decides whether the outliner may take code out of MF. Outlining is refused
	// for linkonce_odr functions (unless OutlineFromLinkOnceODRs is set) and for
	// functions that carry an explicit section.
	bool RISCVInstrInfo::isFunctionSafeToOutlineFrom(
	    MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
	  const Function &Fn = MF.getFunction();

	  // Can the linker deduplicate this function? If so, leave it alone.
	  const bool MayBeDeduplicated =
	      !OutlineFromLinkOnceODRs && Fn.hasLinkOnceODRLinkage();

	  // A function with a section marking may be expected to keep all of its
	  // code inside that section, so it is not outlined from either.
	  return !MayBeDeduplicated && !Fn.hasSection();
	}

	// Block-level outlining filter. Always returns true and never reads or
	// writes Flags; MBB is not inspected.
	bool RISCVInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
	unsigned &Flags) const {
	// More accurate safety checking is done in getOutliningCandidateInfo.
	return true;
	}

	// Enum values indicating how an outlined call should be constructed.
	// Only one strategy is defined here; it is the ID passed to setCallInfo and
	// to the OutlinedFunction built in getOutliningCandidateInfo.
	enum MachineOutlinerConstructionID {
	MachineOutlinerDefault
	};

	// Builds the cost description for outlining a repeated instruction sequence.
	// Candidates at which X5 (t0) is not available are dropped from
	// RepeatedSequenceLocs first; if fewer than two remain, an empty
	// OutlinedFunction is returned. Otherwise every candidate is given a call
	// overhead of 8 bytes, and the frame overhead is 2 bytes with the C extension
	// and 4 bytes without it.
	outliner::OutlinedFunction RISCVInstrInfo::getOutliningCandidateInfo(
	    std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {

	  // X5 (t0) is used to set up the call, so it has to be free at the candidate.
	  auto X5Unavailable = [](outliner::Candidate &Cand) {
	    const TargetRegisterInfo *RegInfo =
	        Cand.getMF()->getSubtarget().getRegisterInfo();

	    Cand.initLRU(*RegInfo);
	    LiveRegUnits Units = Cand.LRU;
	    return !Units.available(RISCV::X5);
	  };

	  auto FirstDropped = std::remove_if(RepeatedSequenceLocs.begin(),
	                                     RepeatedSequenceLocs.end(), X5Unavailable);
	  RepeatedSequenceLocs.erase(FirstDropped, RepeatedSequenceLocs.end());

	  // Not enough candidates left to make outlining worthwhile.
	  if (RepeatedSequenceLocs.size() < 2)
	    return outliner::OutlinedFunction();

	  // All candidates share one sequence, so measure the first.
	  unsigned SequenceSize = 0;
	  for (auto It = RepeatedSequenceLocs[0].front(),
	            End = std::next(RepeatedSequenceLocs[0].back());
	       It != End; ++It)
	    SequenceSize += getInstSizeInBytes(*It);

	  // call t0, function = 8 bytes.
	  const unsigned CallOverhead = 8;
	  for (outliner::Candidate &Cand : RepeatedSequenceLocs)
	    Cand.setCallInfo(MachineOutlinerDefault, CallOverhead);

	  // jr t0 = 4 bytes, 2 bytes if compressed instructions are enabled.
	  const bool HasStdExtC = RepeatedSequenceLocs[0]
	                              .getMF()
	                              ->getSubtarget()
	                              .getFeatureBits()[RISCV::FeatureStdExtC];
	  const unsigned FrameOverhead = HasStdExtC ? 2 : 4;

	  return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
	                                    FrameOverhead, MachineOutlinerDefault);
	}

	// Classifies the instruction at MBBI for the outliner as Legal, Illegal or
	// Invisible. The checks run in a fixed order and the first one that matches
	// decides the result.
	outliner::InstrType
	RISCVInstrInfo::getOutliningType(MachineBasicBlock::iterator &MBBI,
	                                 unsigned Flags) const {
	  using outliner::InstrType;

	  MachineInstr &MI = *MBBI;
	  MachineBasicBlock *Block = MI.getParent();
	  const TargetRegisterInfo *TRI =
	      Block->getParent()->getSubtarget().getRegisterInfo();

	  // Positions generally can't safely be outlined. CFI instructions are the
	  // exception because they can be stripped out manually later.
	  if (MI.isPosition())
	    return MI.isCFIInstruction() ? InstrType::Invisible : InstrType::Illegal;

	  // Don't trust the user to write safe inline assembly.
	  if (MI.isInlineAsm())
	    return InstrType::Illegal;

	  // We can't outline branches to other basic blocks.
	  const bool BranchesElsewhere = MI.isTerminator() && !Block->succ_empty();
	  if (BranchesElsewhere)
	    return InstrType::Illegal;

	  // We need support for tail calls to outlined functions before return
	  // statements can be allowed.
	  if (MI.isReturn())
	    return InstrType::Illegal;

	  // X5 carries the return address of outlined functions, so nothing inside
	  // the outlined body may write it.
	  const bool WritesX5 = MI.modifiesRegister(RISCV::X5, TRI) ||
	                        MI.getDesc().hasImplicitDefOfPhysReg(RISCV::X5);
	  if (WritesX5)
	    return InstrType::Illegal;

	  // Make sure the operands don't reference something unsafe.
	  for (const MachineOperand &Op : MI.operands()) {
	    if (Op.isMBB() || Op.isBlockAddress() || Op.isCPI())
	      return InstrType::Illegal;
	  }

	  // Don't allow instructions which won't be materialized to impact outlining
	  // analysis.
	  return MI.isMetaInstruction() ? InstrType::Invisible : InstrType::Legal;
	}

	// Finalises the body of an outlined function held in MBB: every CFI
	// instruction is removed from the block, X5 is added as a live-in, and a
	// "JALR X0, X5, 0" is appended as the return.
	void RISCVInstrInfo::buildOutlinedFrame(
	    MachineBasicBlock &MBB, MachineFunction &MF,
	    const outliner::OutlinedFunction &OF) const {

	  // Strip out any CFI instructions in a single pass. The iterator is advanced
	  // before the current instruction is unlinked, so the scan never has to
	  // restart from the beginning of the block.
	  for (auto I = MBB.begin(), E = MBB.end(); I != E;) {
	    MachineInstr &Cur = *I;
	    ++I;
	    if (Cur.isCFIInstruction())
	      Cur.removeFromParent();
	  }

	  MBB.addLiveIn(RISCV::X5);

	  // Add in a return instruction to the end of the outlined frame.
	  MBB.insert(MBB.end(), BuildMI(MF, DebugLoc(), get(RISCV::JALR))
	                            .addReg(RISCV::X0, RegState::Define)
	                            .addReg(RISCV::X5)
	                            .addImm(0));
	}

	// Inserts, at It, a PseudoCALLReg that defines X5 and targets the global in M
	// whose name equals MF's name. It is updated to point at the inserted call,
	// which is also the return value.
	MachineBasicBlock::iterator RISCVInstrInfo::insertOutlinedCall(
	    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
	    MachineFunction &MF, const outliner::Candidate &C) const {

	  // Build the call instruction to the outlined function first ...
	  MachineInstr *Call =
	      BuildMI(MF, DebugLoc(), get(RISCV::PseudoCALLReg), RISCV::X5)
	          .addGlobalAddress(M.getNamedValue(MF.getName()), 0,
	                            RISCVII::MO_CALL);

	  // ... then place it at the requested location.
	  It = MBB.insert(It, Call);
	  return It;
	}
	diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td
	index b9483062ddeb..8547f791092b 100644
	--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td
	+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td
	@@ -1,1180 +1,1180 @@
	//===-- RISCVInstrInfo.td - Target Description for RISCV ---- tablegen --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file describes the RISC-V instructions in TableGen format.
	//
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// RISC-V specific DAG Nodes.
	//===----------------------------------------------------------------------===//

	// Target-independent type requirements, but with target-specific formats.
	def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
	SDTCisVT<1, i32>]>;
	def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
	SDTCisVT<1, i32>]>;

	// Target-dependent type requirements.
	def SDT_RISCVCall : SDTypeProfile<0, -1, [SDTCisVT<0, XLenVT>]>;
	def SDT_RISCVSelectCC : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>,
	SDTCisSameAs<0, 4>,
	SDTCisSameAs<4, 5>]>;

	// Target-independent nodes, but with target-specific formats.
	def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart,
	[SDNPHasChain, SDNPOutGlue]>;
	def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

	// Target-dependent nodes.
	def riscv_call : SDNode<"RISCVISD::CALL", SDT_RISCVCall,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
	SDNPVariadic]>;
	def riscv_ret_flag : SDNode<"RISCVISD::RET_FLAG", SDTNone,
	[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
	def riscv_uret_flag : SDNode<"RISCVISD::URET_FLAG", SDTNone,
	[SDNPHasChain, SDNPOptInGlue]>;
	def riscv_sret_flag : SDNode<"RISCVISD::SRET_FLAG", SDTNone,
	[SDNPHasChain, SDNPOptInGlue]>;
	def riscv_mret_flag : SDNode<"RISCVISD::MRET_FLAG", SDTNone,
	[SDNPHasChain, SDNPOptInGlue]>;
	def riscv_selectcc : SDNode<"RISCVISD::SELECT_CC", SDT_RISCVSelectCC,
	[SDNPInGlue]>;
	def riscv_tail : SDNode<"RISCVISD::TAIL", SDT_RISCVCall,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
	SDNPVariadic]>;
	def riscv_sllw : SDNode<"RISCVISD::SLLW", SDTIntShiftOp>;
	def riscv_sraw : SDNode<"RISCVISD::SRAW", SDTIntShiftOp>;
	def riscv_srlw : SDNode<"RISCVISD::SRLW", SDTIntShiftOp>;

	//===----------------------------------------------------------------------===//
	// Operand and SDNode transformation definitions.
	//===----------------------------------------------------------------------===//

	// Assembler operand class for XLEN-wide immediates. The class name is built
	// as prefix # "ImmXLen" # suffix, operands are rendered with addImmOperands,
	// and the diagnostic type is "Invalid" followed by that name.
	class ImmXLenAsmOperand<string prefix, string suffix = ""> : AsmOperandClass {
	let Name = prefix # "ImmXLen" # suffix;
	let RenderMethod = "addImmOperands";
	let DiagnosticType = !strconcat("Invalid", Name);
	}

	// Assembler operand class for fixed-width immediates. The class name is built
	// as prefix # "Imm" # width # suffix, operands are rendered with
	// addImmOperands, and the diagnostic type is "Invalid" followed by that name.
	class ImmAsmOperand<string prefix, int width, string suffix> : AsmOperandClass {
	let Name = prefix # "Imm" # width # suffix;
	let RenderMethod = "addImmOperands";
	let DiagnosticType = !strconcat("Invalid", Name);
	}

	// Assembler operand class named "ImmZero"; rendered with addImmOperands and
	// diagnosed as "Invalid" followed by its name.
	def ImmZeroAsmOperand : AsmOperandClass {
	let Name = "ImmZero";
	let RenderMethod = "addImmOperands";
	let DiagnosticType = !strconcat("Invalid", Name);
	}

	// ImmAsmOperand with the "S" prefix, giving names of the form
	// SImm<width><suffix>.
	class SImmAsmOperand<int width, string suffix = "">
	: ImmAsmOperand<"S", width, suffix> {
	}

	// ImmAsmOperand with the "U" prefix, giving names of the form
	// UImm<width><suffix>.
	class UImmAsmOperand<int width, string suffix = "">
	: ImmAsmOperand<"U", width, suffix> {
	}

	// Assembler operand class for fence arguments; rendered with
	// addFenceArgOperands and diagnosed as InvalidFenceArg.
	def FenceArg : AsmOperandClass {
	let Name = "FenceArg";
	let RenderMethod = "addFenceArgOperands";
	let DiagnosticType = "InvalidFenceArg";
	}

	// Fence argument operand: parsed through the FenceArg class, printed with
	// printFenceArg, decoded as a 4-bit unsigned immediate and typed as
	// RISCVOp::OPERAND_UIMM4.
	def fencearg : Operand<XLenVT> {
	let ParserMatchClass = FenceArg;
	let PrintMethod = "printFenceArg";
	let DecoderMethod = "decodeUImmOperand<4>";
	let OperandType = "OPERAND_UIMM4";
	let OperandNamespace = "RISCVOp";
	}

	// Assembler operand class used by uimmlog2xlen; rendered with addImmOperands
	// and diagnosed as InvalidUImmLog2XLen.
	def UImmLog2XLenAsmOperand : AsmOperandClass {
	let Name = "UImmLog2XLen";
	let RenderMethod = "addImmOperands";
	let DiagnosticType = "InvalidUImmLog2XLen";
	}

	// Unsigned immediate whose width depends on the target: it must fit in
	// 6 bits on a 64-bit subtarget and in 5 bits otherwise. The same rule is
	// applied by the ImmLeaf predicate (via Subtarget->is64Bit()) and by the MC
	// operand predicate (via the target triple); the MC predicate additionally
	// rejects operands that do not evaluate to a constant immediate.
	def uimmlog2xlen : Operand<XLenVT>, ImmLeaf<XLenVT, [{
	if (Subtarget->is64Bit())
	return isUInt<6>(Imm);
	return isUInt<5>(Imm);
	}]> {
	let ParserMatchClass = UImmLog2XLenAsmOperand;
	// TODO: should ensure invalid shamt is rejected when decoding.
	let DecoderMethod = "decodeUImmOperand<6>";
	let MCOperandPredicate = [{
	int64_t Imm;
	if (!MCOp.evaluateAsConstantImm(Imm))
	return false;
	if (STI.getTargetTriple().isArch64Bit())
	return isUInt<6>(Imm);
	return isUInt<5>(Imm);
	}];
	let OperandType = "OPERAND_UIMMLOG2XLEN";
	let OperandNamespace = "RISCVOp";
	}

	// A 5-bit unsigned immediate, typed as RISCVOp::OPERAND_UIMM5.
	def uimm5 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isUInt<5>(Imm);}]> {
	let ParserMatchClass = UImmAsmOperand<5>;
	let DecoderMethod = "decodeUImmOperand<5>";
	let OperandType = "OPERAND_UIMM5";
	let OperandNamespace = "RISCVOp";
	}

	// A 12-bit signed immediate, typed as RISCVOp::OPERAND_SIMM12. At the MC
	// level an operand is accepted if it evaluates to a constant that fits in
	// 12 signed bits, or if it is a bare symbol reference.
	def simm12 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isInt<12>(Imm);}]> {
	let ParserMatchClass = SImmAsmOperand<12>;
	let EncoderMethod = "getImmOpValue";
	let DecoderMethod = "decodeSImmOperand<12>";
	let MCOperandPredicate = [{
	int64_t Imm;
	if (MCOp.evaluateAsConstantImm(Imm))
	return isInt<12>(Imm);
	return MCOp.isBareSymbolRef();
	}];
	let OperandType = "OPERAND_SIMM12";
	let OperandNamespace = "RISCVOp";
	}

	// A 12-bit signed immediate plus one where the imm range will be -2047~2048.
	def simm12_plus1 : Operand<XLenVT>, ImmLeaf<XLenVT,
	[{return (isInt<12>(Imm) && Imm != -2048) \|\| Imm == 2048;}]> {
	let ParserMatchClass = SImmAsmOperand<12>;
	let EncoderMethod = "getImmOpValue";
	let DecoderMethod = "decodeSImmOperand<12>";
	let MCOperandPredicate = [{
	int64_t Imm;
	if (MCOp.evaluateAsConstantImm(Imm))
	return (isInt<12>(Imm) && Imm != -2048) \|\| Imm == 2048;
	return MCOp.isBareSymbolRef();
	}];
	}

	// A 13-bit signed immediate where the least significant bit is zero.
	def simm13_lsb0 : Operand<OtherVT> {
	let ParserMatchClass = SImmAsmOperand<13, "Lsb0">;
	let EncoderMethod = "getImmOpValueAsr1";
	let DecoderMethod = "decodeSImmOperandAndLsl1<13>";
	let MCOperandPredicate = [{
	int64_t Imm;
	if (MCOp.evaluateAsConstantImm(Imm))
	return isShiftedInt<12, 1>(Imm);
	return MCOp.isBareSymbolRef();
	}];
	let OperandType = "OPERAND_SIMM13_LSB0";
	let OperandNamespace = "RISCVOp";
	}

	// Common base for 20-bit unsigned immediate operands, typed as
	// RISCVOp::OPERAND_UIMM20. At the MC level an operand is accepted if it
	// evaluates to a constant that fits in 20 unsigned bits, or if it is a bare
	// symbol reference. Subclasses supply the ParserMatchClass.
	class UImm20Operand : Operand<XLenVT> {
	let EncoderMethod = "getImmOpValue";
	let DecoderMethod = "decodeUImmOperand<20>";
	let MCOperandPredicate = [{
	int64_t Imm;
	if (MCOp.evaluateAsConstantImm(Imm))
	return isUInt<20>(Imm);
	return MCOp.isBareSymbolRef();
	}];
	let OperandType = "OPERAND_UIMM20";
	let OperandNamespace = "RISCVOp";
	}

	// UImm20Operand variant matched through the "LUI"-suffixed parser class.
	def uimm20_lui : UImm20Operand {
	let ParserMatchClass = UImmAsmOperand<20, "LUI">;
	}
	// UImm20Operand variant matched through the "AUIPC"-suffixed parser class.
	def uimm20_auipc : UImm20Operand {
	let ParserMatchClass = UImmAsmOperand<20, "AUIPC">;
	}

	// Parser class for simm21_lsb0_jal; a signed 21-bit "Lsb0JAL" operand class
	// that is parsed with parseJALOffset.
	def Simm21Lsb0JALAsmOperand : SImmAsmOperand<21, "Lsb0JAL"> {
	let ParserMethod = "parseJALOffset";
	}

	// A 21-bit signed immediate where the least significant bit is zero.
	def simm21_lsb0_jal : Operand<OtherVT> {
	let ParserMatchClass = Simm21Lsb0JALAsmOperand;
	let EncoderMethod = "getImmOpValueAsr1";
	let DecoderMethod = "decodeSImmOperandAndLsl1<21>";
	let MCOperandPredicate = [{
	int64_t Imm;
	if (MCOp.evaluateAsConstantImm(Imm))
	return isShiftedInt<20, 1>(Imm);
	return MCOp.isBareSymbolRef();
	}];
	let OperandType = "OPERAND_SIMM21_LSB0";
	let OperandNamespace = "RISCVOp";
	}

	// Assembler operand class for bare symbols; parsed with parseBareSymbol,
	// rendered with addImmOperands and diagnosed as InvalidBareSymbol.
	def BareSymbol : AsmOperandClass {
	let Name = "BareSymbol";
	let RenderMethod = "addImmOperands";
	let DiagnosticType = "InvalidBareSymbol";
	let ParserMethod = "parseBareSymbol";
	}

	// A bare symbol.
	def bare_symbol : Operand<XLenVT> {
	let ParserMatchClass = BareSymbol;
	}

	// Assembler operand class for call symbols; parsed with parseCallSymbol,
	// rendered with addImmOperands and diagnosed as InvalidCallSymbol.
	def CallSymbol : AsmOperandClass {
	let Name = "CallSymbol";
	let RenderMethod = "addImmOperands";
	let DiagnosticType = "InvalidCallSymbol";
	let ParserMethod = "parseCallSymbol";
	}

	// A bare symbol used in call/tail only.
	def call_symbol : Operand<XLenVT> {
	let ParserMatchClass = CallSymbol;
	}

	// Assembler operand class for pseudo-jump symbols; parsed with
	// parsePseudoJumpSymbol, rendered with addImmOperands and diagnosed as
	// InvalidPseudoJumpSymbol.
	def PseudoJumpSymbol : AsmOperandClass {
	let Name = "PseudoJumpSymbol";
	let RenderMethod = "addImmOperands";
	let DiagnosticType = "InvalidPseudoJumpSymbol";
	let ParserMethod = "parsePseudoJumpSymbol";
	}

	// A bare symbol used for pseudo jumps only.
	def pseudo_jump_symbol : Operand<XLenVT> {
	let ParserMatchClass = PseudoJumpSymbol;
	}

	// Assembler operand class for tprel_add_symbol; parsed with
	// parseOperandWithModifier, rendered with addImmOperands and diagnosed as
	// InvalidTPRelAddSymbol.
	def TPRelAddSymbol : AsmOperandClass {
	let Name = "TPRelAddSymbol";
	let RenderMethod = "addImmOperands";
	let DiagnosticType = "InvalidTPRelAddSymbol";
	let ParserMethod = "parseOperandWithModifier";
	}

	// A bare symbol with the %tprel_add variant.
	def tprel_add_symbol : Operand<XLenVT> {
	let ParserMatchClass = TPRelAddSymbol;
	}

	// Assembler operand class for CSR operands; parsed with
	// parseCSRSystemRegister and diagnosed as InvalidCSRSystemRegister.
	def CSRSystemRegister : AsmOperandClass {
	let Name = "CSRSystemRegister";
	let ParserMethod = "parseCSRSystemRegister";
	let DiagnosticType = "InvalidCSRSystemRegister";
	}

	// CSR operand: parsed through CSRSystemRegister, printed with
	// printCSRSystemRegister, decoded as a 12-bit unsigned immediate and typed as
	// RISCVOp::OPERAND_UIMM12.
	def csr_sysreg : Operand<XLenVT> {
	let ParserMatchClass = CSRSystemRegister;
	let PrintMethod = "printCSRSystemRegister";
	let DecoderMethod = "decodeUImmOperand<12>";
	let OperandType = "OPERAND_UIMM12";
	let OperandNamespace = "RISCVOp";
	}

	// An XLenVT-typed immediate operand: the XLen-parameterized alternative to
	// i32imm/i64imm from Target.td.
	def ixlenimm : Operand<XLenVT>;

	// XLenVT immediate operand of PseudoLI ("li"); it is matched through the
	// ImmXLenAsmOperand<"", "LI"> assembler operand class.
	def ixlenimm_li : Operand<XLenVT> {
	let ParserMatchClass = ImmXLenAsmOperand<"", "LI">;
	}

	// Standalone (codegen-only) immleaf patterns.
	// simm32: immediates for which isInt<32> holds.
	def simm32 : ImmLeaf<XLenVT, [{return isInt<32>(Imm);}]>;
	// simm32hi20: immediates for which isShiftedInt<20, 12> holds, i.e. the kind
	// of value the LUI-only immediate pattern in this file materializes.
	def simm32hi20 : ImmLeaf<XLenVT, [{return isShiftedInt<20, 12>(Imm);}]>;
	// A mask value that won't affect significant shift bits: at least 6 trailing
	// one bits when the subtarget is 64-bit, otherwise at least 5.
	def immbottomxlenset : ImmLeaf<XLenVT, [{
	if (Subtarget->is64Bit())
	return countTrailingOnes<uint64_t>(Imm) >= 6;
	return countTrailingOnes<uint64_t>(Imm) >= 5;
	}]>;

	// Addressing modes.
	// Necessary because a frameindex can't be matched directly in a pattern.
	// Matching is rooted at frameindex nodes and delegated to the C++ selector
	// SelectAddrFI.
	def AddrFI : ComplexPattern<iPTR, 1, "SelectAddrFI", [frameindex], []>;

	// Extract least significant 12 bits from an immediate value and sign extend
	// them. The result is a target constant with the same value type as the
	// matched immediate node.
	def LO12Sext : SDNodeXForm<imm, [{
	return CurDAG->getTargetConstant(SignExtend64<12>(N->getZExtValue()),
	SDLoc(N), N->getValueType(0));
	}]>;

	// Extract the most significant 20 bits from an immediate value. Add 1 if bit
	// 11 is 1, to compensate for the low 12 bits in the matching immediate addi
	// or ld/st being negative. The rounding is done by adding 0x800 before the
	// shift, and the result is masked to 20 bits (0xfffff).
	def HI20 : SDNodeXForm<imm, [{
	return CurDAG->getTargetConstant(((N->getZExtValue()+0x800) >> 12) & 0xfffff,
	SDLoc(N), N->getValueType(0));
	}]>;

	// Return the negation of an immediate value. The negation is applied to the
	// sign-extended value of the node; the setcc expansions in this file use it
	// to turn a comparison against a constant into an ADDI of its negation.
	def NegImm : SDNodeXForm<imm, [{
	return CurDAG->getTargetConstant(-N->getSExtValue(), SDLoc(N),
	N->getValueType(0));
	}]>;

	//===----------------------------------------------------------------------===//
	// Instruction Formats
	//===----------------------------------------------------------------------===//

	include "RISCVInstrFormats.td"

	//===----------------------------------------------------------------------===//
	// Instruction Class Templates
	//===----------------------------------------------------------------------===//

	// Template for the RVInstB branch instructions under OPC_BRANCH. Takes two
	// GPR sources and a simm13_lsb0 offset, produces no outputs, and is marked as
	// both a branch and a terminator. funct3 distinguishes the individual
	// branches.
	let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
	class BranchCC_rri<bits<3> funct3, string opcodestr>
	: RVInstB<funct3, OPC_BRANCH, (outs),
	(ins GPR:$rs1, GPR:$rs2, simm13_lsb0:$imm12),
	opcodestr, "$rs1, $rs2, $imm12">,
	Sched<[WriteJmp, ReadJmp, ReadJmp]> {
	let isBranch = 1;
	let isTerminator = 1;
	}

	// Template for the RVInstI loads under OPC_LOAD, written "rd, imm12(rs1)".
	// No Sched list is attached here; each def supplies its own.
	let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
	class Load_ri<bits<3> funct3, string opcodestr>
	: RVInstI<funct3, OPC_LOAD, (outs GPR:$rd), (ins GPR:$rs1, simm12:$imm12),
	opcodestr, "$rd, ${imm12}(${rs1})">;

	// Template for the RVInstS stores under OPC_STORE, written "rs2, imm12(rs1)".
	// Operands for stores are in the order srcreg, base, offset rather than
	// reflecting the order these fields are specified in the instruction
	// encoding. No Sched list is attached here; each def supplies its own.
	let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
	class Store_rri<bits<3> funct3, string opcodestr>
	: RVInstS<funct3, OPC_STORE, (outs),
	(ins GPR:$rs2, GPR:$rs1, simm12:$imm12),
	opcodestr, "$rs2, ${imm12}(${rs1})">;

	// Template for register-immediate ALU instructions under OPC_OP_IMM: one GPR
	// source plus a simm12 immediate, one GPR result. Scheduled as
	// WriteIALU/ReadIALU.
	let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
	class ALU_ri<bits<3> funct3, string opcodestr>
	: RVInstI<funct3, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1, simm12:$imm12),
	opcodestr, "$rd, $rs1, $imm12">,
	Sched<[WriteIALU, ReadIALU]>;

	// Template for shift-by-immediate instructions under OPC_OP_IMM. The shift
	// amount is a uimmlog2xlen operand; the arithshift bit is forwarded to
	// RVInstIShift.
	let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
	class Shift_ri<bit arithshift, bits<3> funct3, string opcodestr>
	: RVInstIShift<arithshift, funct3, OPC_OP_IMM, (outs GPR:$rd),
	(ins GPR:$rs1, uimmlog2xlen:$shamt), opcodestr,
	"$rd, $rs1, $shamt">,
	Sched<[WriteShift, ReadShift]>;

	// Template for register-register ALU instructions under OPC_OP: two GPR
	// sources, one GPR result. No Sched list is attached here; each def supplies
	// its own.
	let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
	class ALU_rr<bits<7> funct7, bits<3> funct3, string opcodestr>
	: RVInstR<funct7, funct3, OPC_OP, (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2),
	opcodestr, "$rd, $rs1, $rs2">;

	// Template for CSR instructions with a register source under OPC_SYSTEM. The
	// CSR number is carried in the imm12 field as a csr_sysreg operand. Marked as
	// having side effects.
	let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
	class CSR_ir<bits<3> funct3, string opcodestr>
	: RVInstI<funct3, OPC_SYSTEM, (outs GPR:$rd), (ins csr_sysreg:$imm12, GPR:$rs1),
	opcodestr, "$rd, $imm12, $rs1">, Sched<[WriteCSR, ReadCSR]>;

	// Template for CSR instructions with an immediate source under OPC_SYSTEM.
	// Same shape as CSR_ir, except that the $rs1 operand is a uimm5 immediate
	// rather than a GPR, and only WriteCSR is listed for scheduling.
	let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
	class CSR_ii<bits<3> funct3, string opcodestr>
	: RVInstI<funct3, OPC_SYSTEM, (outs GPR:$rd),
	(ins csr_sysreg:$imm12, uimm5:$rs1),
	opcodestr, "$rd, $imm12, $rs1">, Sched<[WriteCSR]>;

	// Template for the "W" shift-by-immediate instructions under OPC_OP_IMM_32.
	// Unlike Shift_ri, the shift amount is always a uimm5 operand.
	let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
	class ShiftW_ri<bit arithshift, bits<3> funct3, string opcodestr>
	: RVInstIShiftW<arithshift, funct3, OPC_OP_IMM_32, (outs GPR:$rd),
	(ins GPR:$rs1, uimm5:$shamt), opcodestr,
	"$rd, $rs1, $shamt">,
	Sched<[WriteShift32, ReadShift32]>;

	// Template for the "W" register-register ALU instructions under OPC_OP_32.
	// No Sched list is attached here; each def supplies its own.
	let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
	class ALUW_rr<bits<7> funct7, bits<3> funct3, string opcodestr>
	: RVInstR<funct7, funct3, OPC_OP_32, (outs GPR:$rd),
	(ins GPR:$rs1, GPR:$rs2), opcodestr, "$rd, $rs1, $rs2">;

	// Template for privileged instructions under OPC_SYSTEM with funct3 = 0b000.
	// The assembly operand string is empty, so $rs1/$rs2 are never printed or
	// parsed; the defs built on this class fix rd, rs1 and rs2 with `let`.
	let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
	class Priv<string opcodestr, bits<7> funct7>
	: RVInstR<funct7, 0b000, OPC_SYSTEM, (outs), (ins GPR:$rs1, GPR:$rs2),
	opcodestr, "">;

	//===----------------------------------------------------------------------===//
	// Instructions
	//===----------------------------------------------------------------------===//

	// Upper-immediate and jump-and-link instructions. All four share
	// hasSideEffects = mayLoad = mayStore = 0 from the enclosing `let`.
	let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
	// The brace-less `let` below applies to LUI only; AUIPC is not marked
	// rematerializable or as-cheap-as-a-move.
	let isReMaterializable = 1, isAsCheapAsAMove = 1 in
	def LUI : RVInstU<OPC_LUI, (outs GPR:$rd), (ins uimm20_lui:$imm20),
	"lui", "$rd, $imm20">, Sched<[WriteIALU]>;

	def AUIPC : RVInstU<OPC_AUIPC, (outs GPR:$rd), (ins uimm20_auipc:$imm20),
	"auipc", "$rd, $imm20">, Sched<[WriteIALU]>;

	// JAL and JALR are both flagged isCall; $rd is an explicit output.
	let isCall = 1 in
	def JAL : RVInstJ<OPC_JAL, (outs GPR:$rd), (ins simm21_lsb0_jal:$imm20),
	"jal", "$rd, $imm20">, Sched<[WriteJal]>;

	let isCall = 1 in
	def JALR : RVInstI<0b000, OPC_JALR, (outs GPR:$rd),
	(ins GPR:$rs1, simm12:$imm12),
	"jalr", "$rd, ${imm12}(${rs1})">,
	Sched<[WriteJalr, ReadJalr]>;
	} // hasSideEffects = 0, mayLoad = 0, mayStore = 0

	// Conditional branches, all instances of BranchCC_rri; they differ only in
	// funct3 and mnemonic.
	def BEQ : BranchCC_rri<0b000, "beq">;
	def BNE : BranchCC_rri<0b001, "bne">;
	def BLT : BranchCC_rri<0b100, "blt">;
	def BGE : BranchCC_rri<0b101, "bge">;
	def BLTU : BranchCC_rri<0b110, "bltu">;
	def BGEU : BranchCC_rri<0b111, "bgeu">;

	// Base loads. Scheduling is attached per def; the signed and unsigned
	// variants of a given width share the same write resource (e.g. LB and LBU
	// both use WriteLDB).
	def LB : Load_ri<0b000, "lb">, Sched<[WriteLDB, ReadMemBase]>;
	def LH : Load_ri<0b001, "lh">, Sched<[WriteLDH, ReadMemBase]>;
	def LW : Load_ri<0b010, "lw">, Sched<[WriteLDW, ReadMemBase]>;
	def LBU : Load_ri<0b100, "lbu">, Sched<[WriteLDB, ReadMemBase]>;
	def LHU : Load_ri<0b101, "lhu">, Sched<[WriteLDH, ReadMemBase]>;

	// Base stores. Each lists a read for the stored data and one for the base.
	def SB : Store_rri<0b000, "sb">, Sched<[WriteSTB, ReadStoreData, ReadMemBase]>;
	def SH : Store_rri<0b001, "sh">, Sched<[WriteSTH, ReadStoreData, ReadMemBase]>;
	def SW : Store_rri<0b010, "sw">, Sched<[WriteSTW, ReadStoreData, ReadMemBase]>;

	// Register-immediate ALU instructions (ALU_ri instances).
	// ADDI isn't always rematerializable, but isReMaterializable will be used as
	// a hint which is verified in isReallyTriviallyReMaterializable.
	let isReMaterializable = 1, isAsCheapAsAMove = 1 in
	def ADDI : ALU_ri<0b000, "addi">;

	// SLTI and SLTIU carry neither isReMaterializable nor isAsCheapAsAMove.
	def SLTI : ALU_ri<0b010, "slti">;
	def SLTIU : ALU_ri<0b011, "sltiu">;

	let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
	def XORI : ALU_ri<0b100, "xori">;
	def ORI : ALU_ri<0b110, "ori">;
	}

	// ANDI is defined outside the `let` above, so it does not get those flags.
	def ANDI : ALU_ri<0b111, "andi">;

	// Shift-by-immediate instructions. SRLI and SRAI share funct3 0b101 and are
	// told apart by the arithshift bit (first template argument).
	def SLLI : Shift_ri<0, 0b001, "slli">;
	def SRLI : Shift_ri<0, 0b101, "srli">;
	def SRAI : Shift_ri<1, 0b101, "srai">;

	// Register-register ALU instructions. All use the same scheduling list,
	// including the shifts SLL/SRL/SRA. ADD/SUB and SRL/SRA share a funct3 and
	// are told apart by funct7 (0b0000000 vs 0b0100000).
	def ADD : ALU_rr<0b0000000, 0b000, "add">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
	def SUB : ALU_rr<0b0100000, 0b000, "sub">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
	def SLL : ALU_rr<0b0000000, 0b001, "sll">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
	def SLT : ALU_rr<0b0000000, 0b010, "slt">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
	def SLTU : ALU_rr<0b0000000, 0b011, "sltu">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
	def XOR : ALU_rr<0b0000000, 0b100, "xor">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
	def SRL : ALU_rr<0b0000000, 0b101, "srl">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
	def SRA : ALU_rr<0b0100000, 0b101, "sra">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
	def OR : ALU_rr<0b0000000, 0b110, "or">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
	def AND : ALU_rr<0b0000000, 0b111, "and">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;

	// Fence and environment instructions. All are RVInstI records with rs1 and
	// rd fixed to 0 and a fully or partly fixed imm12; all are marked as having
	// side effects.
	let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
	// FENCE packs its two 4-bit operands into imm12 as {0b0000, pred, succ}.
	def FENCE : RVInstI<0b000, OPC_MISC_MEM, (outs),
	(ins fencearg:$pred, fencearg:$succ),
	"fence", "$pred, $succ">, Sched<[]> {
	bits<4> pred;
	bits<4> succ;

	let rs1 = 0;
	let rd = 0;
	let imm12 = {0b0000,pred,succ};
	}

	// FENCE_TSO takes no operands; it reuses the FENCE layout with the top
	// nibble set to 0b1000 and both low nibbles set to 0b0011.
	def FENCE_TSO : RVInstI<0b000, OPC_MISC_MEM, (outs), (ins), "fence.tso", "">, Sched<[]> {
	let rs1 = 0;
	let rd = 0;
	let imm12 = {0b1000,0b0011,0b0011};
	}

	// FENCE_I differs from FENCE by funct3 (0b001) and has an all-zero imm12.
	def FENCE_I : RVInstI<0b001, OPC_MISC_MEM, (outs), (ins), "fence.i", "">, Sched<[]> {
	let rs1 = 0;
	let rd = 0;
	let imm12 = 0;
	}

	// ECALL and EBREAK share funct3/opcode and differ only in imm12 (0 vs 1).
	def ECALL : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "ecall", "">, Sched<[WriteJmp]> {
	let rs1 = 0;
	let rd = 0;
	let imm12 = 0;
	}

	def EBREAK : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "ebreak", "">,
	Sched<[]> {
	let rs1 = 0;
	let rd = 0;
	let imm12 = 1;
	}

	// This is a de facto standard (as set by GNU binutils) 32-bit unimplemented
	// instruction (i.e., it should always trap, if your implementation has invalid
	// instruction traps).
	def UNIMP : RVInstI<0b001, OPC_SYSTEM, (outs), (ins), "unimp", "">,
	Sched<[]> {
	let rs1 = 0;
	let rd = 0;
	let imm12 = 0b110000000000;
	}
	} // hasSideEffects = 1, mayLoad = 0, mayStore = 0

	// CSR instructions with a register source (CSR_ir).
	def CSRRW : CSR_ir<0b001, "csrrw">;
	def CSRRS : CSR_ir<0b010, "csrrs">;
	def CSRRC : CSR_ir<0b011, "csrrc">;

	// CSR instructions with a 5-bit immediate source (CSR_ii). Each funct3 is
	// the register form's funct3 with the top bit set.
	def CSRRWI : CSR_ii<0b101, "csrrwi">;
	def CSRRSI : CSR_ii<0b110, "csrrsi">;
	def CSRRCI : CSR_ii<0b111, "csrrci">;

	/// RV64I instructions
	///
	/// Everything in this block is guarded by the IsRV64 predicate.

	let Predicates = [IsRV64] in {
	def LWU : Load_ri<0b110, "lwu">, Sched<[WriteLDWU, ReadMemBase]>;
	def LD : Load_ri<0b011, "ld">, Sched<[WriteLDD, ReadMemBase]>;
	def SD : Store_rri<0b011, "sd">, Sched<[WriteSTD, ReadStoreData, ReadMemBase]>;

	// ADDIW is spelled out rather than built from ALU_ri because it uses
	// OPC_OP_IMM_32 and the 32-bit scheduling classes.
	let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
	def ADDIW : RVInstI<0b000, OPC_OP_IMM_32, (outs GPR:$rd),
	(ins GPR:$rs1, simm12:$imm12),
	"addiw", "$rd, $rs1, $imm12">,
	Sched<[WriteIALU32, ReadIALU32]>;

	def SLLIW : ShiftW_ri<0, 0b001, "slliw">;
	def SRLIW : ShiftW_ri<0, 0b101, "srliw">;
	def SRAIW : ShiftW_ri<1, 0b101, "sraiw">;

	// Register-register "W" instructions; encodings mirror ADD/SUB/SLL/SRL/SRA.
	def ADDW : ALUW_rr<0b0000000, 0b000, "addw">,
	Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
	def SUBW : ALUW_rr<0b0100000, 0b000, "subw">,
	Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
	def SLLW : ALUW_rr<0b0000000, 0b001, "sllw">,
	Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
	def SRLW : ALUW_rr<0b0000000, 0b101, "srlw">,
	Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
	def SRAW : ALUW_rr<0b0100000, 0b101, "sraw">,
	Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
	} // Predicates = [IsRV64]

	//===----------------------------------------------------------------------===//
	// Privileged instructions
	//===----------------------------------------------------------------------===//

	// Trap-return instructions. All three fix rd = rs1 = 0 and rs2 = 0b00010;
	// they differ only in funct7. Each is a barrier, a return and a terminator.
	let isBarrier = 1, isReturn = 1, isTerminator = 1 in {
	def URET : Priv<"uret", 0b0000000>, Sched<[]> {
	let rd = 0;
	let rs1 = 0;
	let rs2 = 0b00010;
	}

	def SRET : Priv<"sret", 0b0001000>, Sched<[]> {
	let rd = 0;
	let rs1 = 0;
	let rs2 = 0b00010;
	}

	def MRET : Priv<"mret", 0b0011000>, Sched<[]> {
	let rd = 0;
	let rs1 = 0;
	let rs2 = 0b00010;
	}
	} // isBarrier = 1, isReturn = 1, isTerminator = 1

	// WFI shares funct7 0b0001000 with SRET and is distinguished by
	// rs2 = 0b00101. It is not marked as a barrier, return or terminator.
	def WFI : Priv<"wfi", 0b0001000>, Sched<[]> {
	let rd = 0;
	let rs1 = 0;
	let rs2 = 0b00101;
	}

	// SFENCE.VMA is not built from Priv because its two GPR operands are printed
	// and parsed ("$rs1, $rs2"); only rd is fixed to 0.
	let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
	def SFENCE_VMA : RVInstR<0b0001001, 0b000, OPC_SYSTEM, (outs),
	(ins GPR:$rs1, GPR:$rs2),
	"sfence.vma", "$rs1, $rs2">, Sched<[]> {
	let rd = 0;
	}

	//===----------------------------------------------------------------------===//
	// Debug instructions
	//===----------------------------------------------------------------------===//

	// DRET has the same flags and shape as URET/SRET/MRET but uses funct7
	// 0b0111101 and rs2 = 0b10010.
	let isBarrier = 1, isReturn = 1, isTerminator = 1 in {
	def DRET : Priv<"dret", 0b0111101>, Sched<[]> {
	let rd = 0;
	let rs1 = 0;
	let rs2 = 0b10010;
	}
	} // isBarrier = 1, isReturn = 1, isTerminator = 1

	//===----------------------------------------------------------------------===//
	// Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20)
	//===----------------------------------------------------------------------===//

	// "nop" is ADDI with X0 as destination and source and a zero immediate.
	def : InstAlias<"nop", (ADDI X0, X0, 0)>;

	// "li rd, imm" pseudo. It has no selection pattern and is marked
	// isAsmParserOnly.
	// Note that the size is 32 because up to 8 32-bit instructions are needed to
	// generate an arbitrary 64-bit immediate. However, the size does not really
	// matter since PseudoLI is currently only used in the AsmParser where it gets
	// expanded to real instructions immediately.
	let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 32,
	isCodeGenOnly = 0, isAsmParserOnly = 1 in
	def PseudoLI : Pseudo<(outs GPR:$rd), (ins ixlenimm_li:$imm), [],
	"li", "$rd, $imm">;

	// Pseudo load/store records, one per base load and store mnemonic. The
	// PseudoLoad/PseudoStore templates are defined outside this section.
	def PseudoLB : PseudoLoad<"lb">;
	def PseudoLBU : PseudoLoad<"lbu">;
	def PseudoLH : PseudoLoad<"lh">;
	def PseudoLHU : PseudoLoad<"lhu">;
	def PseudoLW : PseudoLoad<"lw">;

	def PseudoSB : PseudoStore<"sb">;
	def PseudoSH : PseudoStore<"sh">;
	def PseudoSW : PseudoStore<"sw">;

	// The 64-bit-only mnemonics are guarded by IsRV64, like LWU/LD/SD.
	let Predicates = [IsRV64] in {
	def PseudoLWU : PseudoLoad<"lwu">;
	def PseudoLD : PseudoLoad<"ld">;
	def PseudoSD : PseudoStore<"sd">;
	} // Predicates = [IsRV64]

	// Move/negate aliases: each maps to a base instruction with one operand
	// fixed (immediate 0 or -1, or X0 as the first source).
	def : InstAlias<"mv $rd, $rs", (ADDI GPR:$rd, GPR:$rs, 0)>;
	def : InstAlias<"not $rd, $rs", (XORI GPR:$rd, GPR:$rs, -1)>;
	def : InstAlias<"neg $rd, $rs", (SUB GPR:$rd, X0, GPR:$rs)>;

	let Predicates = [IsRV64] in {
	def : InstAlias<"negw $rd, $rs", (SUBW GPR:$rd, X0, GPR:$rs)>;
	def : InstAlias<"sext.w $rd, $rs", (ADDIW GPR:$rd, GPR:$rs, 0)>;
	} // Predicates = [IsRV64]

	// Compare-against-zero aliases built on SLTIU/SLTU/SLT.
	def : InstAlias<"seqz $rd, $rs", (SLTIU GPR:$rd, GPR:$rs, 1)>;
	def : InstAlias<"snez $rd, $rs", (SLTU GPR:$rd, X0, GPR:$rs)>;
	def : InstAlias<"sltz $rd, $rs", (SLT GPR:$rd, GPR:$rs, X0)>;
	def : InstAlias<"sgtz $rd, $rs", (SLT GPR:$rd, X0, GPR:$rs)>;

	// sgt/sgtu are recognised by the GNU assembler but the canonical slt/sltu
	// form will always be printed. Therefore, set a zero weight.
	// Note the swapped source operands ($rt, $rs) in the result.
	def : InstAlias<"sgt $rd, $rs, $rt", (SLT GPR:$rd, GPR:$rt, GPR:$rs), 0>;
	def : InstAlias<"sgtu $rd, $rs, $rt", (SLTU GPR:$rd, GPR:$rt, GPR:$rs), 0>;

	// Compare-with-zero branch aliases. X0 takes the place of one source; for
	// blez and bgtz it is the first source, for the others the second.
	def : InstAlias<"beqz $rs, $offset",
	(BEQ GPR:$rs, X0, simm13_lsb0:$offset)>;
	def : InstAlias<"bnez $rs, $offset",
	(BNE GPR:$rs, X0, simm13_lsb0:$offset)>;
	def : InstAlias<"blez $rs, $offset",
	(BGE X0, GPR:$rs, simm13_lsb0:$offset)>;
	def : InstAlias<"bgez $rs, $offset",
	(BGE GPR:$rs, X0, simm13_lsb0:$offset)>;
	def : InstAlias<"bltz $rs, $offset",
	(BLT GPR:$rs, X0, simm13_lsb0:$offset)>;
	def : InstAlias<"bgtz $rs, $offset",
	(BLT X0, GPR:$rs, simm13_lsb0:$offset)>;

	// Always output the canonical mnemonic for the pseudo branch instructions.
	// The GNU tools emit the canonical mnemonic for the branch pseudo instructions
	// as well (e.g. "bgt" will be recognised by the assembler but never printed by
	// objdump). Match this behaviour by setting a zero weight.
	// Each alias swaps the two sources of the underlying branch.
	def : InstAlias<"bgt $rs, $rt, $offset",
	(BLT GPR:$rt, GPR:$rs, simm13_lsb0:$offset), 0>;
	def : InstAlias<"ble $rs, $rt, $offset",
	(BGE GPR:$rt, GPR:$rs, simm13_lsb0:$offset), 0>;
	def : InstAlias<"bgtu $rs, $rt, $offset",
	(BLTU GPR:$rt, GPR:$rs, simm13_lsb0:$offset), 0>;
	def : InstAlias<"bleu $rs, $rt, $offset",
	(BGEU GPR:$rt, GPR:$rs, simm13_lsb0:$offset), 0>;

	// "j" links to X0 (no link register written); one-operand "jal" links to X1.
	def : InstAlias<"j $offset", (JAL X0, simm21_lsb0_jal:$offset)>;
	def : InstAlias<"jal $offset", (JAL X1, simm21_lsb0_jal:$offset)>;

	// Non-zero offset aliases of "jalr" are the lowest weight, followed by the
	// two-register form, then the one-register forms and finally "ret".
	def : InstAlias<"jr $rs", (JALR X0, GPR:$rs, 0), 3>;
	def : InstAlias<"jr ${offset}(${rs})", (JALR X0, GPR:$rs, simm12:$offset)>;
	def : InstAlias<"jalr $rs", (JALR X1, GPR:$rs, 0), 3>;
	def : InstAlias<"jalr ${offset}(${rs})", (JALR X1, GPR:$rs, simm12:$offset)>;
	def : InstAlias<"jalr $rd, $rs", (JALR GPR:$rd, GPR:$rs, 0), 2>;
	def : InstAlias<"ret", (JALR X0, X1, 0), 4>;

	// Non-canonical forms for jump targets also accepted by the assembler.
	// These use weight 0, like the other parse-only aliases in this file.
	def : InstAlias<"jr $rs, $offset", (JALR X0, GPR:$rs, simm12:$offset), 0>;
	def : InstAlias<"jalr $rs, $offset", (JALR X1, GPR:$rs, simm12:$offset), 0>;
	def : InstAlias<"jalr $rd, $rs, $offset", (JALR GPR:$rd, GPR:$rs, simm12:$offset), 0>;

	// Operand-less "fence" sets all four bits in both pred and succ.
	def : InstAlias<"fence", (FENCE 0xF, 0xF)>; // 0xF == iorw

	// Counter-read aliases: CSRRS with X0 as the source register and the
	// counter's CSR encoding as the CSR operand.
	def : InstAlias<"rdinstret $rd", (CSRRS GPR:$rd, INSTRET.Encoding, X0)>;
	def : InstAlias<"rdcycle $rd", (CSRRS GPR:$rd, CYCLE.Encoding, X0)>;
	def : InstAlias<"rdtime $rd", (CSRRS GPR:$rd, TIME.Encoding, X0)>;

	// The "h" variants are only available under the IsRV32 predicate.
	let Predicates = [IsRV32] in {
	def : InstAlias<"rdinstreth $rd", (CSRRS GPR:$rd, INSTRETH.Encoding, X0)>;
	def : InstAlias<"rdcycleh $rd", (CSRRS GPR:$rd, CYCLEH.Encoding, X0)>;
	def : InstAlias<"rdtimeh $rd", (CSRRS GPR:$rd, TIMEH.Encoding, X0)>;
	} // Predicates = [IsRV32]

	// CSR access aliases with a register source. csrr discards the write side
	// (source X0); csrw/csrs/csrc discard the read side (destination X0).
	def : InstAlias<"csrr $rd, $csr", (CSRRS GPR:$rd, csr_sysreg:$csr, X0)>;
	def : InstAlias<"csrw $csr, $rs", (CSRRW X0, csr_sysreg:$csr, GPR:$rs)>;
	def : InstAlias<"csrs $csr, $rs", (CSRRS X0, csr_sysreg:$csr, GPR:$rs)>;
	def : InstAlias<"csrc $csr, $rs", (CSRRC X0, csr_sysreg:$csr, GPR:$rs)>;

	// Immediate-source forms with destination X0.
	def : InstAlias<"csrwi $csr, $imm", (CSRRWI X0, csr_sysreg:$csr, uimm5:$imm)>;
	def : InstAlias<"csrsi $csr, $imm", (CSRRSI X0, csr_sysreg:$csr, uimm5:$imm)>;
	def : InstAlias<"csrci $csr, $imm", (CSRRCI X0, csr_sysreg:$csr, uimm5:$imm)>;

	// Parse-only spellings (EmitPriority = 0): the register-form mnemonics are
	// accepted with an immediate operand and mapped to the *I instructions.
	let EmitPriority = 0 in {
	def : InstAlias<"csrw $csr, $imm", (CSRRWI X0, csr_sysreg:$csr, uimm5:$imm)>;
	def : InstAlias<"csrs $csr, $imm", (CSRRSI X0, csr_sysreg:$csr, uimm5:$imm)>;
	def : InstAlias<"csrc $csr, $imm", (CSRRCI X0, csr_sysreg:$csr, uimm5:$imm)>;

	def : InstAlias<"csrrw $rd, $csr, $imm", (CSRRWI GPR:$rd, csr_sysreg:$csr, uimm5:$imm)>;
	def : InstAlias<"csrrs $rd, $csr, $imm", (CSRRSI GPR:$rd, csr_sysreg:$csr, uimm5:$imm)>;
	def : InstAlias<"csrrc $rd, $csr, $imm", (CSRRCI GPR:$rd, csr_sysreg:$csr, uimm5:$imm)>;
	}

	// Short forms of sfence.vma: omitted operands default to X0.
	def : InstAlias<"sfence.vma", (SFENCE_VMA X0, X0)>;
	def : InstAlias<"sfence.vma $rs", (SFENCE_VMA GPR:$rs, X0)>;

	// Parse-only aliases (EmitPriority = 0, so never chosen when printing):
	//  * loads/stores written as "(rs1)" with the zero offset omitted;
	//  * register-form ALU/shift/compare mnemonics given an immediate last
	//    operand, which map to the corresponding immediate instruction.
	let EmitPriority = 0 in {
	def : InstAlias<"lb $rd, (${rs1})",
	(LB GPR:$rd, GPR:$rs1, 0)>;
	def : InstAlias<"lh $rd, (${rs1})",
	(LH GPR:$rd, GPR:$rs1, 0)>;
	def : InstAlias<"lw $rd, (${rs1})",
	(LW GPR:$rd, GPR:$rs1, 0)>;
	def : InstAlias<"lbu $rd, (${rs1})",
	(LBU GPR:$rd, GPR:$rs1, 0)>;
	def : InstAlias<"lhu $rd, (${rs1})",
	(LHU GPR:$rd, GPR:$rs1, 0)>;

	def : InstAlias<"sb $rs2, (${rs1})",
	(SB GPR:$rs2, GPR:$rs1, 0)>;
	def : InstAlias<"sh $rs2, (${rs1})",
	(SH GPR:$rs2, GPR:$rs1, 0)>;
	def : InstAlias<"sw $rs2, (${rs1})",
	(SW GPR:$rs2, GPR:$rs1, 0)>;

	def : InstAlias<"add $rd, $rs1, $imm12",
	(ADDI GPR:$rd, GPR:$rs1, simm12:$imm12)>;
	def : InstAlias<"and $rd, $rs1, $imm12",
	(ANDI GPR:$rd, GPR:$rs1, simm12:$imm12)>;
	def : InstAlias<"xor $rd, $rs1, $imm12",
	(XORI GPR:$rd, GPR:$rs1, simm12:$imm12)>;
	def : InstAlias<"or $rd, $rs1, $imm12",
	(ORI GPR:$rd, GPR:$rs1, simm12:$imm12)>;
	def : InstAlias<"sll $rd, $rs1, $shamt",
	(SLLI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt)>;
	def : InstAlias<"srl $rd, $rs1, $shamt",
	(SRLI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt)>;
	def : InstAlias<"sra $rd, $rs1, $shamt",
	(SRAI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt)>;
	// RV64-only counterparts of the two groups above.
	let Predicates = [IsRV64] in {
	def : InstAlias<"lwu $rd, (${rs1})",
	(LWU GPR:$rd, GPR:$rs1, 0)>;
	def : InstAlias<"ld $rd, (${rs1})",
	(LD GPR:$rd, GPR:$rs1, 0)>;
	def : InstAlias<"sd $rs2, (${rs1})",
	(SD GPR:$rs2, GPR:$rs1, 0)>;

	def : InstAlias<"addw $rd, $rs1, $imm12",
	(ADDIW GPR:$rd, GPR:$rs1, simm12:$imm12)>;
	def : InstAlias<"sllw $rd, $rs1, $shamt",
	(SLLIW GPR:$rd, GPR:$rs1, uimm5:$shamt)>;
	def : InstAlias<"srlw $rd, $rs1, $shamt",
	(SRLIW GPR:$rd, GPR:$rs1, uimm5:$shamt)>;
	def : InstAlias<"sraw $rd, $rs1, $shamt",
	(SRAIW GPR:$rd, GPR:$rs1, uimm5:$shamt)>;
	} // Predicates = [IsRV64]
	def : InstAlias<"slt $rd, $rs1, $imm12",
	(SLTI GPR:$rd, GPR:$rs1, simm12:$imm12)>;
	def : InstAlias<"sltu $rd, $rs1, $imm12",
	(SLTIU GPR:$rd, GPR:$rs1, simm12:$imm12)>;
	}

	// "move" is accepted as another spelling of the "mv" alias.
	def : MnemonicAlias<"move", "mv">;

	// The SCALL and SBREAK instructions were renamed to ECALL and EBREAK in
	// version 2.1 of the user-level ISA. Like the GNU toolchain, we still accept
	// the old names for backwards compatibility.
	def : MnemonicAlias<"scall", "ecall">;
	def : MnemonicAlias<"sbreak", "ebreak">;

	//===----------------------------------------------------------------------===//
	// Pseudo-instructions and codegen patterns
	//
	// Naming convention: For 'generic' pattern classes, we use the naming
	// convention PatTy1Ty2. For pattern classes which offer a more complex
	// expansion, prefix the class name, e.g. BccPat.
	//===----------------------------------------------------------------------===//

	/// Generic pattern classes

	// (OpNode rs1, rs2) -> (Inst rs1, rs2), both operands GPRs.
	class PatGprGpr<SDPatternOperator OpNode, RVInst Inst>
	: Pat<(OpNode GPR:$rs1, GPR:$rs2), (Inst GPR:$rs1, GPR:$rs2)>;
	// (OpNode rs1, imm) -> (Inst rs1, imm), where imm satisfies simm12.
	class PatGprSimm12<SDPatternOperator OpNode, RVInstI Inst>
	: Pat<(OpNode GPR:$rs1, simm12:$imm12), (Inst GPR:$rs1, simm12:$imm12)>;
	// (OpNode rs1, shamt) -> (Inst rs1, shamt), where shamt satisfies
	// uimmlog2xlen.
	class PatGprUimmLog2XLen<SDPatternOperator OpNode, RVInstIShift Inst>
	: Pat<(OpNode GPR:$rs1, uimmlog2xlen:$shamt),
	(Inst GPR:$rs1, uimmlog2xlen:$shamt)>;

	/// Predicates

	// An `or` node for which the C++ helper isOrEquivalentToAdd returns true.
	def IsOrAdd: PatFrag<(ops node:$A, node:$B), (or node:$A, node:$B), [{
	return isOrEquivalentToAdd(N);
	}]>;
	// An assertsext node whose asserted type (operand 1) is i32.
	def assertsexti32 : PatFrag<(ops node:$src), (assertsext node:$src), [{
	return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32;
	}]>;
	// Matches either an explicit sext_inreg from i32 or an assertsexti32.
	def sexti32 : PatFrags<(ops node:$src),
	[(sext_inreg node:$src, i32),
	(assertsexti32 node:$src)]>;
	// An assertzext node whose asserted type (operand 1) is i32.
	def assertzexti32 : PatFrag<(ops node:$src), (assertzext node:$src), [{
	return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32;
	}]>;
	// Matches either an explicit mask with 0xffffffff or an assertzexti32.
	def zexti32 : PatFrags<(ops node:$src),
	[(and node:$src, 0xffffffff),
	(assertzexti32 node:$src)]>;

	/// Immediates

	// simm12 constants: a single ADDI from X0.
	def : Pat<(simm12:$imm), (ADDI X0, simm12:$imm)>;
	// Constants satisfying simm32hi20: a single LUI of the HI20 part.
	def : Pat<(simm32hi20:$imm), (LUI (HI20 imm:$imm))>;
	// Remaining simm32 constants: LUI of HI20 followed by ADDI of the
	// sign-extended low 12 bits. This pattern is restricted to RV32.
	def : Pat<(simm32:$imm), (ADDI (LUI (HI20 imm:$imm)), (LO12Sext imm:$imm))>,
	Requires<[IsRV32]>;

	/// Simple arithmetic operations
	///
	/// Each generic node maps to its register-register instruction and, where
	/// an immediate form exists, to that form as well. `sub` has only the
	/// register-register pattern here.

	def : PatGprGpr<add, ADD>;
	def : PatGprSimm12<add, ADDI>;
	def : PatGprGpr<sub, SUB>;
	def : PatGprGpr<or, OR>;
	def : PatGprSimm12<or, ORI>;
	def : PatGprGpr<and, AND>;
	def : PatGprSimm12<and, ANDI>;
	def : PatGprGpr<xor, XOR>;
	def : PatGprSimm12<xor, XORI>;
	// Shifts by an immediate; the register-amount forms are matched through
	// the shiftop patterns defined later in this file.
	def : PatGprUimmLog2XLen<shl, SLLI>;
	def : PatGprUimmLog2XLen<srl, SRLI>;
	def : PatGprUimmLog2XLen<sra, SRAI>;

	// Match both a plain shift and one where the shift amount is masked (this is
	// typically introduced when the legalizer promotes the shift amount and
	// zero-extends it). For RISC-V, the mask is unnecessary as shifts in the base
	// ISA only read the least significant 5 bits (RV32I) or 6 bits (RV64I).
	// The masked alternative only matches masks accepted by immbottomxlenset.
	class shiftop<SDPatternOperator operator>
	: PatFrags<(ops node:$val, node:$count),
	[(operator node:$val, node:$count),
	(operator node:$val, (and node:$count, immbottomxlenset))]>;

	// Register-amount shifts, with or without a redundant mask on the amount.
	def : PatGprGpr<shiftop<shl>, SLL>;
	def : PatGprGpr<shiftop<srl>, SRL>;
	def : PatGprGpr<shiftop<sra>, SRA>;

	// This is a special case of the ADD instruction used to facilitate the use of a
	// fourth operand to emit a relocation on a symbol relating to this instruction.
	// The relocation does not affect any bits of the instruction itself but is used
	// as a hint to the linker.
	// It is written "add rd, rs1, rs2, src", has no selection pattern, and sets
	// isCodeGenOnly = 0.
	let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0 in
	def PseudoAddTPRel : Pseudo<(outs GPR:$rd),
	(ins GPR:$rs1, GPR:$rs2, tprel_add_symbol:$src), [],
	"add", "$rd, $rs1, $rs2, $src">;

	/// FrameIndex calculations
	///
	/// A frame index plus a simm12 offset folds into one ADDI, both for a real
	/// `add` and for an `or` accepted by IsOrAdd.
	/// NOTE(review): both patterns type the frame index as i32 rather than
	/// XLenVT — confirm how this is expected to behave on RV64.

	def : Pat<(add (i32 AddrFI:$Rs), simm12:$imm12),
	(ADDI (i32 AddrFI:$Rs), simm12:$imm12)>;
	def : Pat<(IsOrAdd (i32 AddrFI:$Rs), simm12:$imm12),
	(ADDI (i32 AddrFI:$Rs), simm12:$imm12)>;

	/// Setcc

	// Conditions with a direct instruction: signed and unsigned less-than, in
	// register and simm12-immediate forms.
	def : PatGprGpr<setlt, SLT>;
	def : PatGprSimm12<setlt, SLTI>;
	def : PatGprGpr<setult, SLTU>;
	def : PatGprSimm12<setult, SLTIU>;

	// Define pattern expansions for setcc operations that aren't directly
	// handled by a RISC-V instruction.
	//  * eq/ne: reduce to a compare against zero (SLTIU x, 1 / SLTU X0, x),
	//    after an XOR of the two registers or an ADDI of the negated constant.
	//  * gt/ugt: the less-than instruction with operands swapped.
	//  * ge/uge/le/ule: a less-than result inverted with XORI 1.
	def : Pat<(seteq GPR:$rs1, 0), (SLTIU GPR:$rs1, 1)>;
	def : Pat<(seteq GPR:$rs1, GPR:$rs2), (SLTIU (XOR GPR:$rs1, GPR:$rs2), 1)>;
	def : Pat<(seteq GPR:$rs1, simm12_plus1:$imm12),
	(SLTIU (ADDI GPR:$rs1, (NegImm simm12_plus1:$imm12)), 1)>;
	def : Pat<(setne GPR:$rs1, 0), (SLTU X0, GPR:$rs1)>;
	def : Pat<(setne GPR:$rs1, GPR:$rs2), (SLTU X0, (XOR GPR:$rs1, GPR:$rs2))>;
	def : Pat<(setne GPR:$rs1, simm12_plus1:$imm12),
	(SLTU X0, (ADDI GPR:$rs1, (NegImm simm12_plus1:$imm12)))>;
	def : Pat<(setugt GPR:$rs1, GPR:$rs2), (SLTU GPR:$rs2, GPR:$rs1)>;
	def : Pat<(setuge GPR:$rs1, GPR:$rs2), (XORI (SLTU GPR:$rs1, GPR:$rs2), 1)>;
	def : Pat<(setule GPR:$rs1, GPR:$rs2), (XORI (SLTU GPR:$rs2, GPR:$rs1), 1)>;
	def : Pat<(setgt GPR:$rs1, GPR:$rs2), (SLT GPR:$rs2, GPR:$rs1)>;
	def : Pat<(setge GPR:$rs1, GPR:$rs2), (XORI (SLT GPR:$rs1, GPR:$rs2), 1)>;
	def : Pat<(setle GPR:$rs1, GPR:$rs2), (XORI (SLT GPR:$rs2, GPR:$rs1), 1)>;

	// Pseudo matching riscv_selectcc: two compare operands of class cmpty, an
	// XLenVT immediate $imm, and the true/false values of class valty. It is
	// marked usesCustomInserter, so its expansion is done in C++ rather than
	// described here.
	let usesCustomInserter = 1 in
	class SelectCC_rrirr<RegisterClass valty, RegisterClass cmpty>
	: Pseudo<(outs valty:$dst),
	(ins cmpty:$lhs, cmpty:$rhs, ixlenimm:$imm,
	valty:$truev, valty:$falsev),
	[(set valty:$dst, (riscv_selectcc cmpty:$lhs, cmpty:$rhs,
	(XLenVT imm:$imm), valty:$truev, valty:$falsev))]>;

	// The instantiation with GPR values selected on a GPR comparison.
	def Select_GPR_Using_CC_GPR : SelectCC_rrirr<GPR, GPR>;

	/// Branches and jumps

	// Match `(brcond (CondOp ..), ..)` and lower to the appropriate RISC-V branch
	// instruction.
	class BccPat<PatFrag CondOp, RVInstB Inst>
	: Pat<(brcond (XLenVT (CondOp GPR:$rs1, GPR:$rs2)), bb:$imm12),
	(Inst GPR:$rs1, GPR:$rs2, simm13_lsb0:$imm12)>;

	def : BccPat<seteq, BEQ>;
	def : BccPat<setne, BNE>;
	def : BccPat<setlt, BLT>;
	def : BccPat<setge, BGE>;
	def : BccPat<setult, BLTU>;
	def : BccPat<setuge, BGEU>;

	// Same as BccPat, but the branch instruction receives the two compare
	// operands in swapped order ($rs2, $rs1).
	class BccSwapPat<PatFrag CondOp, RVInst InstBcc>
	: Pat<(brcond (XLenVT (CondOp GPR:$rs1, GPR:$rs2)), bb:$imm12),
	(InstBcc GPR:$rs2, GPR:$rs1, bb:$imm12)>;

	// Condition codes that don't have matching RISC-V branch instructions, but
	// are trivially supported by swapping the two input operands
	def : BccSwapPat<setgt, BLT>;
	def : BccSwapPat<setle, BGE>;
	def : BccSwapPat<setugt, BLTU>;
	def : BccSwapPat<setule, BGEU>;

	// An extra pattern is needed for a brcond without a setcc (i.e. where the
	// condition was calculated elsewhere). The branch is taken when the
	// condition register is not equal to X0.
	def : Pat<(brcond GPR:$cond, bb:$imm12), (BNE GPR:$cond, X0, bb:$imm12)>;

	// Unconditional direct branch: matches `br` and expands to JAL with X0 as
	// the link register.
	let isBarrier = 1, isBranch = 1, isTerminator = 1 in
	def PseudoBR : Pseudo<(outs), (ins simm21_lsb0_jal:$imm20), [(br bb:$imm20)]>,
	PseudoInstExpansion<(JAL X0, simm21_lsb0_jal:$imm20)>;

	// Indirect branch: expands to JALR with X0 as the link register. It has no
	// pattern of its own; the two `brind` patterns below select it.
	// NOTE(review): isCall = 1 and Defs = [X1] are set here although the
	// expansion writes X0, not X1 — confirm whether these flags are intended.
	let isCall = 1, Defs=[X1] in
	let isBarrier = 1, isBranch = 1, isIndirectBranch = 1, isTerminator = 1 in
	def PseudoBRIND : Pseudo<(outs), (ins GPR:$rs1, simm12:$imm12), []>,
	PseudoInstExpansion<(JALR X0, GPR:$rs1, simm12:$imm12)>;

	// brind through a plain register, or through register + simm12 with the
	// offset folded into the JALR immediate.
	def : Pat<(brind GPR:$rs1), (PseudoBRIND GPR:$rs1, 0)>;
	def : Pat<(brind (add GPR:$rs1, simm12:$imm12)),
	(PseudoBRIND GPR:$rs1, simm12:$imm12)>;

	// PseudoCALLReg is a generic pseudo instruction for calls which will eventually
	// expand to auipc and jalr while encoding, with any given register used as the
	// destination.
	// Define AsmString to print "call" when compiling with the -S flag.
	// Define isCodeGenOnly = 0 to support parsing the assembly "call" instruction.
	// Unlike PseudoCALL it has an explicit $rd output and no implicit Defs.
	let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, hasSideEffects = 0,
	mayStore = 0, mayLoad = 0 in
	def PseudoCALLReg : Pseudo<(outs GPR:$rd), (ins call_symbol:$func), []> {
	let AsmString = "call\t$rd, $func";
	}

	// PseudoCALL is a pseudo instruction which will eventually expand to auipc
	// and jalr while encoding. This is desirable, as an auipc+jalr pair with
	// R_RISCV_CALL and R_RISCV_RELAX relocations can be relaxed by the linker
	// if the offset fits in a signed 21-bit immediate.
	// Define AsmString to print "call" when compiling with the -S flag.
	// Define isCodeGenOnly = 0 to support parsing the assembly "call" instruction.
	let isCall = 1, Defs = [X1], isCodeGenOnly = 0 in
	def PseudoCALL : Pseudo<(outs), (ins call_symbol:$func), []> {
	let AsmString = "call\t$func";
	}

	// Direct calls to a global address or an external symbol select PseudoCALL.
	def : Pat<(riscv_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>;
	def : Pat<(riscv_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;

	// The trap-return nodes select URET/SRET/MRET. X0 is passed for the $rs1 and
	// $rs2 operands that those records inherit from the Priv class.
	def : Pat<(riscv_uret_flag), (URET X0, X0)>;
	def : Pat<(riscv_sret_flag), (SRET X0, X0)>;
	def : Pat<(riscv_mret_flag), (MRET X0, X0)>;

	// Indirect call through a register: expands to JALR linking in X1 with a
	// zero offset.
	let isCall = 1, Defs = [X1] in
	def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rs1),
	[(riscv_call GPR:$rs1)]>,
	PseudoInstExpansion<(JALR X1, GPR:$rs1, 0)>;

	// Function return: expands to JALR X0, X1, 0, the same instruction the "ret"
	// alias stands for.
	let isBarrier = 1, isReturn = 1, isTerminator = 1 in
	def PseudoRET : Pseudo<(outs), (ins), [(riscv_ret_flag)]>,
	PseudoInstExpansion<(JALR X0, X1, 0)>;

	// PseudoTAIL is a pseudo instruction similar to PseudoCALL and will eventually
	// expand to auipc and jalr while encoding.
	// Define AsmString to print "tail" when compiling with the -S flag.
	// Unlike PseudoCALL it is also a terminator, return and barrier, declares
	// Uses = [X2], and has no implicit Defs.
	let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2],
	isCodeGenOnly = 0 in
	def PseudoTAIL : Pseudo<(outs), (ins call_symbol:$dst), []> {
	let AsmString = "tail\t$dst";
	}

	// Indirect tail call: the target register is restricted to the GPRTC class
	// and the expansion is JALR with X0 as the link register.
	// NOTE(review): the ins list uses GPRTC while the expansion names GPR:$rs1 —
	// presumably only the operand name matters there; confirm.
	let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2] in
	def PseudoTAILIndirect : Pseudo<(outs), (ins GPRTC:$rs1),
	[(riscv_tail GPRTC:$rs1)]>,
	PseudoInstExpansion<(JALR X0, GPR:$rs1, 0)>;

	// Direct tail calls select PseudoTAIL.
	// NOTE(review): the tglobaladdr source pattern emits texternalsym:$dst,
	// unlike the PseudoCALL patterns in this file, which keep tglobaladdr on
	// both sides — confirm this is harmless.
	def : Pat<(riscv_tail (iPTR tglobaladdr:$dst)),
	(PseudoTAIL texternalsym:$dst)>;
	def : Pat<(riscv_tail (iPTR texternalsym:$dst)),
	(PseudoTAIL texternalsym:$dst)>;

	-let isCall = 0, isBarrier = 0, isCodeGenOnly = 0, hasSideEffects = 0,
	- mayStore = 0, mayLoad = 0 in
	+let isCall = 0, isBarrier = 1, isBranch = 1, isTerminator = 1,
	+ isCodeGenOnly = 0, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in
	def PseudoJump : Pseudo<(outs GPR:$rd), (ins pseudo_jump_symbol:$target), []> {
	let AsmString = "jump\t$target, $rd";
	}

	// Address-materialization pseudos. All four are isAsmParserOnly, take a
	// bare_symbol and write a GPR, and have no selection pattern.
	// "lla" is the only one declared with mayLoad = 0.
	let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0,
	isAsmParserOnly = 1 in
	def PseudoLLA : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
	"lla", "$dst, $src">;

	// "la" is declared mayLoad = 1.
	let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
	isAsmParserOnly = 1 in
	def PseudoLA : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
	"la", "$dst, $src">;

	// TLS initial-exec form; declared mayLoad = 1.
	let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
	isAsmParserOnly = 1 in
	def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
	"la.tls.ie", "$dst, $src">;

	// TLS global-dynamic form; declared mayLoad = 1.
	let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
	isAsmParserOnly = 1 in
	def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
	"la.tls.gd", "$dst, $src">;

	/// Loads

	// Selection patterns for one load fragment/instruction pair. Five address
	// shapes are covered: a bare GPR, a bare frame index, GPR + simm12,
	// frame index + simm12, and frame index `or` simm12 when IsOrAdd accepts it.
	multiclass LdPat<PatFrag LoadOp, RVInst Inst> {
	def : Pat<(LoadOp GPR:$rs1), (Inst GPR:$rs1, 0)>;
	def : Pat<(LoadOp AddrFI:$rs1), (Inst AddrFI:$rs1, 0)>;
	def : Pat<(LoadOp (add GPR:$rs1, simm12:$imm12)),
	(Inst GPR:$rs1, simm12:$imm12)>;
	def : Pat<(LoadOp (add AddrFI:$rs1, simm12:$imm12)),
	(Inst AddrFI:$rs1, simm12:$imm12)>;
	def : Pat<(LoadOp (IsOrAdd AddrFI:$rs1, simm12:$imm12)),
	(Inst AddrFI:$rs1, simm12:$imm12)>;
	}

	// Any-extending loads use the sign-extending instructions (LB/LH). The plain
	// `load` -> LW mapping is restricted to RV32.
	defm : LdPat<sextloadi8, LB>;
	defm : LdPat<extloadi8, LB>;
	defm : LdPat<sextloadi16, LH>;
	defm : LdPat<extloadi16, LH>;
	defm : LdPat<load, LW>, Requires<[IsRV32]>;
	defm : LdPat<zextloadi8, LBU>;
	defm : LdPat<zextloadi16, LHU>;

	/// Stores

	// Selection patterns for one store fragment/instruction pair, covering the
	// same five address shapes as LdPat. StTy is the register class of the
	// stored value.
	multiclass StPat<PatFrag StoreOp, RVInst Inst, RegisterClass StTy> {
	def : Pat<(StoreOp StTy:$rs2, GPR:$rs1), (Inst StTy:$rs2, GPR:$rs1, 0)>;
	def : Pat<(StoreOp StTy:$rs2, AddrFI:$rs1), (Inst StTy:$rs2, AddrFI:$rs1, 0)>;
	def : Pat<(StoreOp StTy:$rs2, (add GPR:$rs1, simm12:$imm12)),
	(Inst StTy:$rs2, GPR:$rs1, simm12:$imm12)>;
	def : Pat<(StoreOp StTy:$rs2, (add AddrFI:$rs1, simm12:$imm12)),
	(Inst StTy:$rs2, AddrFI:$rs1, simm12:$imm12)>;
	def : Pat<(StoreOp StTy:$rs2, (IsOrAdd AddrFI:$rs1, simm12:$imm12)),
	(Inst StTy:$rs2, AddrFI:$rs1, simm12:$imm12)>;
	}

	// The plain `store` -> SW mapping is restricted to RV32.
	defm : StPat<truncstorei8, SB, GPR>;
	defm : StPat<truncstorei16, SH, GPR>;
	defm : StPat<store, SW, GPR>, Requires<[IsRV32]>;

	/// Fences

	// Refer to Table A.6 in the version 2.3 draft of the RISC-V Instruction Set
	// Manual: Volume I.
	// The first atomic_fence operand is matched as an XLenVT constant (4..7) and
	// selects the mapping below; the second operand (timm) is ignored. Going by
	// the per-pattern comments, the FENCE operands encode r as 0b10, w as 0b1 and
	// rw as 0b11.

	// fence acquire -> fence r, rw
	def : Pat<(atomic_fence (XLenVT 4), (timm)), (FENCE 0b10, 0b11)>;
	// fence release -> fence rw, w
	def : Pat<(atomic_fence (XLenVT 5), (timm)), (FENCE 0b11, 0b1)>;
	// fence acq_rel -> fence.tso
	def : Pat<(atomic_fence (XLenVT 6), (timm)), (FENCE_TSO)>;
	// fence seq_cst -> fence rw, rw
	def : Pat<(atomic_fence (XLenVT 7), (timm)), (FENCE 0b11, 0b11)>;

	// Lowering for atomic load and store is defined in RISCVInstrInfoA.td.
	// Although these are lowered to fence+load/store instructions defined in the
	// base RV32I/RV64I ISA, this lowering is only used when the A extension is
	// present. This is necessary as it isn't valid to mix __atomic_* libcalls
	// with inline atomic operations for the same object.

	/// Other pseudo-instructions

	// Call-sequence pseudos selected from callseq_start and callseq_end. Each
	// takes two immediate amounts ($amt1, $amt2) and produces no outputs.
	// Pessimistically assume the stack pointer will be clobbered: both pseudos
	// are declared as defining and using X2.
	let Defs = [X2], Uses = [X2] in {
	def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
	[(callseq_start timm:$amt1, timm:$amt2)]>;
	def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
	[(callseq_end timm:$amt1, timm:$amt2)]>;
	} // Defs = [X2], Uses = [X2]

	/// RV64 patterns

	// Everything inside this block is guarded by the IsRV64 predicate.
	let Predicates = [IsRV64] in {

	/// sext and zext

	// Sign-extension from i32 is selected as ADDIW with an immediate of 0.
	def : Pat<(sext_inreg GPR:$rs1, i32), (ADDIW GPR:$rs1, 0)>;
	// Masking with 0xffffffff (zero-extension from 32 bits) is selected as a
	// shift left by 32 followed by a logical shift right by 32.
	def : Pat<(and GPR:$rs1, 0xffffffff), (SRLI (SLLI GPR:$rs1, 32), 32)>;

	/// ALU operations

	// An add/sub/shl whose result is sign-extended from i32 is folded into the
	// corresponding *W instruction.
	def : Pat<(sext_inreg (add GPR:$rs1, GPR:$rs2), i32),
	(ADDW GPR:$rs1, GPR:$rs2)>;
	def : Pat<(sext_inreg (add GPR:$rs1, simm12:$imm12), i32),
	(ADDIW GPR:$rs1, simm12:$imm12)>;
	def : Pat<(sext_inreg (sub GPR:$rs1, GPR:$rs2), i32),
	(SUBW GPR:$rs1, GPR:$rs2)>;
	def : Pat<(sext_inreg (shl GPR:$rs1, uimm5:$shamt), i32),
	(SLLIW GPR:$rs1, uimm5:$shamt)>;
	// (srl (zexti32 ...), uimm5:$shamt) is matched with custom code due to the
	// need to undo manipulation of the mask value performed by DAGCombine.
	// Arithmetic shift right of a value sign-extended from i32, by a uimm5
	// amount, is selected as SRAIW.
	def : Pat<(sra (sext_inreg GPR:$rs1, i32), uimm5:$shamt),
	(SRAIW GPR:$rs1, uimm5:$shamt)>;

	// The riscv_sllw/srlw/sraw nodes map directly onto SLLW/SRLW/SRAW through
	// PatGprGpr (defined elsewhere in this file).
	def : PatGprGpr<riscv_sllw, SLLW>;
	def : PatGprGpr<riscv_srlw, SRLW>;
	def : PatGprGpr<riscv_sraw, SRAW>;

	/// Loads

	// 32-bit extending loads: sign- and any-extending forms select LW, the
	// zero-extending form selects LWU. The plain `load` selects LD here.
	defm : LdPat<sextloadi32, LW>;
	defm : LdPat<extloadi32, LW>;
	defm : LdPat<zextloadi32, LWU>;
	defm : LdPat<load, LD>;

	/// Stores

	// Truncating 32-bit stores select SW; the plain `store` selects SD here.
	defm : StPat<truncstorei32, SW, GPR>;
	defm : StPat<store, SD, GPR>;
	} // Predicates = [IsRV64]

	/// readcyclecounter
	// On RV64, we can directly read the 64-bit "cycle" CSR.
	// The pattern selects CSRRS on CYCLE.Encoding with X0 as the source.
	let Predicates = [IsRV64] in
	def : Pat<(readcyclecounter), (CSRRS CYCLE.Encoding, X0)>;
	// On RV32, ReadCycleWide will be expanded to the suggested loop reading both
	// halves of the 64-bit "cycle" CSR.
	// The pseudo has two GPR results ($lo, $hi), no inputs, an empty pattern
	// list and an empty assembly string; usesCustomInserter = 1 requests the
	// custom expansion, and it carries no scheduling info.
	let Predicates = [IsRV32], usesCustomInserter = 1, hasSideEffects = 0,
	mayLoad = 0, mayStore = 0, hasNoSchedulingInfo = 1 in
	def ReadCycleWide : Pseudo<(outs GPR:$lo, GPR:$hi), (ins), [], "", "">;

	/// traps

	// Both patterns below take no operands and have no Predicates/Requires
	// attached at this point in the file.

	// We lower `trap` to `unimp`, as this causes a hard exception on nearly all
	// systems.
	def : Pat<(trap), (UNIMP)>;

	// We lower `debugtrap` to `ebreak`, as this will get the attention of the
	// debugger if possible.
	def : Pat<(debugtrap), (EBREAK)>;

	//===----------------------------------------------------------------------===//
	// Standard extensions
	//===----------------------------------------------------------------------===//

	include "RISCVInstrInfoM.td"
	include "RISCVInstrInfoA.td"
	include "RISCVInstrInfoF.td"
	include "RISCVInstrInfoD.td"
	include "RISCVInstrInfoC.td"
	include "RISCVInstrInfoB.td"
	include "RISCVInstrInfoV.td"
	diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
	index 86aa85e965f6..1671917157f4 100644
	--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
	+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
	@@ -1,50254 +1,50264 @@
	//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the interfaces that X86 uses to lower LLVM code into a
	// selection DAG.
	//
	//===----------------------------------------------------------------------===//

	#include "X86ISelLowering.h"
	#include "MCTargetDesc/X86ShuffleDecode.h"
	#include "X86.h"
	#include "X86CallingConv.h"
	#include "X86FrameLowering.h"
	#include "X86InstrBuilder.h"
	#include "X86IntrinsicsInfo.h"
	#include "X86MachineFunctionInfo.h"
	#include "X86TargetMachine.h"
	#include "X86TargetObjectFile.h"
	#include "llvm/ADT/SmallBitVector.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/Analysis/BlockFrequencyInfo.h"
	#include "llvm/Analysis/EHPersonalities.h"
	#include "llvm/Analysis/ProfileSummaryInfo.h"
	#include "llvm/Analysis/VectorUtils.h"
	#include "llvm/CodeGen/IntrinsicLowering.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/WinEHFuncInfo.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/DiagnosticInfo.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalAlias.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <bitset>
	#include <cctype>
	#include <numeric>
	using namespace llvm;

	#define DEBUG_TYPE "x86-isel"

	STATISTIC(NumTailCalls, "Number of tail calls");

	/// Hidden experimental option `-x86-experimental-pref-loop-alignment`
	/// (default 4). Per its own description, the value is the preferred loop
	/// alignment expressed as log2 of a byte count.
	static cl::opt<int> ExperimentalPrefLoopAlignment(
	    "x86-experimental-pref-loop-alignment", cl::init(4),
	    cl::desc(
	        // The trailing space keeps the sentence and the parenthetical apart
	        // once the adjacent literals are concatenated; without it the help
	        // text reads "(as log2 bytes)(the last ...".
	        "Sets the preferable loop alignment for experiments (as log2 bytes) "
	        "(the last x86-experimental-pref-loop-alignment bits"
	        " of the loop header PC will be 0)."),
	    cl::Hidden);

	/// Hidden boolean option `-mul-constant-optimization`, enabled by default.
	/// Per its description it controls replacing `mul x, Const` with cheaper
	/// instruction sequences.
	static cl::opt<bool> MulConstantOptimization(
	    "mul-constant-optimization", cl::Hidden, cl::init(true),
	    cl::desc("Replace 'mul x, Const' with more effective "
	             "instructions like SHIFT, LEA, etc."));

	/// Hidden boolean option `-x86-experimental-unordered-atomic-isel`,
	/// disabled by default. Per its description it switches unordered atomic
	/// loads and stores from AtomicSDNode to LoadSDNode/StoreSDNode.
	static cl::opt<bool> ExperimentalUnorderedISEL(
	    "x86-experimental-unordered-atomic-isel", cl::Hidden, cl::init(false),
	    cl::desc("Use LoadSDNode and StoreSDNode instead of AtomicSDNode "
	             "for unordered atomic loads and stores respectively."));

	/// Emit an "unsupported" diagnostic for the function being lowered in
	/// \p DAG, with message \p Msg and the debug location taken from \p dl.
	/// Use it when the user asks for something that cannot be supported, such as
	/// returning a double without SSE2 enabled on x86_64. Unlike
	/// report_fatal_error this is not fatal: the diagnostic is handed to the
	/// context and control returns, so callers should attempt to recover without
	/// crashing.
	static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
	                             const char *Msg) {
	  const auto &Fn = DAG.getMachineFunction().getFunction();
	  auto &Ctx = *DAG.getContext();
	  // The diagnostic is deliberately constructed as a temporary inside the
	  // call, so it (and anything created while converting Msg) stays alive
	  // until diagnose() returns.
	  Ctx.diagnose(DiagnosticInfoUnsupported(Fn, Msg, dl.getDebugLoc()));
	}

	X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
	const X86Subtarget &STI)
	: TargetLowering(TM), Subtarget(STI) {
	bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
	X86ScalarSSEf64 = Subtarget.hasSSE2();
	X86ScalarSSEf32 = Subtarget.hasSSE1();
	MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));

	// Set up the TargetLowering object.

	// X86 is weird. It always uses i8 for shift amounts and setcc results.
	setBooleanContents(ZeroOrOneBooleanContent);
	// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

	// For 64-bit, since we have so many registers, use the ILP scheduler.
	// For 32-bit, use the register pressure specific scheduling.
	// For Atom, always use ILP scheduling.
	if (Subtarget.isAtom())
	setSchedulingPreference(Sched::ILP);
	else if (Subtarget.is64Bit())
	setSchedulingPreference(Sched::ILP);
	else
	setSchedulingPreference(Sched::RegPressure);
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

	// Bypass expensive divides and use cheaper ones.
	if (TM.getOptLevel() >= CodeGenOpt::Default) {
	if (Subtarget.hasSlowDivide32())
	addBypassSlowDiv(32, 8);
	if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
	addBypassSlowDiv(64, 32);
	}

	if (Subtarget.isTargetWindowsMSVC() \|\|
	Subtarget.isTargetWindowsItanium()) {
	// Setup Windows compiler runtime calls.
	setLibcallName(RTLIB::SDIV_I64, "_alldiv");
	setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
	setLibcallName(RTLIB::SREM_I64, "_allrem");
	setLibcallName(RTLIB::UREM_I64, "_aullrem");
	setLibcallName(RTLIB::MUL_I64, "_allmul");
	setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
	}

	if (Subtarget.getTargetTriple().isOSMSVCRT()) {
	// MSVCRT doesn't have powi; fall back to pow
	setLibcallName(RTLIB::POWI_F32, nullptr);
	setLibcallName(RTLIB::POWI_F64, nullptr);
	}

	// If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
	// 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
	// FIXME: Should we be limiting the atomic size on other configs? Default is
	// 1024.
	if (!Subtarget.hasCmpxchg8b())
	setMaxAtomicSizeInBitsSupported(32);

	// Set up the register classes.
	addRegisterClass(MVT::i8, &X86::GR8RegClass);
	addRegisterClass(MVT::i16, &X86::GR16RegClass);
	addRegisterClass(MVT::i32, &X86::GR32RegClass);
	if (Subtarget.is64Bit())
	addRegisterClass(MVT::i64, &X86::GR64RegClass);

	for (MVT VT : MVT::integer_valuetypes())
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

	// We don't accept any truncstore of integer registers.
	setTruncStoreAction(MVT::i64, MVT::i32, Expand);
	setTruncStoreAction(MVT::i64, MVT::i16, Expand);
	setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
	setTruncStoreAction(MVT::i32, MVT::i16, Expand);
	setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
	setTruncStoreAction(MVT::i16, MVT::i8, Expand);

	setTruncStoreAction(MVT::f64, MVT::f32, Expand);

	// SETOEQ and SETUNE require checking two conditions.
	for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
	setCondCodeAction(ISD::SETOEQ, VT, Expand);
	setCondCodeAction(ISD::SETUNE, VT, Expand);
	}

	// Integer absolute.
	if (Subtarget.hasCMov()) {
	setOperationAction(ISD::ABS , MVT::i16 , Custom);
	setOperationAction(ISD::ABS , MVT::i32 , Custom);
	}
	setOperationAction(ISD::ABS , MVT::i64 , Custom);

	// Funnel shifts.
	for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
	// For slow shld targets we only lower for code size.
	LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;

	setOperationAction(ShiftOp , MVT::i8 , Custom);
	setOperationAction(ShiftOp , MVT::i16 , Custom);
	setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
	if (Subtarget.is64Bit())
	setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
	}

	if (!Subtarget.useSoftFloat()) {
	// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
	// operation.
	setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
	setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
	// We have an algorithm for SSE2, and we turn this into a 64-bit
	// FILD or VCVTUSI2SS/SD for other targets.
	setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
	// We have an algorithm for SSE2->double, and we turn this into a
	// 64-bit FILD followed by conditional FADD for other targets.
	setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);

	// Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
	// this operation.
	setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
	setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
	// SSE has no i16 to fp conversion, only i32. We promote in the handler
	// to allow f80 to use i16 and f64 to use i16 with sse1 only
	setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
	setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
	// f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
	setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
	// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
	// are Legal, f80 is custom lowered.
	setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);

	// Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
	// this operation.
	setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
	// FIXME: This doesn't generate invalid exception when it should. PR44019.
	setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
	setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
	setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
	setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
	// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
	// are Legal, f80 is custom lowered.
	setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
	setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);

	// Handle FP_TO_UINT by promoting the destination to a larger signed
	// conversion.
	setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
	// FIXME: This doesn't generate invalid exception when it should. PR44019.
	setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
	setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
	// FIXME: This doesn't generate invalid exception when it should. PR44019.
	setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
	setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
	setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
	setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);

	setOperationAction(ISD::LRINT, MVT::f32, Custom);
	setOperationAction(ISD::LRINT, MVT::f64, Custom);
	setOperationAction(ISD::LLRINT, MVT::f32, Custom);
	setOperationAction(ISD::LLRINT, MVT::f64, Custom);

	if (!Subtarget.is64Bit()) {
	setOperationAction(ISD::LRINT, MVT::i64, Custom);
	setOperationAction(ISD::LLRINT, MVT::i64, Custom);
	}
	}

	// Handle address space casts between mixed sized pointers.
	setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
	setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);

	// TODO: when we have SSE, these could be more efficient, by using movd/movq.
	if (!X86ScalarSSEf64) {
	setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
	setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
	// Without SSE, i64->f64 goes through memory.
	setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
	}
	} else if (!Subtarget.is64Bit())
	setOperationAction(ISD::BITCAST , MVT::i64 , Custom);

	// Scalar integer divide and remainder are lowered to use operations that
	// produce two results, to match the available instructions. This exposes
	// the two-result form to trivial CSE, which is able to combine x/y and x%y
	// into a single instruction.
	//
	// Scalar integer multiply-high is also lowered to use two-result
	// operations, to match the available instructions. However, plain multiply
	// (low) operations are left as Legal, as there are single-result
	// instructions for this in x86. Using the two-result multiply instructions
	// when both high and low results are needed must be arranged by dagcombine.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	}

	setOperationAction(ISD::BR_JT , MVT::Other, Expand);
	setOperationAction(ISD::BRCOND , MVT::Other, Custom);
	for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
	MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::BR_CC, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	}
	if (Subtarget.is64Bit())
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);

	setOperationAction(ISD::FREM , MVT::f32 , Expand);
	setOperationAction(ISD::FREM , MVT::f64 , Expand);
	setOperationAction(ISD::FREM , MVT::f80 , Expand);
	setOperationAction(ISD::FREM , MVT::f128 , Expand);
	setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);

	// Promote the i8 variants and force them on up to i32 which has a shorter
	// encoding.
	setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
	setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
	if (!Subtarget.hasBMI()) {
	setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
	setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
	}
	}

	if (Subtarget.hasLZCNT()) {
	// When promoting the i8 variants, force them to i32 for a shorter
	// encoding.
	setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
	setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
	} else {
	for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::CTLZ , VT, Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
	}
	}

	for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
	ISD::STRICT_FP_TO_FP16}) {
	// Special handling for half-precision floating point conversions.
	// If we don't have F16C support, then lower half float conversions
	// into library calls.
	setOperationAction(
	Op, MVT::f32,
	(!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
	// There's never any support for operations beyond MVT::f32.
	setOperationAction(Op, MVT::f64, Expand);
	setOperationAction(Op, MVT::f80, Expand);
	setOperationAction(Op, MVT::f128, Expand);
	}

	setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
	setTruncStoreAction(MVT::f32, MVT::f16, Expand);
	setTruncStoreAction(MVT::f64, MVT::f16, Expand);
	setTruncStoreAction(MVT::f80, MVT::f16, Expand);
	setTruncStoreAction(MVT::f128, MVT::f16, Expand);

	if (Subtarget.hasPOPCNT()) {
	setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
	} else {
	setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
	setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
	setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
	if (Subtarget.is64Bit())
	setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
	else
	setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
	}

	setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);

	if (!Subtarget.hasMOVBE())
	setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

	// X86 wants to expand cmov itself.
	for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
	setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
	}
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	}

	// Custom action for SELECT MMX and expand action for SELECT_CC MMX
	setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

	setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
	// NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
	// LLVM/Clang supports zero-cost DWARF and SEH exception handling.
	setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
	setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
	setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
	if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
	setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

	// Darwin ABI issue.
	for (auto VT : { MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::ConstantPool , VT, Custom);
	setOperationAction(ISD::JumpTable , VT, Custom);
	setOperationAction(ISD::GlobalAddress , VT, Custom);
	setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
	setOperationAction(ISD::ExternalSymbol , VT, Custom);
	setOperationAction(ISD::BlockAddress , VT, Custom);
	}

	// 64-bit shl, sra, srl (iff 32-bit x86)
	for (auto VT : { MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::SHL_PARTS, VT, Custom);
	setOperationAction(ISD::SRA_PARTS, VT, Custom);
	setOperationAction(ISD::SRL_PARTS, VT, Custom);
	}

	if (Subtarget.hasSSEPrefetch() \|\| Subtarget.has3DNow())
	setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

	setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);

	// Expand certain atomics
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
	setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
	}

	if (!Subtarget.is64Bit())
	setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);

	if (Subtarget.hasCmpxchg16b()) {
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
	}

	// FIXME - use subtarget debug flags
	if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
	!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
	TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
	setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
	}

	setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
	setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

	setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
	setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

	setOperationAction(ISD::TRAP, MVT::Other, Legal);
	setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

	// VASTART needs to be custom lowered to use the VarArgsFrameIndex
	setOperationAction(ISD::VASTART , MVT::Other, Custom);
	setOperationAction(ISD::VAEND , MVT::Other, Expand);
	bool Is64Bit = Subtarget.is64Bit();
	setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
	setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

	setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

	setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

	// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
	setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
	setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

	if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
	// f32 and f64 use SSE.
	// Set up the FP register classes.
	addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
	: &X86::FR32RegClass);
	addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
	: &X86::FR64RegClass);

	// Disable f32->f64 extload as we can only generate this in one instruction
	// under optsize. So its easier to pattern match (fpext (load)) for that
	// case instead of needing to emit 2 instructions for extload in the
	// non-optsize case.
	setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);

	for (auto VT : { MVT::f32, MVT::f64 }) {
	// Use ANDPD to simulate FABS.
	setOperationAction(ISD::FABS, VT, Custom);

	// Use XORP to simulate FNEG.
	setOperationAction(ISD::FNEG, VT, Custom);

	// Use ANDPD and ORPD to simulate FCOPYSIGN.
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);

	// These might be better off as horizontal vector ops.
	setOperationAction(ISD::FADD, VT, Custom);
	setOperationAction(ISD::FSUB, VT, Custom);

	// We don't support sin/cos/fmod
	setOperationAction(ISD::FSIN , VT, Expand);
	setOperationAction(ISD::FCOS , VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	}

	// Lower this to MOVMSK plus an AND.
	setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
	setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

	} else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
	(UseX87 \|\| Is64Bit)) {
	// Use SSE for f32, x87 for f64.
	// Set up the FP register classes.
	addRegisterClass(MVT::f32, &X86::FR32RegClass);
	if (UseX87)
	addRegisterClass(MVT::f64, &X86::RFP64RegClass);

	// Use ANDPS to simulate FABS.
	setOperationAction(ISD::FABS , MVT::f32, Custom);

	// Use XORP to simulate FNEG.
	setOperationAction(ISD::FNEG , MVT::f32, Custom);

	if (UseX87)
	setOperationAction(ISD::UNDEF, MVT::f64, Expand);

	// Use ANDPS and ORPS to simulate FCOPYSIGN.
	if (UseX87)
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

	// We don't support sin/cos/fmod
	setOperationAction(ISD::FSIN , MVT::f32, Expand);
	setOperationAction(ISD::FCOS , MVT::f32, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

	if (UseX87) {
	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN, MVT::f64, Expand);
	setOperationAction(ISD::FCOS, MVT::f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
	}
	} else if (UseX87) {
	// f32 and f64 in x87.
	// Set up the FP register classes.
	addRegisterClass(MVT::f64, &X86::RFP64RegClass);
	addRegisterClass(MVT::f32, &X86::RFP32RegClass);

	for (auto VT : { MVT::f32, MVT::f64 }) {
	setOperationAction(ISD::UNDEF, VT, Expand);
	setOperationAction(ISD::FCOPYSIGN, VT, Expand);

	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN , VT, Expand);
	setOperationAction(ISD::FCOS , VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	}
	}

	// Expand FP32 immediates into loads from the stack, save special cases.
	if (isTypeLegal(MVT::f32)) {
	if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
	addLegalFPImmediate(APFloat(+0.0f)); // FLD0
	addLegalFPImmediate(APFloat(+1.0f)); // FLD1
	addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
	} else // SSE immediates.
	addLegalFPImmediate(APFloat(+0.0f)); // xorps
	}
	// Expand FP64 immediates into loads from the stack, save special cases.
	if (isTypeLegal(MVT::f64)) {
	if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
	addLegalFPImmediate(APFloat(+0.0)); // FLD0
	addLegalFPImmediate(APFloat(+1.0)); // FLD1
	addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
	} else // SSE immediates.
	addLegalFPImmediate(APFloat(+0.0)); // xorpd
	}
	// Handle constrained floating-point operations of scalar.
	setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
	setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
	setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
	setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
	setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
	setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
	setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
	setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
	setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
	setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
	setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
	setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
	setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);

	// We don't support FMA.
	setOperationAction(ISD::FMA, MVT::f64, Expand);
	setOperationAction(ISD::FMA, MVT::f32, Expand);

	// f80 always uses X87.
	if (UseX87) {
	addRegisterClass(MVT::f80, &X86::RFP80RegClass);
	setOperationAction(ISD::UNDEF, MVT::f80, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
	{
	APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
	addLegalFPImmediate(TmpFlt); // FLD0
	TmpFlt.changeSign();
	addLegalFPImmediate(TmpFlt); // FLD0/FCHS

	bool ignored;
	APFloat TmpFlt2(+1.0);
	TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
	&ignored);
	addLegalFPImmediate(TmpFlt2); // FLD1
	TmpFlt2.changeSign();
	addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
	}

	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN , MVT::f80, Expand);
	setOperationAction(ISD::FCOS , MVT::f80, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

	setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
	setOperationAction(ISD::FCEIL, MVT::f80, Expand);
	setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
	setOperationAction(ISD::FRINT, MVT::f80, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
	setOperationAction(ISD::FMA, MVT::f80, Expand);
	setOperationAction(ISD::LROUND, MVT::f80, Expand);
	setOperationAction(ISD::LLROUND, MVT::f80, Expand);
	setOperationAction(ISD::LRINT, MVT::f80, Custom);
	setOperationAction(ISD::LLRINT, MVT::f80, Custom);

	// Handle constrained floating-point operations of scalar.
	setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
	setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
	setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
	setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
	setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
	setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
	// FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
	// as Custom.
	setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
	}

	// f128 uses xmm registers, but most operations require libcalls.
	if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
	addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps

	setOperationAction(ISD::FADD, MVT::f128, LibCall);
	setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
	setOperationAction(ISD::FSUB, MVT::f128, LibCall);
	setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
	setOperationAction(ISD::FDIV, MVT::f128, LibCall);
	setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
	setOperationAction(ISD::FMUL, MVT::f128, LibCall);
	setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
	setOperationAction(ISD::FMA, MVT::f128, LibCall);
	setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);

	setOperationAction(ISD::FABS, MVT::f128, Custom);
	setOperationAction(ISD::FNEG, MVT::f128, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);

	setOperationAction(ISD::FSIN, MVT::f128, LibCall);
	setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
	setOperationAction(ISD::FCOS, MVT::f128, LibCall);
	setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
	setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
	// No STRICT_FSINCOS
	setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
	setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);

	setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
	setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
	// We need to custom handle any FP_ROUND with an f128 input, but
	// LegalizeDAG uses the result type to know when to run a custom handler.
	// So we have to list all legal floating point result types here.
	if (isTypeLegal(MVT::f32)) {
	setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
	setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
	}
	if (isTypeLegal(MVT::f64)) {
	setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
	setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
	}
	if (isTypeLegal(MVT::f80)) {
	setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
	setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
	}

	setOperationAction(ISD::SETCC, MVT::f128, Custom);

	setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
	setTruncStoreAction(MVT::f128, MVT::f32, Expand);
	setTruncStoreAction(MVT::f128, MVT::f64, Expand);
	setTruncStoreAction(MVT::f128, MVT::f80, Expand);
	}

	// Always use a library call for pow.
	setOperationAction(ISD::FPOW , MVT::f32 , Expand);
	setOperationAction(ISD::FPOW , MVT::f64 , Expand);
	setOperationAction(ISD::FPOW , MVT::f80 , Expand);
	setOperationAction(ISD::FPOW , MVT::f128 , Expand);

	setOperationAction(ISD::FLOG, MVT::f80, Expand);
	setOperationAction(ISD::FLOG2, MVT::f80, Expand);
	setOperationAction(ISD::FLOG10, MVT::f80, Expand);
	setOperationAction(ISD::FEXP, MVT::f80, Expand);
	setOperationAction(ISD::FEXP2, MVT::f80, Expand);
	setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
	setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

	// Some FP actions are always expanded for vector types.
	for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
	MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
	setOperationAction(ISD::FSIN, VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	setOperationAction(ISD::FCOS, VT, Expand);
	setOperationAction(ISD::FREM, VT, Expand);
	setOperationAction(ISD::FCOPYSIGN, VT, Expand);
	setOperationAction(ISD::FPOW, VT, Expand);
	setOperationAction(ISD::FLOG, VT, Expand);
	setOperationAction(ISD::FLOG2, VT, Expand);
	setOperationAction(ISD::FLOG10, VT, Expand);
	setOperationAction(ISD::FEXP, VT, Expand);
	setOperationAction(ISD::FEXP2, VT, Expand);
	}

	// First set operation action for all vector types to either promote
	// (for widening) or expand (for scalarization). Then we will selectively
	// turn on ones that can be effectively codegen'd.
	for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
	setOperationAction(ISD::FMA, VT, Expand);
	setOperationAction(ISD::FFLOOR, VT, Expand);
	setOperationAction(ISD::FCEIL, VT, Expand);
	setOperationAction(ISD::FTRUNC, VT, Expand);
	setOperationAction(ISD::FRINT, VT, Expand);
	setOperationAction(ISD::FNEARBYINT, VT, Expand);
	setOperationAction(ISD::SMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::UMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::SDIVREM, VT, Expand);
	setOperationAction(ISD::UDIVREM, VT, Expand);
	setOperationAction(ISD::CTPOP, VT, Expand);
	setOperationAction(ISD::CTTZ, VT, Expand);
	setOperationAction(ISD::CTLZ, VT, Expand);
	setOperationAction(ISD::ROTL, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);
	setOperationAction(ISD::BSWAP, VT, Expand);
	setOperationAction(ISD::SETCC, VT, Expand);
	setOperationAction(ISD::FP_TO_UINT, VT, Expand);
	setOperationAction(ISD::FP_TO_SINT, VT, Expand);
	setOperationAction(ISD::UINT_TO_FP, VT, Expand);
	setOperationAction(ISD::SINT_TO_FP, VT, Expand);
	setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
	setOperationAction(ISD::TRUNCATE, VT, Expand);
	setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
	setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
	setOperationAction(ISD::ANY_EXTEND, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
	setTruncStoreAction(InnerVT, VT, Expand);

	setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

	// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
	// types, we have to deal with them whether we ask for Expansion or not.
	// Setting Expand causes its own optimisation problems though, so leave
	// them legal.
	if (VT.getVectorElementType() == MVT::i1)
	setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

	// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
	// split/scalarized right now.
	if (VT.getVectorElementType() == MVT::f16)
	setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
	}
	}

	// FIXME: In order to prevent SSE instructions being expanded to MMX ones
	// with -msoft-float, disable use of MMX as well.
	if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
	addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
	// No operations on x86mmx supported, everything uses intrinsics.
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
	addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
	setOperationAction(ISD::FABS, MVT::v4f32, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
	setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
	setOperationAction(ISD::SELECT, MVT::v4f32, Custom);

	setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
	setOperationAction(ISD::STORE, MVT::v2f32, Custom);

	setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
	addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
	// registers cannot be used even for integer operations.
	addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
	MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
	setOperationAction(ISD::SDIV, VT, Custom);
	setOperationAction(ISD::SREM, VT, Custom);
	setOperationAction(ISD::UDIV, VT, Custom);
	setOperationAction(ISD::UREM, VT, Custom);
	}

	setOperationAction(ISD::MUL, MVT::v2i8, Custom);
	setOperationAction(ISD::MUL, MVT::v4i8, Custom);
	setOperationAction(ISD::MUL, MVT::v8i8, Custom);

	setOperationAction(ISD::MUL, MVT::v16i8, Custom);
	setOperationAction(ISD::MUL, MVT::v4i32, Custom);
	setOperationAction(ISD::MUL, MVT::v2i64, Custom);
	setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
	setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
	setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
	setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
	setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
	setOperationAction(ISD::MUL, MVT::v8i16, Legal);
	setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
	setOperationAction(ISD::FABS, MVT::v2f64, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
	setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
	setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
	setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
	}

	setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
	setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
	setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
	setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
	setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
	setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
	setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
	setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
	setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
	setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
	setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
	setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);

	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
	setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::ABS, VT, Custom);

	// The condition codes aren't legal in SSE/AVX and under AVX512 we use
	// setcc all the way to isel and prefer SETGT in some isel patterns.
	setCondCodeAction(ISD::SETLT, VT, Custom);
	setCondCodeAction(ISD::SETLE, VT, Custom);
	}

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	}

	for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);

	if (VT == MVT::v2i64 && !Subtarget.is64Bit())
	continue;

	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	}

	// Custom lower v2f64 and 128-bit integer vector (v2i64/v4i32/v8i16/v16i8) selects.
	setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
	setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
	setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
	setOperationAction(ISD::SELECT, MVT::v16i8, Custom);

	setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
	setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
	setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);

	// Custom legalize these to avoid over promotion or custom promotion.
	for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
	setOperationAction(ISD::FP_TO_SINT, VT, Custom);
	setOperationAction(ISD::FP_TO_UINT, VT, Custom);
	setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
	setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
	}

	setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
	setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
	setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);

	setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);

	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);

	// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
	setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
	setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);

	setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
	setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
	setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);

	// We want to legalize this to an f64 load rather than an i64 load on
	// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
	// store.
	setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
	setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
	setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
	setOperationAction(ISD::STORE, MVT::v2i32, Custom);
	setOperationAction(ISD::STORE, MVT::v4i16, Custom);
	setOperationAction(ISD::STORE, MVT::v8i8, Custom);

	setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
	setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
	setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
	if (!Subtarget.hasAVX512())
	setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

	setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);

	setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);

	// In the customized shift lowering, the legal v4i32/v2i64 cases
	// in AVX2 will be recognized.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	}

	setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
	setOperationAction(ISD::ROTL, MVT::v8i16, Custom);

	// With 512-bit registers or AVX512VL+BW, expanding (and promoting the
	// shifts) is better.
	if (!Subtarget.useAVX512Regs() &&
	!(Subtarget.hasBWI() && Subtarget.hasVLX()))
	setOperationAction(ISD::ROTL, MVT::v16i8, Custom);

	setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
	setOperationAction(ISD::ABS, MVT::v16i8, Legal);
	setOperationAction(ISD::ABS, MVT::v8i16, Legal);
	setOperationAction(ISD::ABS, MVT::v4i32, Legal);
	setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
	setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
	setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
	setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
	setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);

	// These might be better off as horizontal vector ops.
	setOperationAction(ISD::ADD, MVT::i16, Custom);
	setOperationAction(ISD::ADD, MVT::i32, Custom);
	setOperationAction(ISD::SUB, MVT::i16, Custom);
	setOperationAction(ISD::SUB, MVT::i32, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
	for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
	setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
	setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
	setOperationAction(ISD::FCEIL, RoundedTy, Legal);
	setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
	setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
	setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
	setOperationAction(ISD::FRINT, RoundedTy, Legal);
	setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
	setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
	setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);

	setOperationAction(ISD::FROUND, RoundedTy, Custom);
	}

	setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
	setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
	setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
	setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
	setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
	setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
	setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
	setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

	// FIXME: Do we need to handle scalar-to-vector here?
	setOperationAction(ISD::MUL, MVT::v4i32, Legal);

	// We directly match byte blends in the backend as they match the VSELECT
	// condition form.
	setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

	// SSE41 brings specific instructions for doing vector sign extend even in
	// cases where we don't have SRA.
	for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
	}

	// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
	for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
	setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
	}

	// i8 vectors are custom because the source register and source memory
	// operand types are not the same width.
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);

	if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
	// We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
	// do the pre and post work in the vector domain.
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
	// We need to mark SINT_TO_FP as Custom even though we want to expand it
	// so that DAG combine doesn't try to turn it into uint_to_fp.
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
	setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
	setOperationAction(ISD::ROTL, VT, Custom);

	// XOP can efficiently perform BITREVERSE with VPPERM.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
	setOperationAction(ISD::BITREVERSE, VT, Custom);

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
	setOperationAction(ISD::BITREVERSE, VT, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
	bool HasInt256 = Subtarget.hasInt256();

	addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);

	for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
	setOperationAction(ISD::FFLOOR, VT, Legal);
	setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
	setOperationAction(ISD::FCEIL, VT, Legal);
	setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
	setOperationAction(ISD::FTRUNC, VT, Legal);
	setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
	setOperationAction(ISD::FRINT, VT, Legal);
	setOperationAction(ISD::STRICT_FRINT, VT, Legal);
	setOperationAction(ISD::FNEARBYINT, VT, Legal);
	setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);

	setOperationAction(ISD::FROUND, VT, Custom);

	setOperationAction(ISD::FNEG, VT, Custom);
	setOperationAction(ISD::FABS, VT, Custom);
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	}

	// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
	// even though v8i16 is a legal type.
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
	setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
	setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
	setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);

	setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
	setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);

	setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
	setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
	setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
	setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
	setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
	setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
	setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
	setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
	setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
	setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
	setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);

	if (!Subtarget.hasAVX512())
	setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);

	// In the customized shift lowering, the legal v8i32/v4i64 cases
	// in AVX2 will be recognized.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	}

	// These types need custom splitting if their input is a 128-bit vector.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);

	setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
	setOperationAction(ISD::ROTL, MVT::v16i16, Custom);

	// With BWI, expanding (and promoting the shifts) is better.
	if (!Subtarget.useBWIRegs())
	setOperationAction(ISD::ROTL, MVT::v32i8, Custom);

	setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
	setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
	setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
	setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
	setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

	for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
	setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
	setOperationAction(ISD::ANY_EXTEND, VT, Custom);
	}

	setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
	setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
	setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTLZ, VT, Custom);

	// The condition codes aren't legal in SSE/AVX and under AVX512 we use
	// setcc all the way to isel and prefer SETGT in some isel patterns.
	setCondCodeAction(ISD::SETLT, VT, Custom);
	setCondCodeAction(ISD::SETLE, VT, Custom);
	}

	if (Subtarget.hasAnyFMA()) {
	for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
	MVT::v2f64, MVT::v4f64 }) {
	setOperationAction(ISD::FMA, VT, Legal);
	setOperationAction(ISD::STRICT_FMA, VT, Legal);
	}
	}

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
	}

	setOperationAction(ISD::MUL, MVT::v4i64, Custom);
	setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MUL, MVT::v32i8, Custom);

	setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
	setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
	setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

	setOperationAction(ISD::ABS, MVT::v4i64, Custom);
	setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
	setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
	setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
	setOperationAction(ISD::UMIN, MVT::v4i64, Custom);

	setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
	setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
	}

	for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
	}

	if (HasInt256) {
	// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
	// when we have a 256bit-wide blend with immediate.
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);

	// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
	for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
	setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
	}
	}

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
	setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::MSTORE, VT, Legal);
	}

	// Extract subvector is special because the value type
	// (result) is 128-bit but the source is 256-bit wide.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v4f32, MVT::v2f64 }) {
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
	}

	// Custom lower several nodes for 256-bit types.
	for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
	MVT::v8f32, MVT::v4f64 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
	setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
	setOperationAction(ISD::STORE, VT, Custom);
	}

	if (HasInt256) {
	setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

	// Custom legalize 2x32 to get a little better code.
	setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
	setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::MGATHER, VT, Custom);
	}
	}

	// This block controls legalization of the mask vector sizes that are
	// available with AVX512. 512-bit vectors are in a separate block controlled
	// by useAVX512Regs.
	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
	addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
	addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
	addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
	addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
	addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

	setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);

	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
	setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
	setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
	setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
	setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);

	// There is no byte sized k-register load or store without AVX512DQ.
	if (!Subtarget.hasDQI()) {
	setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
	setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
	setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
	setOperationAction(ISD::LOAD, MVT::v8i1, Custom);

	setOperationAction(ISD::STORE, MVT::v1i1, Custom);
	setOperationAction(ISD::STORE, MVT::v2i1, Custom);
	setOperationAction(ISD::STORE, MVT::v4i1, Custom);
	setOperationAction(ISD::STORE, MVT::v8i1, Custom);
	}

	// Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
	setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
	setOperationAction(ISD::ANY_EXTEND, VT, Custom);
	}

	for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::UADDSAT, VT, Custom);
	setOperationAction(ISD::SADDSAT, VT, Custom);
	setOperationAction(ISD::USUBSAT, VT, Custom);
	setOperationAction(ISD::SSUBSAT, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);
	}

	for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
	setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::TRUNCATE, VT, Custom);

	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	}

	for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
	}

	// This block controls legalization for 512-bit operations with 32/64 bit
	// elements. 512-bits can be disabled based on prefer-vector-width and
	// required-vector-width function attributes.
	if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
	bool HasBWI = Subtarget.hasBWI();

	addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
	addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
	addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
	addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
	addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
	addRegisterClass(MVT::v64i8, &X86::VR512RegClass);

	for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
	setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
	setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
	if (HasBWI)
	setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
	}

	for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::FNEG, VT, Custom);
	setOperationAction(ISD::FABS, VT, Custom);
	setOperationAction(ISD::FMA, VT, Legal);
	setOperationAction(ISD::STRICT_FMA, VT, Legal);
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	}

	for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
	setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
	setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
	setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
	}
	setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
	setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
	setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
	setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
	setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);

	setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
	setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
	setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
	setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
	setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
	setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
	setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
	setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
	setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
	setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
	setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
	setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);

	setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
	setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
	setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
	setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
	setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
	if (HasBWI)
	setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);

	// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
	// to 512-bit rather than use the AVX2 instructions so that we can use
	// k-masks.
	if (!Subtarget.hasVLX()) {
	for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
	setOperationAction(ISD::MLOAD, VT, Custom);
	setOperationAction(ISD::MSTORE, VT, Custom);
	}
	}

	setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
	setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
	setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);

	if (HasBWI) {
	// Extends from v64i1 masks to 512-bit vectors.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
	}

	for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::FFLOOR, VT, Legal);
	setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
	setOperationAction(ISD::FCEIL, VT, Legal);
	setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
	setOperationAction(ISD::FTRUNC, VT, Legal);
	setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
	setOperationAction(ISD::FRINT, VT, Legal);
	setOperationAction(ISD::STRICT_FRINT, VT, Legal);
	setOperationAction(ISD::FNEARBYINT, VT, Legal);
	setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);

	setOperationAction(ISD::FROUND, VT, Custom);
	}

	for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
	}

	setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
	setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
	setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
	setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);

	setOperationAction(ISD::MUL, MVT::v8i64, Custom);
	setOperationAction(ISD::MUL, MVT::v16i32, Legal);
	setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
	setOperationAction(ISD::MUL, MVT::v64i8, Custom);

	setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
	setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
	setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
	setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
	setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
	setOperationAction(ISD::MULHU, MVT::v64i8, Custom);

	setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

	for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);

	// The condition codes aren't legal in SSE/AVX and under AVX512 we use
	// setcc all the way to isel and prefer SETGT in some isel patterns.
	setCondCodeAction(ISD::SETLT, VT, Custom);
	setCondCodeAction(ISD::SETLE, VT, Custom);
	}
	for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);
	setOperationAction(ISD::ABS, VT, Legal);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::ROTL, VT, Custom);
	setOperationAction(ISD::ROTR, VT, Custom);
	setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
	setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
	}

	for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
	setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
	setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
	setOperationAction(ISD::CTLZ, VT, Custom);
	setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
	setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
	setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
	setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
	setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
	setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
	setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
	setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
	}

	if (Subtarget.hasDQI()) {
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
	setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
	setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
	setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);

	setOperationAction(ISD::MUL, MVT::v8i64, Legal);
	}

	if (Subtarget.hasCDI()) {
	// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
	for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
	setOperationAction(ISD::CTLZ, VT, Legal);
	}
	} // Subtarget.hasCDI()

	if (Subtarget.hasVPOPCNTDQ()) {
	for (auto VT : { MVT::v16i32, MVT::v8i64 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}

	// Extract subvector is special because the value type
	// (result) is 256-bit but the source is 512-bit wide.
	// 128-bit was made Legal under AVX1.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
	MVT::v8f32, MVT::v4f64 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

	for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
	MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	}

	for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	setOperationAction(ISD::MGATHER, VT, Custom);
	setOperationAction(ISD::MSCATTER, VT, Custom);
	}
	if (HasBWI) {
	for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	}
	} else {
	setOperationAction(ISD::STORE, MVT::v32i16, Custom);
	setOperationAction(ISD::STORE, MVT::v64i8, Custom);
	}

	if (Subtarget.hasVBMI2()) {
	for (auto VT : { MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
	setOperationAction(ISD::FSHL, VT, Custom);
	setOperationAction(ISD::FSHR, VT, Custom);
	}
	}
	}// useAVX512Regs

	// This block controls legalization for operations that don't have
	// pre-AVX512 equivalents. Without VLX we use 512-bit operations for
	// narrower widths.
	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
	// These operations are handled on non-VLX by artificially widening in
	// isel patterns.

	setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
	setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
	Subtarget.hasVLX() ? Legal : Custom);

	if (Subtarget.hasDQI()) {
	// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
	// v2f32 UINT_TO_FP is already custom under SSE2.
	assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
	isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
	"Unexpected operation action!");
	// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
	setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
	setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
	setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
	}

	for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);
	setOperationAction(ISD::ABS, VT, Legal);
	}

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::ROTL, VT, Custom);
	setOperationAction(ISD::ROTR, VT, Custom);
	}

	// Custom legalize 2x32 to get a little better code.
	setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
	setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::MSCATTER, VT, Custom);

	if (Subtarget.hasDQI()) {
	for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::SINT_TO_FP, VT,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::UINT_TO_FP, VT,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::FP_TO_SINT, VT,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::FP_TO_UINT, VT,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
	Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::MUL, VT, Legal);
	}
	}

	if (Subtarget.hasCDI()) {
	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::CTLZ, VT, Legal);
	}
	} // Subtarget.hasCDI()

	if (Subtarget.hasVPOPCNTDQ()) {
	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}
	}

	// This block controls legalization of v32i1/v64i1 which are available with
	// AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
	// useBWIRegs.
	if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
	addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
	addRegisterClass(MVT::v64i1, &X86::VK64RegClass);

	for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);
	setOperationAction(ISD::UADDSAT, VT, Custom);
	setOperationAction(ISD::SADDSAT, VT, Custom);
	setOperationAction(ISD::USUBSAT, VT, Custom);
	setOperationAction(ISD::SSUBSAT, VT, Custom);

	setOperationAction(ISD::TRUNCATE, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
	}

	for (auto VT : { MVT::v16i1, MVT::v32i1 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

	// Extends from v32i1 masks to 256-bit vectors.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);

	for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
	setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
	}

	// These operations are handled on non-VLX by artificially widening in
	// isel patterns.
	// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?

	if (Subtarget.hasBITALG()) {
	for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
	setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
	setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
	setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
	setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
	setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

	setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
	setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
	setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
	setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
	setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);

	if (Subtarget.hasBWI()) {
	setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
	setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
	}

	if (Subtarget.hasVBMI2()) {
	// TODO: Make these legal even without VLX?
	for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::FSHL, VT, Custom);
	setOperationAction(ISD::FSHR, VT, Custom);
	}
	}

	setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
	}

	// We want to custom lower some of our intrinsics.
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
	if (!Subtarget.is64Bit()) {
	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
	}

	// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
	// handle type legalization for these operations here.
	//
	// FIXME: We really should do custom legalization for addition and
	// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
	// than generic legalization for 64-bit multiplication-with-overflow, though.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	// Add/Sub/Mul with overflow operations are custom lowered.
	setOperationAction(ISD::SADDO, VT, Custom);
	setOperationAction(ISD::UADDO, VT, Custom);
	setOperationAction(ISD::SSUBO, VT, Custom);
	setOperationAction(ISD::USUBO, VT, Custom);
	setOperationAction(ISD::SMULO, VT, Custom);
	setOperationAction(ISD::UMULO, VT, Custom);

	// Support carry in as value rather than glue.
	setOperationAction(ISD::ADDCARRY, VT, Custom);
	setOperationAction(ISD::SUBCARRY, VT, Custom);
	setOperationAction(ISD::SETCCCARRY, VT, Custom);
	}

	if (!Subtarget.is64Bit()) {
	// These libcalls are not available in 32-bit.
	setLibcallName(RTLIB::SHL_I128, nullptr);
	setLibcallName(RTLIB::SRL_I128, nullptr);
	setLibcallName(RTLIB::SRA_I128, nullptr);
	setLibcallName(RTLIB::MUL_I128, nullptr);
	}

	// Combine sin / cos into _sincos_stret if it is available.
	if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
	getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
	setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
	setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
	}

	if (Subtarget.isTargetWin64()) {
	setOperationAction(ISD::SDIV, MVT::i128, Custom);
	setOperationAction(ISD::UDIV, MVT::i128, Custom);
	setOperationAction(ISD::SREM, MVT::i128, Custom);
	setOperationAction(ISD::UREM, MVT::i128, Custom);
	setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
	setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
	}

	// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
	// is. We should promote the value to 64-bits to solve this.
	// This is what the CRT headers do - `fmodf` is an inline header
	// function casting to f64 and calling `fmod`.
	if (Subtarget.is32Bit() &&
	(Subtarget.isTargetWindowsMSVC() \|\| Subtarget.isTargetWindowsItanium()))
	for (ISD::NodeType Op :
	{ISD::FCEIL, ISD::STRICT_FCEIL,
	ISD::FCOS, ISD::STRICT_FCOS,
	ISD::FEXP, ISD::STRICT_FEXP,
	ISD::FFLOOR, ISD::STRICT_FFLOOR,
	ISD::FREM, ISD::STRICT_FREM,
	ISD::FLOG, ISD::STRICT_FLOG,
	ISD::FLOG10, ISD::STRICT_FLOG10,
	ISD::FPOW, ISD::STRICT_FPOW,
	ISD::FSIN, ISD::STRICT_FSIN})
	if (isOperationExpand(Op, MVT::f32))
	setOperationAction(Op, MVT::f32, Promote);

	// We have target-specific dag combine patterns for the following nodes:
	setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
	setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
	setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
	setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
	setTargetDAGCombine(ISD::CONCAT_VECTORS);
	setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
	setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
	setTargetDAGCombine(ISD::BITCAST);
	setTargetDAGCombine(ISD::VSELECT);
	setTargetDAGCombine(ISD::SELECT);
	setTargetDAGCombine(ISD::SHL);
	setTargetDAGCombine(ISD::SRA);
	setTargetDAGCombine(ISD::SRL);
	setTargetDAGCombine(ISD::OR);
	setTargetDAGCombine(ISD::AND);
	setTargetDAGCombine(ISD::ADD);
	setTargetDAGCombine(ISD::FADD);
	setTargetDAGCombine(ISD::FSUB);
	setTargetDAGCombine(ISD::FNEG);
	setTargetDAGCombine(ISD::FMA);
	setTargetDAGCombine(ISD::STRICT_FMA);
	setTargetDAGCombine(ISD::FMINNUM);
	setTargetDAGCombine(ISD::FMAXNUM);
	setTargetDAGCombine(ISD::SUB);
	setTargetDAGCombine(ISD::LOAD);
	setTargetDAGCombine(ISD::MLOAD);
	setTargetDAGCombine(ISD::STORE);
	setTargetDAGCombine(ISD::MSTORE);
	setTargetDAGCombine(ISD::TRUNCATE);
	setTargetDAGCombine(ISD::ZERO_EXTEND);
	setTargetDAGCombine(ISD::ANY_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
	setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
	setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
	setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
	setTargetDAGCombine(ISD::SINT_TO_FP);
	setTargetDAGCombine(ISD::UINT_TO_FP);
	setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
	setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
	setTargetDAGCombine(ISD::SETCC);
	setTargetDAGCombine(ISD::MUL);
	setTargetDAGCombine(ISD::XOR);
	setTargetDAGCombine(ISD::MSCATTER);
	setTargetDAGCombine(ISD::MGATHER);
	setTargetDAGCombine(ISD::FP16_TO_FP);
	setTargetDAGCombine(ISD::FP_EXTEND);
	setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
	setTargetDAGCombine(ISD::FP_ROUND);

	computeRegisterProperties(Subtarget.getRegisterInfo());

	MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
	MaxStoresPerMemsetOptSize = 8;
	MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
	MaxStoresPerMemcpyOptSize = 4;
	MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
	MaxStoresPerMemmoveOptSize = 4;

	// TODO: These control memcmp expansion in CGP and could be raised higher, but
	// that needs to be benchmarked and balanced with the potential use of vector
	// load/store types (PR33329, PR33914).
	MaxLoadsPerMemcmp = 2;
	MaxLoadsPerMemcmpOptSize = 2;

	// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
	setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));

	// An out-of-order CPU can speculatively execute past a predictable branch,
	// but a conditional move could be stalled by an expensive earlier operation.
	PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
	EnableExtLdPromotion = true;
	setPrefFunctionAlignment(Align(16));

	verifyIntrinsicTables();

	// Default to having -disable-strictnode-mutation on
	IsStrictFPEnabled = true;
	}

	// Load-stack-guard nodes are only used when targeting 64-bit MachO; nothing
	// else has been implemented so far.
	bool X86TargetLowering::useLoadStackGuardNode() const {
	  if (!Subtarget.isTargetMachO())
	    return false;
	  return Subtarget.is64Bit();
	}

	bool X86TargetLowering::useStackGuardXorFP() const {
	  // At present only MSVC CRTs XOR the frame pointer into the stack guard
	  // value, and MachO targets are always excluded.
	  if (!Subtarget.getTargetTriple().isOSMSVCRT())
	    return false;
	  return !Subtarget.isTargetMachO();
	}

	/// Build the machine node that XORs \p Val with the frame pointer and return
	/// its first result. The node is typed as the target pointer type and uses
	/// XOR64_FP on 64-bit subtargets, XOR32_FP otherwise.
	SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
	                                               const SDLoc &DL) const {
	  const EVT PointerVT = getPointerTy(DAG.getDataLayout());

	  unsigned Opcode = X86::XOR32_FP;
	  if (Subtarget.is64Bit())
	    Opcode = X86::XOR64_FP;

	  return SDValue(DAG.getMachineNode(Opcode, DL, PointerVT, Val), 0);
	}

	/// Select how the type legalizer should treat the illegal vector type \p VT.
	TargetLoweringBase::LegalizeTypeAction
	X86TargetLowering::getPreferredVectorAction(MVT VT) const {
	  // v32i1 and v64i1 are split when AVX512 is present but BWI is not.
	  const bool IsWideMask = VT == MVT::v32i1 || VT == MVT::v64i1;
	  if (IsWideMask && Subtarget.hasAVX512() && !Subtarget.hasBWI())
	    return TypeSplitVector;

	  // Single-element vectors and vectors of i1 defer to the generic choice;
	  // everything else is widened.
	  if (VT.getVectorNumElements() == 1 ||
	      VT.getVectorElementType() == MVT::i1)
	    return TargetLoweringBase::getPreferredVectorAction(VT);

	  return TypeWidenVector;
	}

	/// Decide how a vXi1 mask with \p NumElts elements is passed under calling
	/// convention \p CC. Returns the register type and the number of registers,
	/// or {MVT::INVALID_SIMPLE_VALUE_TYPE, 0} when no special handling applies.
	static std::pair<MVT, unsigned>
	handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
	                                 const X86Subtarget &Subtarget) {
	  const bool IsRegCall = CC == CallingConv::X86_RegCall;
	  // Calling conventions that pass small masks in k registers.
	  const bool UsesKRegs = IsRegCall || CC == CallingConv::Intel_OCL_BI;

	  switch (NumElts) {
	  case 2:
	    return {MVT::v2i64, 1};
	  case 4:
	    return {MVT::v4i32, 1};
	  case 8:
	    // v8i1 goes in an xmm register unless the convention uses k registers.
	    if (!UsesKRegs)
	      return {MVT::v8i16, 1};
	    break;
	  case 16:
	    // Same rule as v8i1.
	    if (!UsesKRegs)
	      return {MVT::v16i8, 1};
	    break;
	  case 32:
	    // v32i1 passes in ymm unless we have BWI and the calling convention is
	    // regcall.
	    if (!Subtarget.hasBWI() || !IsRegCall)
	      return {MVT::v32i8, 1};
	    break;
	  case 64:
	    // With BWI and a non-regcall convention: one v64i8 when 512-bit
	    // registers are in use, otherwise split into two v32i8 halves.
	    if (Subtarget.hasBWI() && !IsRegCall) {
	      if (Subtarget.useAVX512Regs())
	        return {MVT::v64i8, 1};
	      return {MVT::v32i8, 2};
	    }
	    break;
	  default:
	    break;
	  }

	  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
	  const bool BreakIntoScalars = !isPowerOf2_32(NumElts) ||
	                                (NumElts == 64 && !Subtarget.hasBWI()) ||
	                                NumElts > 64;
	  if (BreakIntoScalars)
	    return {MVT::i8, NumElts};

	  return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
	}

	/// Register type used to pass \p VT under calling convention \p CC. With
	/// AVX512, vXi1 vectors are first offered to
	/// handleMaskRegisterForCallingConv; all other cases use the generic
	/// TargetLowering answer.
	MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
	                                                     CallingConv::ID CC,
	                                                     EVT VT) const {
	  const bool IsMaskVector =
	      VT.isVector() && VT.getVectorElementType() == MVT::i1;
	  if (IsMaskVector && Subtarget.hasAVX512()) {
	    // first = register type, second = number of registers.
	    const auto MaskInfo = handleMaskRegisterForCallingConv(
	        VT.getVectorNumElements(), CC, Subtarget);
	    if (MaskInfo.first != MVT::INVALID_SIMPLE_VALUE_TYPE)
	      return MaskInfo.first;
	  }

	  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
	}

	/// Number of registers used to pass \p VT under calling convention \p CC.
	/// With AVX512, vXi1 vectors are first offered to
	/// handleMaskRegisterForCallingConv; all other cases use the generic
	/// TargetLowering answer.
	unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
	                                                          CallingConv::ID CC,
	                                                          EVT VT) const {
	  const bool IsMaskVector =
	      VT.isVector() && VT.getVectorElementType() == MVT::i1;
	  if (IsMaskVector && Subtarget.hasAVX512()) {
	    // first = register type, second = number of registers.
	    const auto MaskInfo = handleMaskRegisterForCallingConv(
	        VT.getVectorNumElements(), CC, Subtarget);
	    if (MaskInfo.first != MVT::INVALID_SIMPLE_VALUE_TYPE)
	      return MaskInfo.second;
	  }

	  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
	}

	/// Describe how \p VT is broken into registers for calling convention \p CC,
	/// filling \p IntermediateVT, \p NumIntermediates and \p RegisterVT and
	/// returning the number of registers. Two vXi1 cases are handled here; every
	/// other type is forwarded to TargetLowering.
	unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
	    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
	    unsigned &NumIntermediates, MVT &RegisterVT) const {
	  const bool IsAVX512Mask = VT.isVector() &&
	                            VT.getVectorElementType() == MVT::i1 &&
	                            Subtarget.hasAVX512();
	  if (IsAVX512Mask) {
	    const unsigned EltCount = VT.getVectorNumElements();
	    // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
	    const bool BreakIntoScalars = !isPowerOf2_32(EltCount) ||
	                                  (EltCount == 64 && !Subtarget.hasBWI()) ||
	                                  EltCount > 64;
	    if (BreakIntoScalars) {
	      RegisterVT = MVT::i8;
	      IntermediateVT = MVT::i1;
	      NumIntermediates = EltCount;
	      return NumIntermediates;
	    }
	  }

	  // Split v64i1 vectors if we don't have v64i8 available.
	  const bool SplitV64i1 = VT == MVT::v64i1 && Subtarget.hasBWI() &&
	                          !Subtarget.useAVX512Regs() &&
	                          CC != CallingConv::X86_RegCall;
	  if (SplitV64i1) {
	    RegisterVT = MVT::v32i8;
	    IntermediateVT = MVT::v32i1;
	    NumIntermediates = 2;
	    return 2;
	  }

	  return TargetLowering::getVectorTypeBreakdownForCallingConv(
	      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
	}

	/// Result type of a SETCC whose operands have type \p VT. Scalars yield i8.
	/// Vectors yield a vXi1 mask when AVX512 will perform the compare on the
	/// legalized type, and otherwise an integer vector shaped like \p VT.
	EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
	                                          LLVMContext& Context,
	                                          EVT VT) const {
	  if (!VT.isVector())
	    return MVT::i8;

	  if (!Subtarget.hasAVX512())
	    return VT.changeVectorElementTypeToInteger();

	  const unsigned EltCount = VT.getVectorNumElements();

	  // Figure out what this type will be legalized to.
	  EVT Legalized = VT;
	  while (getTypeAction(Context, Legalized) != TypeLegal) {
	    Legalized = getTypeToTransformTo(Context, Legalized);
	  }
	  const MVT LegalSimpleVT = Legalized.getSimpleVT();

	  // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
	  bool UseMaskResult = LegalSimpleVT.is512BitVector();

	  // For narrower legal vectors with VLX, a vXi1 compare is used for 32-bit
	  // and wider elements, and for any element width once BWI is available.
	  if (!UseMaskResult && LegalSimpleVT.isVector() && Subtarget.hasVLX()) {
	    const MVT EltVT = LegalSimpleVT.getVectorElementType();
	    UseMaskResult = Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32;
	  }

	  if (UseMaskResult)
	    return EVT::getVectorVT(Context, MVT::i1, EltCount);

	  return VT.changeVectorElementTypeToInteger();
	}

	/// Helper for getByValTypeAlignment: raise \p MaxAlign to the alignment
	/// wanted by \p Ty. A 128-bit vector requests 16; arrays and structs are
	/// searched recursively through their element types. Nothing is done once
	/// \p MaxAlign has reached 16.
	static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
	  if (MaxAlign == 16)
	    return;

	  // Compute the alignment wanted by one nested type, starting from a
	  // default-constructed Align, and fold it into MaxAlign.
	  auto FoldIn = [&MaxAlign](Type *NestedTy) {
	    Align NestedAlign;
	    getMaxByValAlign(NestedTy, NestedAlign);
	    if (NestedAlign > MaxAlign)
	      MaxAlign = NestedAlign;
	  };

	  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
	    if (VecTy->getPrimitiveSizeInBits().getFixedSize() == 128)
	      MaxAlign = Align(16);
	    return;
	  }

	  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
	    FoldIn(ArrTy->getElementType());
	    return;
	  }

	  StructType *StructTy = dyn_cast<StructType>(Ty);
	  if (!StructTy)
	    return;

	  for (auto *MemberTy : StructTy->elements()) {
	    FoldIn(MemberTy);
	    // 16 is the largest value this helper produces; stop scanning.
	    if (MaxAlign == 16)
	      break;
	  }
	}

	/// Desired alignment, in the caller parameter area, of a ByVal aggregate
	/// argument of type \p Ty. On 64-bit subtargets this is the larger of 8 and
	/// the ABI alignment of the type. Otherwise it is 4, raised through
	/// getMaxByValAlign (e.g. to 16 for a 128-bit vector member) when SSE1 is
	/// available.
	unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
	                                                  const DataLayout &DL) const {
	  if (!Subtarget.is64Bit()) {
	    Align Result(4);
	    if (Subtarget.hasSSE1())
	      getMaxByValAlign(Ty, Result);
	    return Result.value();
	  }

	  // Max of 8 and alignment of type.
	  const Align ABIAlign = DL.getABITypeAlign(Ty);
	  return ABIAlign > 8 ? ABIAlign.value() : 8;
	}

	/// Choose the value type used to lower the memory operation described by
	/// \p Op.
	/// Vector and f64 types are only considered when the function does not carry
	/// the NoImplicitFloat attribute. For vector ops we check that the overall
	/// size isn't larger than our preferred vector width.
	/// NOTE(review): this comment previously stated that EVT::Other is returned
	/// when the type should be determined by generic target-independent logic,
	/// but no path in this body returns it; the fallback is MVT::i64 (64-bit
	/// subtarget and size >= 8) or MVT::i32. Confirm which is intended.
	EVT X86TargetLowering::getOptimalMemOpType(
	const MemOp &Op, const AttributeList &FuncAttributes) const {
	if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
	// Vector candidates: at least 16 bytes, and either unaligned 16-byte
	// accesses are not slow or the operation is 16-byte aligned.
	if (Op.size() >= 16 &&
	(!Subtarget.isUnalignedMem16Slow() \|\| Op.isAligned(Align(16)))) {
	// FIXME: Check if unaligned 64-byte accesses are slow.
	if (Op.size() >= 64 && Subtarget.hasAVX512() &&
	(Subtarget.getPreferVectorWidth() >= 512)) {
	return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
	}
	// FIXME: Check if unaligned 32-byte accesses are slow.
	if (Op.size() >= 32 && Subtarget.hasAVX() &&
	(Subtarget.getPreferVectorWidth() >= 256)) {
	// Although this isn't a well-supported type for AVX1, we'll let
	// legalization and shuffle lowering produce the optimal codegen. If we
	// choose an optimal type with a vector element larger than a byte,
	// getMemsetStores() may create an intermediate splat (using an integer
	// multiply) before we splat as a vector.
	return MVT::v32i8;
	}
	if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
	return MVT::v16i8;
	// TODO: Can SSE1 handle a byte vector?
	// If we have SSE1 registers we should be able to use them.
	if (Subtarget.hasSSE1() && (Subtarget.is64Bit() \|\| Subtarget.hasX87()) &&
	(Subtarget.getPreferVectorWidth() >= 128))
	return MVT::v4f32;
	// If none of the vector types above was selected, control falls through
	// to the scalar fallback at the end of the function.
	} else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) \|\| Op.isZeroMemset()) &&
	Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
	// Do not use f64 to lower memcpy if source is string constant. It's
	// better to use i32 to avoid the loads.
	// Also, do not use f64 to lower memset unless this is a memset of zeros.
	// The gymnastics of splatting a byte value into an XMM register and then
	// only using 8-byte stores (because this is a CPU with slow unaligned
	// 16-byte accesses) makes that a loser.
	return MVT::f64;
	}
	}
	// This is a compromise. If we reach here, unaligned accesses may be slow on
	// this target. However, creating smaller, aligned accesses could be even
	// slower and would certainly be a lot more code.
	if (Subtarget.is64Bit() && Op.size() >= 8)
	return MVT::i64;
	return MVT::i32;
	}

	/// Report whether \p VT may be used for memory-operation expansion. f32 and
	/// f64 are gated on X86ScalarSSEf32 / X86ScalarSSEf64 respectively; every
	/// other type is accepted.
	bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
	  switch (VT.SimpleTy) {
	  case MVT::f32:
	    return X86ScalarSSEf32;
	  case MVT::f64:
	    return X86ScalarSSEf64;
	  default:
	    return true;
	  }
	}

	/// Report whether a misaligned access of type \p VT is permitted.
	///
	/// \param VT        The accessed value type.
	/// \param Alignment The known alignment of the access, in bytes. (Named
	///                  `Alignment` rather than `Align` so it does not shadow
	///                  the `Align` type.)
	/// \param Flags     Memory-operand flags; non-temporal vector accesses are
	///                  treated specially.
	/// \param Fast      Optional out-parameter. When non-null it is set to
	///                  whether the access is expected to be fast, based on the
	///                  bit width of \p VT.
	/// The unnamed second parameter is not used.
	/// \returns true when the misaligned access is allowed.
	bool X86TargetLowering::allowsMisalignedMemoryAccesses(
	    EVT VT, unsigned, unsigned Alignment, MachineMemOperand::Flags Flags,
	    bool *Fast) const {
	  if (Fast) {
	    switch (VT.getSizeInBits()) {
	    default:
	      // 8-byte and under are always assumed to be fast.
	      *Fast = true;
	      break;
	    case 128:
	      *Fast = !Subtarget.isUnalignedMem16Slow();
	      break;
	    case 256:
	      *Fast = !Subtarget.isUnalignedMem32Slow();
	      break;
	      // TODO: What about AVX-512 (512-bit) accesses?
	    }
	  }
	  // NonTemporal vector memory ops must be aligned.
	  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
	    // NT loads can only be vector aligned, so if it's less aligned than the
	    // minimum vector size (which we can split the vector down to), we might as
	    // well use a regular unaligned vector load.
	    // We don't have any NT loads pre-SSE41.
	    if (!!(Flags & MachineMemOperand::MOLoad))
	      return (Alignment < 16 || !Subtarget.hasSSE41());
	    return false;
	  }
	  // Misaligned accesses of any size are always allowed.
	  return true;
	}

	/// Return the jump-table entry encoding for the current function, as a
	/// member of the MachineJumpTableInfo::JTEntryKind enum.
	unsigned X86TargetLowering::getJumpTableEncoding() const {
	  // Outside GOT-style PIC, defer to the normal jump table encoding
	  // heuristics.
	  if (!isPositionIndependent() || !Subtarget.isPICStyleGOT())
	    return TargetLowering::getJumpTableEncoding();

	  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
	  // symbol.
	  return MachineJumpTableInfo::EK_Custom32;
	}

	/// Forward the soft-float setting from the subtarget.
	bool X86TargetLowering::useSoftFloat() const {
	  const bool SoftFloat = Subtarget.useSoftFloat();
	  return SoftFloat;
	}

	/// Flag leading integer/pointer libcall arguments as `inreg`.
	///
	/// Applies only to 32-bit targets using the C or StdCall conventions. The
	/// register budget comes from the module's number of register parameters;
	/// an argument of up to 4 bytes uses one register, up to 8 bytes uses two.
	/// Processing stops at the first argument that no longer fits the budget.
	void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
	                                              ArgListTy &Args) const {
	  // Only relabel X86-32 for C / Stdcall CCs.
	  if (Subtarget.is64Bit())
	    return;
	  const bool IsEligibleCC =
	      CC == CallingConv::C || CC == CallingConv::X86_StdCall;
	  if (!IsEligibleCC)
	    return;

	  unsigned RegsLeft = 0;
	  if (auto *M = MF->getFunction().getParent())
	    RegsLeft = M->getNumberRegisterParameters();

	  // Mark the first N int arguments as having reg
	  for (auto &Arg : Args) {
	    Type *ArgTy = Arg.Ty;
	    if (!ArgTy->isIntOrPtrTy())
	      continue;
	    // Arguments wider than 8 bytes are left untouched.
	    if (!(MF->getDataLayout().getTypeAllocSize(ArgTy) <= 8))
	      continue;

	    const unsigned RegsNeeded =
	        (MF->getDataLayout().getTypeAllocSize(ArgTy) > 4) ? 2 : 1;
	    if (RegsLeft < RegsNeeded)
	      return;
	    RegsLeft -= RegsNeeded;
	    Arg.IsInReg = true;
	  }
	}

	/// Build the expression for one custom-encoded jump table entry: a @GOTOFF
	/// reference to the symbol of \p MBB. Only valid in GOT-style PIC mode, as
	/// the assert enforces. \p MJTI and \p uid are not used.
	const MCExpr *
	X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
	                                             const MachineBasicBlock *MBB,
	                                             unsigned uid,MCContext &Ctx) const{
	  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
	  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
	  // entries.
	  const auto *BlockSym = MBB->getSymbol();
	  return MCSymbolRefExpr::create(BlockSym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
	}

	/// Return the relocation base for the PIC jump table \p Table: the table
	/// itself on 64-bit targets, a GlobalBaseReg node otherwise.
	SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
	                                                    SelectionDAG &DAG) const {
	  if (Subtarget.is64Bit())
	    return Table;

	  // This doesn't have SDLoc associated with it, but is not really the
	  // same as a Register.
	  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
	                     getPointerTy(DAG.getDataLayout()));
	}

	/// MCExpr counterpart of getPICJumpTableRelocBase: the relocation base for
	/// PIC jump table \p JTI of \p MF.
	const MCExpr *X86TargetLowering::
	getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
	                             MCContext &Ctx) const {
	  // Without RIP-relative addressing, the reference is relative to the PIC
	  // base.
	  if (!Subtarget.isPICStyleRIPRel())
	    return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);

	  // X86-64 uses RIP relative addressing based on the jump table label.
	  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
	}

	std::pair<const TargetRegisterClass *, uint8_t>
	X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
	MVT VT) const {
	const TargetRegisterClass *RRC = nullptr;
	uint8_t Cost = 1;
	switch (VT.SimpleTy) {
	default:
	return TargetLowering::findRepresentativeClass(TRI, VT);
	case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
	RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
	break;
	case MVT::x86mmx:
	RRC = &X86::VR64RegClass;
	break;
	case MVT::f32: case MVT::f64:
	case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
	case MVT::v4f32: case MVT::v2f64:
	case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
	case MVT::v8f32: case MVT::v4f64:
	case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
	case MVT::v16f32: case MVT::v8f64:
	RRC = &X86::VR128XRegClass;
	break;
	}
	return std::make_pair(RRC, Cost);
	}

	/// Return the address space used for the TLS-slot accesses built by
	/// SegmentOffset: 257 on 64-bit targets outside the Kernel code model, 256
	/// otherwise.
	// NOTE(review): comments at the callers suggest 256 selects %gs and 257
	// selects %fs -- confirm against the X86 address-space definitions.
	unsigned X86TargetLowering::getAddressSpace() const {
	  const bool Is64BitNonKernel =
	      Subtarget.is64Bit() &&
	      getTargetMachine().getCodeModel() != CodeModel::Kernel;
	  return Is64BitNonKernel ? 257 : 256;
	}

	/// Return true when \p TargetTriple names a platform treated as having a
	/// dedicated TLS slot for the stack guard: glibc, Fuchsia, or Android at
	/// version 17 or later.
	static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
	  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
	         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
	}

	/// Build a constant pointer for the fixed slot at \p Offset in address space
	/// \p AddressSpace: the 32-bit integer \p Offset cast to a pointer-to-i8*
	/// in that address space.
	static Constant* SegmentOffset(IRBuilder<> &IRB,
	                               unsigned Offset, unsigned AddressSpace) {
	  auto *OffsetValue =
	      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset);
	  auto *SlotPtrTy =
	      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace);
	  return ConstantExpr::getIntToPtr(OffsetValue, SlotPtrTy);
	}

	/// Return the IR value locating the stack guard.
	///
	/// glibc, bionic, and Fuchsia have a special slot for the stack guard in
	/// tcbhead_t; use it instead of the usual global variable (see
	/// sysdeps/{i386,x86_64}/nptl/tls.h). Other platforms fall back to the
	/// TargetLowering implementation.
	Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
	  if (!hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
	    return TargetLowering::getIRStackGuard(IRB);

	  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
	  if (Subtarget.isTargetFuchsia())
	    return SegmentOffset(IRB, 0x10, getAddressSpace());

	  // %fs:0x28, unless we're using a Kernel code model, in which case
	  // it's %gs:0x28. gs:0x14 on i386.
	  const unsigned GuardOffset = Subtarget.is64Bit() ? 0x28 : 0x14;
	  return SegmentOffset(IRB, GuardOffset, getAddressSpace());
	}

	/// Insert the stack-protector declarations required by the target into
	/// \p M.
	///
	/// For MSVC / Windows-Itanium environments this declares the
	/// `__security_cookie` global and the `__security_check_cookie` function
	/// (fastcall, first parameter `inreg`). Platforms with a TLS stack-guard slot
	/// need no declarations. Everything else defers to TargetLowering.
	void X86TargetLowering::insertSSPDeclarations(Module &M) const {
	  // MSVC CRT provides functionalities for stack protection.
	  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
	      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
	    // MSVC CRT has a global variable holding security cookie.
	    M.getOrInsertGlobal("__security_cookie",
	                        Type::getInt8PtrTy(M.getContext()));

	    // MSVC CRT has a function to validate security cookie.
	    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
	        "__security_check_cookie", Type::getVoidTy(M.getContext()),
	        Type::getInt8PtrTy(M.getContext()));
	    // The callee may not be a plain Function; only then set its attributes.
	    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
	      F->setCallingConv(CallingConv::X86_FastCall);
	      F->addAttribute(1, Attribute::AttrKind::InReg);
	    }
	    return;
	  }
	  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
	  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
	    return;
	  TargetLowering::insertSSPDeclarations(M);
	}

	/// Return the stack-guard value used by SelectionDAG lowering: the
	/// `__security_cookie` global on MSVC / Windows-Itanium environments, the
	/// TargetLowering default elsewhere.
	Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
	  // MSVC CRT has a global variable holding security cookie.
	  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
	      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
	    return M.getGlobalVariable("__security_cookie");
	  }
	  return TargetLowering::getSDagStackGuard(M);
	}

	/// Return the function that validates the stack guard: the
	/// `__security_check_cookie` function looked up in \p M on MSVC /
	/// Windows-Itanium environments, the TargetLowering default elsewhere.
	Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
	  // MSVC CRT has a function to validate security cookie.
	  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
	      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
	    return M.getFunction("__security_check_cookie");
	  }
	  return TargetLowering::getSSPStackGuardCheck(M);
	}

	/// Return the IR value locating the SafeStack (unsafe stack) pointer.
	///
	/// Contiki uses the default location helper, Android and Fuchsia use fixed
	/// TLS slots, and all other targets defer to TargetLowering.
	Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
	  if (Subtarget.getTargetTriple().isOSContiki())
	    return getDefaultSafeStackPointerLocation(IRB, false);

	  // Android provides a fixed TLS slot for the SafeStack pointer. See the
	  // definition of TLS_SLOT_SAFESTACK in
	  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
	  if (Subtarget.isTargetAndroid()) {
	    // Slot offset is 0x48 on 64-bit targets and 0x24 on i386; the segment
	    // is chosen by getAddressSpace().
	    const unsigned SlotOffset = Subtarget.is64Bit() ? 0x48 : 0x24;
	    return SegmentOffset(IRB, SlotOffset, getAddressSpace());
	  }

	  // Fuchsia is similar.
	  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
	  if (Subtarget.isTargetFuchsia())
	    return SegmentOffset(IRB, 0x18, getAddressSpace());

	  return TargetLowering::getSafeStackPointerLocation(IRB);
	}

	/// Return true when casting a pointer from \p SrcAS to \p DestAS is a no-op:
	/// both address spaces must have the same pointer size and both must be
	/// numbered below 256. The two address spaces must differ.
	bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
	                                            unsigned DestAS) const {
	  assert(SrcAS != DestAS && "Expected different address spaces!");

	  const TargetMachine &TM = getTargetMachine();
	  const bool SamePointerSize =
	      TM.getPointerSize(SrcAS) == TM.getPointerSize(DestAS);
	  return SamePointerSize && SrcAS < 256 && DestAS < 256;
	}

	//===----------------------------------------------------------------------===//
	// Return Value Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	/// Check whether the return values described by \p Outs can be lowered under
	/// the RetCC_X86 return convention.
	bool X86TargetLowering::CanLowerReturn(
	    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
	  SmallVector<CCValAssign, 16> ReturnLocs;
	  CCState State(CallConv, isVarArg, MF, ReturnLocs, Context);
	  const bool Lowerable = State.CheckReturn(Outs, RetCC_X86);
	  return Lowerable;
	}

	/// Return the zero-terminated list of scratch registers; it holds R11 only
	/// and does not depend on the calling convention argument.
	const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
	  static const MCPhysReg ScratchList[] = {
	      X86::R11,
	      0, // terminator
	  };
	  return ScratchList;
	}

	/// Lower a mask value (v*i1) to the register type it is assigned to.
	///
	/// \param ValArg The mask value to convert.
	/// \param ValLoc The location (register) type to produce.
	/// \param Dl     Debug location for the created nodes.
	/// \returns the DAG node holding the value in register type.
	static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
	                               const SDLoc &Dl, SelectionDAG &DAG) {
	  EVT ValVT = ValArg.getValueType();

	  // A single-element mask is just its only element.
	  if (ValVT == MVT::v1i1)
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
	                       DAG.getIntPtrConstant(0, Dl));

	  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
	      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
	    // Two stage lowering might be required
	    // bitcast: v8i1 -> i8 / v16i1 -> i16
	    // anyextend: i8 -> i32 / i16 -> i32
	    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
	    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
	    if (ValLoc == MVT::i32)
	      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
	    return ValToCopy;
	  }

	  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
	      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
	    // One stage lowering is required
	    // bitcast: v32i1 -> i32 / v64i1 -> i64
	    return DAG.getBitcast(ValLoc, ValArg);
	  }

	  // Any other combination is simply any-extended to the location type.
	  return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
	}

	/// Breaks v64i1 value into two registers and adds the new node to the DAG
	static void Passv64i1ArgInRegs(
	const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
	SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
	CCValAssign &NextVA, const X86Subtarget &Subtarget) {
	assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
	assert(Subtarget.is32Bit() && "Expecting 32 bit target");
	assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
	assert(VA.isRegLoc() && NextVA.isRegLoc() &&
	"The value should reside in two registers");

	// Before splitting the value we cast it to i64
	Arg = DAG.getBitcast(MVT::i64, Arg);

	// Splitting the value into two i32 types
	SDValue Lo, Hi;
	Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
	DAG.getConstant(0, Dl, MVT::i32));
	Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
	DAG.getConstant(1, Dl, MVT::i32));

	// Attach the two i32 types into corresponding registers
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
	RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
	}

	/// Lower a function return into an X86ISD::RET_FLAG node (X86ISD::IRET for
	/// the X86_INTR convention).
	///
	/// The outgoing values in \p OutVals are assigned locations with RetCC_X86,
	/// promoted to their location types, and copied into the assigned physical
	/// registers. Values assigned to FP0/FP1 are not copied; they are passed as
	/// direct operands of the return node. If the function has an sret return
	/// register recorded in its X86MachineFunctionInfo, that value is
	/// additionally copied into RAX/EAX.
	///
	/// \returns the return node, whose operand 0 is the updated chain and whose
	/// operand 1 is the number of bytes to pop.
	SDValue
	X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	                               bool isVarArg,
	                               const SmallVectorImpl<ISD::OutputArg> &Outs,
	                               const SmallVectorImpl<SDValue> &OutVals,
	                               const SDLoc &dl, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

	  // In some cases we need to disable registers from the default CSR list.
	  // For example, when they are used for argument passing.
	  bool ShouldDisableCalleeSavedRegister =
	      CallConv == CallingConv::X86_RegCall ||
	      MF.getFunction().hasFnAttribute("no_caller_saved_registers");

	  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
	    report_fatal_error("X86 interrupts may not return any value");

	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
	  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

	  SmallVector<std::pair<Register, SDValue>, 4> RetVals;
	  // I walks the assigned locations, OutsIndex the outgoing values. They
	  // diverge when a custom (v64i1) value consumes two locations.
	  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
	       ++I, ++OutsIndex) {
	    CCValAssign &VA = RVLocs[I];
	    assert(VA.isRegLoc() && "Can only return in registers!");

	    // Add the register to the CalleeSaveDisableRegs list.
	    if (ShouldDisableCalleeSavedRegister)
	      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

	    SDValue ValToCopy = OutVals[OutsIndex];
	    EVT ValVT = ValToCopy.getValueType();

	    // Promote values to the appropriate types.
	    if (VA.getLocInfo() == CCValAssign::SExt)
	      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
	    else if (VA.getLocInfo() == CCValAssign::ZExt)
	      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
	    else if (VA.getLocInfo() == CCValAssign::AExt) {
	      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
	        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
	      else
	        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
	    }
	    else if (VA.getLocInfo() == CCValAssign::BCvt)
	      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

	    assert(VA.getLocInfo() != CCValAssign::FPExt &&
	           "Unexpected FP-extend for return value.");

	    // Report an error if we have attempted to return a value via an XMM
	    // register and SSE was disabled.
	    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
	      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
	      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	    } else if (!Subtarget.hasSSE2() &&
	               X86::FR64XRegClass.contains(VA.getLocReg()) &&
	               ValVT == MVT::f64) {
	      // When returning a double via an XMM register, report an error if SSE2 is
	      // not enabled.
	      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
	      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	    }

	    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
	    // the RET instruction and handled by the FP Stackifier.
	    if (VA.getLocReg() == X86::FP0 ||
	        VA.getLocReg() == X86::FP1) {
	      // If this is a copy from an xmm register to ST(0), use an FPExtend to
	      // change the value to the FP stack register class.
	      if (isScalarFPTypeInSSEReg(VA.getValVT()))
	        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
	      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
	      // Don't emit a copytoreg.
	      continue;
	    }

	    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
	    // which is returned in RAX / RDX.
	    if (Subtarget.is64Bit()) {
	      if (ValVT == MVT::x86mmx) {
	        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
	          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
	          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
	                                  ValToCopy);
	          // If we don't have SSE2 available, convert to v4f32 so the generated
	          // register is legal.
	          if (!Subtarget.hasSSE2())
	            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
	        }
	      }
	    }

	    if (VA.needsCustom()) {
	      assert(VA.getValVT() == MVT::v64i1 &&
	             "Currently the only custom case is when we split v64i1 to 2 regs");

	      // Consumes the next location as well, hence the ++I.
	      Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
	                         Subtarget);

	      // Add the second register to the CalleeSaveDisableRegs list.
	      if (ShouldDisableCalleeSavedRegister)
	        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
	    } else {
	      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
	    }
	  }

	  SDValue Flag;
	  SmallVector<SDValue, 6> RetOps;
	  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
	  // Operand #1 = Bytes To Pop
	  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
	                                         MVT::i32));

	  // Copy the result values into the output registers.
	  for (auto &RetVal : RetVals) {
	    if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
	      RetOps.push_back(RetVal.second);
	      continue; // Don't emit a copytoreg.
	    }

	    Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
	    Flag = Chain.getValue(1);
	    RetOps.push_back(
	        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
	  }

	  // Swift calling convention does not require we copy the sret argument
	  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

	  // All x86 ABIs require that for returning structs by value we copy
	  // the sret argument into %rax/%eax (depending on ABI) for the return.
	  // We saved the argument into a virtual register in the entry block,
	  // so now we copy the value out and into %rax/%eax.
	  //
	  // Checking Function.hasStructRetAttr() here is insufficient because the IR
	  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
	  // false, then an sret argument may be implicitly inserted in the SelDAG. In
	  // either case FuncInfo->setSRetReturnReg() will have been called.
	  if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
	    // When we have both sret and another return value, we should use the
	    // original Chain stored in RetOps[0], instead of the current Chain updated
	    // in the above loop. If we only have sret, RetOps[0] equals to Chain.

	    // For the case of sret and another return value, we have
	    // Chain_0 at the function entry
	    // Chain_1 = getCopyToReg(Chain_0) in the above loop
	    // If we use Chain_1 in getCopyFromReg, we will have
	    // Val = getCopyFromReg(Chain_1)
	    // Chain_2 = getCopyToReg(Chain_1, Val) from below

	    // getCopyToReg(Chain_0) will be glued together with
	    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
	    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
	    // Data dependency from Unit B to Unit A due to usage of Val in
	    // getCopyToReg(Chain_1, Val)
	    // Chain dependency from Unit A to Unit B

	    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
	    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
	                                     getPointerTy(MF.getDataLayout()));

	    Register RetValReg
	        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
	          X86::RAX : X86::EAX;
	    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
	    Flag = Chain.getValue(1);

	    // RAX/EAX now acts like a return value.
	    RetOps.push_back(
	        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

	    // Add the returned register to the CalleeSaveDisableRegs list.
	    if (ShouldDisableCalleeSavedRegister)
	      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
	  }

	  // Append any callee-saved registers that are preserved via copy; the list
	  // is zero-terminated and only GR64 registers are expected.
	  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const MCPhysReg *I =
	      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
	  if (I) {
	    for (; *I; ++I) {
	      if (X86::GR64RegClass.contains(*I))
	        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
	      else
	        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
	    }
	  }

	  RetOps[0] = Chain; // Update chain.

	  // Add the flag if we have it.
	  if (Flag.getNode())
	    RetOps.push_back(Flag);

	  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
	  if (CallConv == CallingConv::X86_INTR)
	    opcode = X86ISD::IRET;
	  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
	}

	/// Return true when the single result of \p N is consumed only by a return,
	/// reached through exactly one CopyToReg or FP_EXTEND user.
	///
	/// \param N     The node to inspect; it must produce exactly one value with
	///              exactly one use.
	/// \param Chain On success, set to the chain operand of the CopyToReg user.
	///              Left unchanged on failure and in the FP_EXTEND case.
	bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
	  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
	    return false;

	  SDValue TCChain = Chain;
	  // The check above guarantees exactly one user; fetch it.
	  SDNode *Copy = *N->use_begin();
	  if (Copy->getOpcode() == ISD::CopyToReg) {
	    // If the copy has a glue operand, we conservatively assume it isn't safe to
	    // perform a tail call.
	    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
	      return false;
	    TCChain = Copy->getOperand(0);
	  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
	    return false;

	  bool HasRet = false;
	  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
	       UI != UE; ++UI) {
	    if (UI->getOpcode() != X86ISD::RET_FLAG)
	      return false;
	    // If we are returning more than one value, we can definitely
	    // not make a tail call see PR19530
	    if (UI->getNumOperands() > 4)
	      return false;
	    // With exactly four operands, the last one must be glue.
	    if (UI->getNumOperands() == 4 &&
	        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
	      return false;
	    HasRet = true;
	  }

	  if (!HasRet)
	    return false;

	  Chain = TCChain;
	  return true;
	}

	/// Return the type a return value of type \p VT is extended to.
	///
	/// The minimum is the register type of i32, except that i1 (and, outside
	/// Darwin, i8 and i16) only need the register type of i8. \p VT is returned
	/// unchanged when it is already at least that wide. \p ExtendKind is not
	/// consulted.
	EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
	                                           ISD::NodeType ExtendKind) const {
	  MVT ReturnMVT = MVT::i32;

	  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
	  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
	    // The ABI does not require i1, i8 or i16 to be extended.
	    //
	    // On Darwin, there is code in the wild relying on Clang's old behaviour of
	    // always extending i8/i16 return values, so keep doing that for now.
	    // (PR26665).
	    ReturnMVT = MVT::i8;
	  }

	  EVT MinVT = getRegisterType(Context, ReturnMVT);
	  return VT.bitsLT(MinVT) ? MinVT : VT;
	}

	/// Read two 32-bit registers and assemble them into one v64i1 mask value.
	/// \param VA The location of the low 32 bits.
	/// \param NextVA The location of the high 32 bits.
	/// \param Root The chain the register reads hang off.
	/// \param [in,out] InFlag Glue value, or null. When non-null the reads use
	///                        the physical registers directly and are glued
	///                        together through it; when null the registers are
	///                        added as function live-ins and read through the
	///                        resulting virtual registers.
	/// \return the concatenated v64i1 value.
	static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
	                                SDValue &Root, SelectionDAG &DAG,
	                                const SDLoc &Dl, const X86Subtarget &Subtarget,
	                                SDValue *InFlag = nullptr) {
	  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
	  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
	  assert(VA.getValVT() == MVT::v64i1 &&
	         "Expecting first location of 64 bit width type");
	  assert(NextVA.getValVT() == VA.getValVT() &&
	         "The locations should have the same type");
	  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
	         "The values should reside in two registers");

	  SDValue LowBits, HighBits;

	  // Read a 32 bit value from each of the two registers.
	  if (InFlag) {
	    // A physical register is available: read from it and glue the reads
	    // together.
	    LowBits = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
	    *InFlag = LowBits.getValue(2);
	    HighBits =
	        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
	    *InFlag = HighBits.getValue(2);
	  } else {
	    // No physical register is present: create intermediate virtual
	    // registers as live-ins.
	    MachineFunction &MF = DAG.getMachineFunction();
	    const TargetRegisterClass *RC = &X86::GR32RegClass;

	    Register LowReg = MF.addLiveIn(VA.getLocReg(), RC);
	    LowBits = DAG.getCopyFromReg(Root, Dl, LowReg, MVT::i32);
	    Register HighReg = MF.addLiveIn(NextVA.getLocReg(), RC);
	    HighBits = DAG.getCopyFromReg(Root, Dl, HighReg, MVT::i32);
	  }

	  // Reinterpret each i32 as a v32i1 and concatenate the halves.
	  SDValue LowMask = DAG.getBitcast(MVT::v32i1, LowBits);
	  SDValue HighMask = DAG.getBitcast(MVT::v32i1, HighBits);
	  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, LowMask, HighMask);
	}

	/// Convert a register value (8/16/32/64 bits) into the mask type \p ValVT
	/// (v1i1/v8i1/v16i1/v32i1/v64i1).
	/// \param ValArg The register value.
	/// \param ValVT  The mask type to produce.
	/// \param ValLoc The register type \p ValArg lives in.
	/// \returns a DAG node holding the operand as a mask.
	static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
	                               const EVT &ValLoc, const SDLoc &Dl,
	                               SelectionDAG &DAG) {
	  if (ValVT == MVT::v1i1)
	    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValArg);

	  if (ValVT == MVT::v64i1) {
	    // In 32 bit machine, this case is handled by getv64i1Argument
	    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
	    // In 64 bit machine, there is no need to truncate the value, only bitcast
	    return DAG.getBitcast(ValVT, ValArg);
	  }

	  // Pick the integer type whose width matches the mask, truncate to it,
	  // then reinterpret the bits as the mask type.
	  MVT MaskIntVT;
	  switch (ValVT.getSimpleVT().SimpleTy) {
	  case MVT::v8i1:
	    MaskIntVT = MVT::i8;
	    break;
	  case MVT::v16i1:
	    MaskIntVT = MVT::i16;
	    break;
	  case MVT::v32i1:
	    MaskIntVT = MVT::i32;
	    break;
	  default:
	    llvm_unreachable("Expecting a vector of i1 types");
	  }

	  SDValue Truncated = DAG.getNode(ISD::TRUNCATE, Dl, MaskIntVT, ValArg);
	  return DAG.getBitcast(ValVT, Truncated);
	}

	/// Lower the result values of a call into the
	/// appropriate copies out of appropriate physical registers.
	///
	/// \param Chain   Incoming chain; the updated chain is returned.
	/// \param InFlag  Glue threading the register reads together.
	/// \param Ins     Description of the values the call returns.
	/// \param InVals  Receives one lowered value per returned value.
	/// \param RegMask Optional register mask. When non-null, the bits for every
	///                register used to return a value (and its sub-registers)
	///                are cleared.
	SDValue X86TargetLowering::LowerCallResult(
	    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
	    uint32_t *RegMask) const {

	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  // Assign locations to each value returned by this call.
	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

	  // Copy all of the result registers out of their specified physreg.
	  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
	       ++I, ++InsIndex) {
	    CCValAssign &VA = RVLocs[I];
	    EVT CopyVT = VA.getLocVT();

	    // In some calling conventions we need to remove the used registers
	    // from the register mask.
	    if (RegMask) {
	      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
	           SubRegs.isValid(); ++SubRegs)
	        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
	    }

	    // Report an error if there was an attempt to return FP values via XMM
	    // registers.
	    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
	      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
	      if (VA.getLocReg() == X86::XMM1)
	        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
	      else
	        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	    } else if (!Subtarget.hasSSE2() &&
	               X86::FR64XRegClass.contains(VA.getLocReg()) &&
	               CopyVT == MVT::f64) {
	      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
	      if (VA.getLocReg() == X86::XMM1)
	        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
	      else
	        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	    }

	    // If we prefer to use the value in xmm registers, copy it out as f80 and
	    // use a truncate to move it from fp stack reg to xmm reg.
	    bool RoundAfterCopy = false;
	    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
	        isScalarFPTypeInSSEReg(VA.getValVT())) {
	      if (!Subtarget.hasX87())
	        report_fatal_error("X87 register return with X87 disabled");
	      CopyVT = MVT::f80;
	      RoundAfterCopy = (CopyVT != VA.getLocVT());
	    }

	    SDValue Val;
	    if (VA.needsCustom()) {
	      assert(VA.getValVT() == MVT::v64i1 &&
	             "Currently the only custom case is when we split v64i1 to 2 regs");
	      // Consumes the next location as well, hence the ++I.
	      Val =
	          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
	    } else {
	      // Result 0 of the copy is the value, 1 the chain, 2 the glue.
	      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
	                  .getValue(1);
	      Val = Chain.getValue(0);
	      InFlag = Chain.getValue(2);
	    }

	    if (RoundAfterCopy)
	      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
	                        // This truncation won't change the value.
	                        DAG.getIntPtrConstant(1, dl));

	    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
	      if (VA.getValVT().isVector() &&
	          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
	           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
	        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
	        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
	      } else
	        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
	    }

	    if (VA.getLocInfo() == CCValAssign::BCvt)
	      Val = DAG.getBitcast(VA.getValVT(), Val);

	    InVals.push_back(Val);
	  }

	  return Chain;
	}

	//===----------------------------------------------------------------------===//
	// C & StdCall & Fast Calling Convention implementation
	//===----------------------------------------------------------------------===//
	// The StdCall calling convention seems to be standard for many Windows API
	// routines. It differs from the C calling convention just a little: the
	// callee should clean up the stack, not the caller. Symbols should also be
	// decorated in some fancy way :) It doesn't support any vector arguments.
	// For info on the fast calling convention see the Fast Calling Convention
	// (tail call) implementation LowerX86_32FastCCCallTo.

	/// How a struct-return (sret) pointer is passed, if at all.
	enum StructReturnType {
	  NotStructReturn,  ///< No sret argument.
	  RegStructReturn,  ///< sret argument marked inreg, or an MCU target.
	  StackStructReturn ///< Any other sret argument.
	};

	/// Determines whether a call uses struct return semantics, judged by the
	/// flags of its first outgoing argument.
	/// \param Outs  The outgoing arguments of the call.
	/// \param IsMCU Whether the target is an MCU target.
	static StructReturnType
	callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
	  if (Outs.empty())
	    return NotStructReturn;

	  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
	  if (!Flags.isSRet())
	    return NotStructReturn;
	  if (Flags.isInReg() || IsMCU)
	    return RegStructReturn;
	  return StackStructReturn;
	}

	/// Determines whether a function uses struct return semantics, judged by
	/// the flags of its first incoming argument.
	/// \param Ins   The incoming arguments of the function.
	/// \param IsMCU Whether the target is an MCU target.
	static StructReturnType
	argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
	  if (Ins.empty())
	    return NotStructReturn;

	  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
	  if (!Flags.isSRet())
	    return NotStructReturn;
	  if (Flags.isInReg() || IsMCU)
	    return RegStructReturn;
	  return StackStructReturn;
	}

	/// Make a copy of an aggregate at address specified by "Src" to address
	/// "Dst" with size and alignment information specified by the specific
	/// parameter attribute. The copy will be passed as a byval function parameter.
	///
	/// The copy is emitted as a non-volatile, always-inlined memcpy that is not
	/// a tail call, with empty pointer info for both operands.
	/// \returns the value produced by SelectionDAG::getMemcpy.
	static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
	                                         SDValue Chain, ISD::ArgFlagsTy Flags,
	                                         SelectionDAG &DAG, const SDLoc &dl) {
	  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);

	  return DAG.getMemcpy(
	      Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
	      /*isVolatile*/ false, /*AlwaysInline=*/true,
	      /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
	}

	/// Return true if \p CC is one of the calling conventions for which tail call
	/// optimization can be guaranteed.
	static bool canGuaranteeTCO(CallingConv::ID CC) {
	  switch (CC) {
	  case CallingConv::Fast:
	  case CallingConv::GHC:
	  case CallingConv::X86_RegCall:
	  case CallingConv::HiPE:
	  case CallingConv::HHVM:
	  case CallingConv::Tail:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Return true if a call using \p CC may ever be emitted as a tail call.
	static bool mayTailCallThisCC(CallingConv::ID CC) {
	  // Every convention with guaranteed TCO is a candidate.
	  if (canGuaranteeTCO(CC))
	    return true;

	  switch (CC) {
	  // C calling conventions:
	  case CallingConv::C:
	  case CallingConv::Win64:
	  case CallingConv::X86_64_SysV:
	  // Callee pop conventions:
	  case CallingConv::X86_ThisCall:
	  case CallingConv::X86_StdCall:
	  case CallingConv::X86_VectorCall:
	  case CallingConv::X86_FastCall:
	  // Swift:
	  case CallingConv::Swift:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Return true if the function is turned into a tail call target by changing
	/// its ABI. This is always the case for CallingConv::Tail. For any other
	/// convention it requires \p GuaranteedTailCallOpt and a convention accepted
	/// by canGuaranteeTCO.
	static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
	  if (CC == CallingConv::Tail)
	    return true;
	  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
	}

	bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
	if (!CI->isTailCall())
	return false;

	CallingConv::ID CalleeCC = CI->getCallingConv();
	if (!mayTailCallThisCC(CalleeCC))
	return false;

	return true;
	}

	/// Lower the incoming argument Ins[i], which \p VA places in memory, into an
	/// SDValue. A byval argument yields the frame index of a fixed stack object.
	/// Any other argument yields a load from a fixed stack object. When an
	/// i1-based value was extended in memory, the loaded value is converted back
	/// to the value type with TRUNCATE or SCALAR_TO_VECTOR.
	SDValue
	X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	const SDLoc &dl, SelectionDAG &DAG,
	const CCValAssign &VA,
	MachineFrameInfo &MFI, unsigned i) const {
	// Create the nodes corresponding to a load from this parameter slot.
	ISD::ArgFlagsTy Flags = Ins[i].Flags;
	bool AlwaysUseMutable = shouldGuaranteeTCO(
	CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
	// The stack object is immutable only when TCO is not guaranteed and the
	// argument is not byval.
	bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
	EVT ValVT;
	MVT PtrVT = getPointerTy(DAG.getDataLayout());

	// If value is passed by pointer we have address passed instead of the value
	// itself. No need to extend if the mask value and location share the same
	// absolute size.
	bool ExtendedInMem =
	VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
	VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

	// Load with the location type for indirect or extended-in-memory values,
	// and with the value type otherwise.
	if (VA.getLocInfo() == CCValAssign::Indirect \|\| ExtendedInMem)
	ValVT = VA.getLocVT();
	else
	ValVT = VA.getValVT();

	// FIXME: For now, all byval parameter objects are marked mutable. This can be
	// changed with more analysis.
	// In case of tail call optimization mark all arguments mutable. Since they
	// could be overwritten by lowering of arguments in case of a tail call.
	if (Flags.isByVal()) {
	unsigned Bytes = Flags.getByValSize();
	if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.

	// FIXME: For now, all byval parameter objects are marked as aliasing. This
	// can be improved with deeper analysis.
	int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
	/isAliased=/true);
	return DAG.getFrameIndex(FI, PtrVT);
	}

	+ EVT ArgVT = Ins[i].ArgVT;
	+
	+ // If this is a vector that has been split into multiple parts, and the
	+ // scalar size of the parts don't match the vector element size, then we can't
	+ // elide the copy. The parts will have padding between them instead of being
	+ // packed like a vector.
	+ bool ScalarizedAndExtendedVector =
	+ ArgVT.isVector() && !VA.getLocVT().isVector() &&
	+ VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
	+
	// This is an argument in memory. We might be able to perform copy elision.
	// If the argument is passed directly in memory without any extension, then we
	// can perform copy elision. Large vector types, for example, may be passed
	// indirectly by pointer.
	if (Flags.isCopyElisionCandidate() &&
	- VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
	- EVT ArgVT = Ins[i].ArgVT;
	+ VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
	+ !ScalarizedAndExtendedVector) {
	SDValue PartAddr;
	if (Ins[i].PartOffset == 0) {
	// If this is a one-part value or the first part of a multi-part value,
	// create a stack object for the entire argument value type and return a
	// load from our portion of it. This assumes that if the first part of an
	// argument is in memory, the rest will also be in memory.
	int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
	/IsImmutable=/false);
	PartAddr = DAG.getFrameIndex(FI, PtrVT);
	return DAG.getLoad(
	ValVT, dl, Chain, PartAddr,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	} else {
	// This is not the first piece of an argument in memory. See if there is
	// already a fixed stack object including this offset. If so, assume it
	// was created by the PartOffset == 0 branch above and create a load from
	// the appropriate offset into it.
	int64_t PartBegin = VA.getLocMemOffset();
	int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
	int FI = MFI.getObjectIndexBegin();
	// Scan the fixed objects for one whose byte range covers this part.
	for (; MFI.isFixedObjectIndex(FI); ++FI) {
	int64_t ObjBegin = MFI.getObjectOffset(FI);
	int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
	if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
	break;
	}
	// FI is still a fixed object index only if the scan above found a match.
	if (MFI.isFixedObjectIndex(FI)) {
	SDValue Addr =
	DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
	DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
	return DAG.getLoad(
	ValVT, dl, Chain, Addr,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
	Ins[i].PartOffset));
	}
	}
	}

	// No copy elision: create a dedicated fixed object for this value and load
	// from it.
	int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
	VA.getLocMemOffset(), isImmutable);

	// Set SExt or ZExt flag.
	if (VA.getLocInfo() == CCValAssign::ZExt) {
	MFI.setObjectZExt(FI, true);
	} else if (VA.getLocInfo() == CCValAssign::SExt) {
	MFI.setObjectSExt(FI, true);
	}

	SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	SDValue Val = DAG.getLoad(
	ValVT, dl, Chain, FIN,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	// A value that was extended in memory is converted back to its value type.
	return ExtendedInMem
	? (VA.getValVT().isVector()
	? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
	: DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
	: Val;
	}

	// FIXME: Get this from tablegen.
	/// Return the general purpose registers used to pass arguments for
	/// \p CallConv on a 64-bit subtarget: RCX, RDX, R8, R9 for Win64 calling
	/// conventions, and RDI, RSI, RDX, RCX, R8, R9 otherwise.
	static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
	                                                const X86Subtarget &Subtarget) {
	  assert(Subtarget.is64Bit());

	  static const MCPhysReg GPR64ArgRegsWin64[] = {
	      X86::RCX, X86::RDX, X86::R8, X86::R9
	  };
	  static const MCPhysReg GPR64ArgRegs64Bit[] = {
	      X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
	  };

	  const bool UsesWin64Regs = Subtarget.isCallingConvWin64(CallConv);
	  if (UsesWin64Regs)
	    return makeArrayRef(std::begin(GPR64ArgRegsWin64),
	                        std::end(GPR64ArgRegsWin64));
	  return makeArrayRef(std::begin(GPR64ArgRegs64Bit),
	                      std::end(GPR64ArgRegs64Bit));
	}

	// FIXME: Get this from tablegen.
	/// Return the XMM registers used to pass arguments for \p CallConv on a
	/// 64-bit subtarget. The list is empty for Win64 calling conventions and
	/// whenever SSE is unavailable: soft-float, a noimplicitfloat function, or
	/// no SSE1.
	static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
	                                                CallingConv::ID CallConv,
	                                                const X86Subtarget &Subtarget) {
	  assert(Subtarget.is64Bit());

	  // The XMM registers which might contain var arg parameters are shadowed
	  // in their paired GPR. So we only need to save the GPR to their home
	  // slots.
	  // TODO: __vectorcall will change this.
	  if (Subtarget.isCallingConvWin64(CallConv))
	    return None;

	  const bool NoImplicitFloat =
	      MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
	  const bool SoftFloat = Subtarget.useSoftFloat();
	  assert(!(SoftFloat && NoImplicitFloat) &&
	         "SSE register cannot be used when SSE is disabled!");

	  // Kernel mode asks for SSE to be disabled, so there are no XMM argument
	  // registers.
	  const bool SSEUnavailable =
	      SoftFloat || NoImplicitFloat || !Subtarget.hasSSE1();
	  if (SSEUnavailable)
	    return None;

	  static const MCPhysReg XMMArgRegs64Bit[] = {
	      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
	      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
	  };
	  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
	}

	#ifndef NDEBUG
	/// Debug-only helper: return true if \p ArgLocs is ordered by non-decreasing
	/// value number.
	static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
	  auto ValNoLess = [](const CCValAssign &LHS, const CCValAssign &RHS) -> bool {
	    return LHS.getValNo() < RHS.getValNo();
	  };
	  return llvm::is_sorted(ArgLocs, ValNoLess);
	}
	#endif

	namespace {
	/// This is a helper class for lowering variable arguments parameters.
	///
	/// It bundles the state needed while lowering the formal arguments of a
	/// vararg function. Most members are references captured at construction
	/// time, so the helper must not outlive the objects passed to its
	/// constructor.
	class VarArgsLoweringHelper {
	public:
	// Captures the lowering context. The machine function, IR function, frame
	// info, frame lowering and target lowering are all derived from \p DAG and
	// \p Subtarget.
	VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
	SelectionDAG &DAG, const X86Subtarget &Subtarget,
	CallingConv::ID CallConv, CCState &CCInfo)
	: FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
	TheMachineFunction(DAG.getMachineFunction()),
	TheFunction(TheMachineFunction.getFunction()),
	FrameInfo(TheMachineFunction.getFrameInfo()),
	FrameLowering(*Subtarget.getFrameLowering()),
	TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
	CCInfo(CCInfo) {}

	// Lower variable arguments parameters.
	void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);

	private:
	// Creates the vararg frame index and, on 64-bit targets, stores the
	// remaining argument registers to the register save area.
	void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);

	// Copies the registers that must be forwarded to a musttail call into
	// virtual registers.
	void forwardMustTailParameters(SDValue &Chain);

	// Convenience queries on the subtarget / calling convention.
	bool is64Bit() { return Subtarget.is64Bit(); }
	bool isWin64() { return Subtarget.isCallingConvWin64(CallConv); }

	// Per-function X86 state that receives the computed frame indices/offsets.
	X86MachineFunctionInfo *FuncInfo;
	// Location attached to every node created by the helper.
	const SDLoc &DL;
	SelectionDAG &DAG;
	const X86Subtarget &Subtarget;
	MachineFunction &TheMachineFunction;
	const Function &TheFunction;
	MachineFrameInfo &FrameInfo;
	const TargetFrameLowering &FrameLowering;
	const TargetLowering &TargLowering;
	// Calling convention of the function being lowered.
	CallingConv::ID CallConv;
	// Calling-convention state after the fixed arguments have been analyzed;
	// used to find the first unallocated argument registers.
	CCState &CCInfo;
	};
	} // namespace

	/// Create the frame index that marks the start of the variable arguments
	/// and, on 64-bit targets, store the argument registers not taken by fixed
	/// parameters into the register save area.
	///
	/// \p StackSize is the stack offset at which the vararg frame object is
	/// created. \p Chain is replaced by a TokenFactor of the emitted stores when
	/// at least one store is created.
	void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
	SDValue &Chain, unsigned StackSize) {
	// If the function takes variable number of arguments, make a frame index for
	// the start of the first vararg value... for expansion of llvm.va_start. We
	// can skip this if there are no va_start calls.
	if (is64Bit() \|\| (CallConv != CallingConv::X86_FastCall &&
	CallConv != CallingConv::X86_ThisCall)) {
	FuncInfo->setVarArgsFrameIndex(
	FrameInfo.CreateFixedObject(1, StackSize, true));
	}

	// Figure out if XMM registers are in use.
	assert(!(Subtarget.useSoftFloat() &&
	TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) &&
	"SSE register cannot be used when SSE is disabled!");

	// 64-bit calling conventions support varargs and register parameters, so we
	// have to do extra work to spill them in the prologue.
	if (is64Bit()) {
	// Find the first unallocated argument registers.
	ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
	ArrayRef<MCPhysReg> ArgXMMs =
	get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
	unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
	unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);

	assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
	"SSE register cannot be used when SSE is disabled!");

	if (isWin64()) {
	// Get to the caller-allocated home save location. Add 8 to account
	// for the return address.
	int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
	FuncInfo->setRegSaveFrameIndex(
	FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
	// Fixup to set vararg frame on shadow area (4 x i64).
	if (NumIntRegs < 4)
	FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
	} else {
	// For X86-64, if there are vararg parameters that are passed via
	// registers, then we must store them to their spots on the stack so
	// they may be loaded by dereferencing the result of va_next.
	FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
	FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
	// The save area holds 8 bytes per GPR followed by 16 bytes per XMM
	// register, aligned to 16 bytes.
	FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
	ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
	}

	SmallVector<SDValue, 6>
	LiveGPRs; // list of SDValue for GPR registers keeping live input value
	SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
	// keeping live input value
	SDValue ALVal; // if applicable keeps SDValue for %al register

	// Gather all the live in physical registers.
	// Only the registers from the first unallocated one onward are taken.
	for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
	Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
	LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
	}
	const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
	if (!AvailableXmms.empty()) {
	// AL is read as a live-in and passed along with the XMM save operands.
	Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
	ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
	for (MCPhysReg Reg : AvailableXmms) {
	Register XMMReg = TheMachineFunction.addLiveIn(Reg, &X86::VR128RegClass);
	LiveXMMRegs.push_back(
	DAG.getCopyFromReg(Chain, DL, XMMReg, MVT::v4f32));
	}
	}

	// Store the integer parameter registers.
	SmallVector<SDValue, 8> MemOps;
	SDValue RSFIN =
	DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
	TargLowering.getPointerTy(DAG.getDataLayout()));
	unsigned Offset = FuncInfo->getVarArgsGPOffset();
	for (SDValue Val : LiveGPRs) {
	SDValue FIN = DAG.getNode(ISD::ADD, DL,
	TargLowering.getPointerTy(DAG.getDataLayout()),
	RSFIN, DAG.getIntPtrConstant(Offset, DL));
	// Each store is chained on the output chain of its own register copy.
	SDValue Store =
	DAG.getStore(Val.getValue(1), DL, Val, FIN,
	MachinePointerInfo::getFixedStack(
	DAG.getMachineFunction(),
	FuncInfo->getRegSaveFrameIndex(), Offset));
	MemOps.push_back(Store);
	Offset += 8;
	}

	// Now store the XMM (fp + vector) parameter registers.
	if (!LiveXMMRegs.empty()) {
	// Operands: chain, AL value, save-area frame index, FP offset, then the
	// live XMM register values.
	SmallVector<SDValue, 12> SaveXMMOps;
	SaveXMMOps.push_back(Chain);
	SaveXMMOps.push_back(ALVal);
	SaveXMMOps.push_back(
	DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), DL));
	SaveXMMOps.push_back(
	DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), DL));
	SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
	LiveXMMRegs.end());
	MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
	MVT::Other, SaveXMMOps));
	}

	// Merge all stores into a single chain result.
	if (!MemOps.empty())
	Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
	}
	}

	/// Compute the registers that must be forwarded to a musttail call and copy
	/// each of them from its physical register into a fresh virtual register,
	/// threading the copies through \p Chain.
	void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
	  // Find the largest legal vector type.
	  // FIXME: Only some x86_32 calling conventions support AVX512.
	  const bool CCAllowsAVX512 = is64Bit() ||
	                              CallConv == CallingConv::X86_VectorCall ||
	                              CallConv == CallingConv::Intel_OCL_BI;
	  MVT WidestVecVT = MVT::Other;
	  if (Subtarget.useAVX512Regs() && CCAllowsAVX512) {
	    WidestVecVT = MVT::v16f32;
	  } else if (Subtarget.hasAVX()) {
	    WidestVecVT = MVT::v8f32;
	  } else if (Subtarget.hasSSE2()) {
	    WidestVecVT = MVT::v4f32;
	  }

	  // We forward some GPRs and some vector types.
	  MVT GPRVT = MVT::i32;
	  if (is64Bit())
	    GPRVT = MVT::i64;
	  SmallVector<MVT, 2> ForwardedTypes;
	  ForwardedTypes.push_back(GPRVT);
	  if (WidestVecVT != MVT::Other)
	    ForwardedTypes.push_back(WidestVecVT);

	  // Compute the set of forwarded registers. The rest are scratch.
	  SmallVectorImpl<ForwardedRegister> &ForwardedRegs =
	      FuncInfo->getForwardedMustTailRegParms();
	  CCInfo.analyzeMustTailForwardedRegisters(ForwardedRegs, ForwardedTypes,
	                                           CC_X86);

	  // Forward AL for SysV x86_64 targets, since it is used for varargs.
	  const bool IsSysV64 = is64Bit() && !isWin64();
	  if (IsSysV64 && !CCInfo.isAllocated(X86::AL)) {
	    Register ALLiveIn =
	        TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
	    ForwardedRegs.push_back(ForwardedRegister(ALLiveIn, X86::AL, MVT::i8));
	  }

	  // Copy all forwards from physical to virtual registers.
	  for (ForwardedRegister &Fwd : ForwardedRegs) {
	    // FIXME: Can we use a less constrained schedule?
	    SDValue LiveInVal = DAG.getCopyFromReg(Chain, DL, Fwd.VReg, Fwd.VT);
	    Fwd.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
	        TargLowering.getRegClassFor(Fwd.VT));
	    Chain = DAG.getCopyToReg(Chain, DL, Fwd.VReg, LiveInVal);
	  }
	}

	/// Entry point of the helper. Resets both vararg frame indices to the
	/// "unset" marker. It then creates the vararg area if the function uses
	/// va_start, and forwards register parameters if the function contains a
	/// musttail call.
	void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
	                                                   unsigned StackSize) {
	  // 0xAAAAAAA marks a frame index as unset. If necessary, it is replaced by
	  // the correct value later.
	  constexpr int UnsetFrameIndex = 0xAAAAAAA;
	  FuncInfo->setVarArgsFrameIndex(UnsetFrameIndex);
	  FuncInfo->setRegSaveFrameIndex(UnsetFrameIndex);

	  if (FrameInfo.hasVAStart()) {
	    createVarArgAreaAndStoreRegisters(Chain, StackSize);
	  }

	  if (FrameInfo.hasMustTailInVarArgFunc()) {
	    forwardMustTailParameters(Chain);
	  }
	}

	/// Lower the incoming (formal) arguments of the current function.
	///
	/// Assigns a location to every entry of \p Ins using CC_X86, materializes
	/// each argument from its register or stack slot, and appends one value per
	/// entry of \p Ins to \p InVals. It also records per-function state in
	/// X86MachineFunctionInfo: the sret return register, the vararg areas (when
	/// \p IsVarArg), the number of bytes to pop on return and the argument stack
	/// size.
	///
	/// \returns the updated chain.
	SDValue X86TargetLowering::LowerFormalArguments(
	    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

	  // An externally visible "main" on Cygwin/MinGW targets always gets a frame
	  // pointer.
	  const Function &F = MF.getFunction();
	  if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
	      F.getName() == "main")
	    FuncInfo->setForceFramePointer(true);

	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  bool Is64Bit = Subtarget.is64Bit();
	  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);

	  assert(
	      !(IsVarArg && canGuaranteeTCO(CallConv)) &&
	      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");

	  // Assign locations to all of the incoming arguments.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());

	  // Allocate shadow area for Win64.
	  if (IsWin64)
	    CCInfo.AllocateStack(32, Align(8));

	  CCInfo.AnalyzeArguments(Ins, CC_X86);

	  // In vectorcall calling convention a second pass is required for the HVA
	  // types.
	  if (CallingConv::X86_VectorCall == CallConv) {
	    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
	  }

	  // The next loop assumes that the locations are in the same order of the
	  // input arguments.
	  assert(isSortedByValueNo(ArgLocs) &&
	         "Argument Location list must be sorted before lowering");

	  // I walks ArgLocs while InsIndex walks Ins. They diverge when a single
	  // argument occupies two locations (the split v64i1 case below).
	  SDValue ArgValue;
	  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
	       ++I, ++InsIndex) {
	    assert(InsIndex < Ins.size() && "Invalid Ins index");
	    CCValAssign &VA = ArgLocs[I];

	    if (VA.isRegLoc()) {
	      EVT RegVT = VA.getLocVT();
	      if (VA.needsCustom()) {
	        assert(
	            VA.getValVT() == MVT::v64i1 &&
	            "Currently the only custom case is when we split v64i1 to 2 regs");

	        // v64i1 values, in regcall calling convention, that are
	        // compiled to 32 bit arch, are split up into two registers.
	        ArgValue =
	            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
	      } else {
	        // Pick the register class matching the location type.
	        const TargetRegisterClass *RC;
	        if (RegVT == MVT::i8)
	          RC = &X86::GR8RegClass;
	        else if (RegVT == MVT::i16)
	          RC = &X86::GR16RegClass;
	        else if (RegVT == MVT::i32)
	          RC = &X86::GR32RegClass;
	        else if (Is64Bit && RegVT == MVT::i64)
	          RC = &X86::GR64RegClass;
	        else if (RegVT == MVT::f32)
	          RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
	        else if (RegVT == MVT::f64)
	          RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
	        else if (RegVT == MVT::f80)
	          RC = &X86::RFP80RegClass;
	        else if (RegVT == MVT::f128)
	          RC = &X86::VR128RegClass;
	        else if (RegVT.is512BitVector())
	          RC = &X86::VR512RegClass;
	        else if (RegVT.is256BitVector())
	          RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
	        else if (RegVT.is128BitVector())
	          RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
	        else if (RegVT == MVT::x86mmx)
	          RC = &X86::VR64RegClass;
	        else if (RegVT == MVT::v1i1)
	          RC = &X86::VK1RegClass;
	        else if (RegVT == MVT::v8i1)
	          RC = &X86::VK8RegClass;
	        else if (RegVT == MVT::v16i1)
	          RC = &X86::VK16RegClass;
	        else if (RegVT == MVT::v32i1)
	          RC = &X86::VK32RegClass;
	        else if (RegVT == MVT::v64i1)
	          RC = &X86::VK64RegClass;
	        else
	          llvm_unreachable("Unknown argument type!");

	        Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
	        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
	      }

	      // If this is an 8 or 16-bit value, it is really passed promoted to 32
	      // bits. Insert an assert[sz]ext to capture this, then truncate to the
	      // right size.
	      if (VA.getLocInfo() == CCValAssign::SExt)
	        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
	                               DAG.getValueType(VA.getValVT()));
	      else if (VA.getLocInfo() == CCValAssign::ZExt)
	        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
	                               DAG.getValueType(VA.getValVT()));
	      else if (VA.getLocInfo() == CCValAssign::BCvt)
	        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);

	      if (VA.isExtInLoc()) {
	        // Handle MMX values passed in XMM regs.
	        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
	          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
	        else if (VA.getValVT().isVector() &&
	                 VA.getValVT().getScalarType() == MVT::i1 &&
	                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
	                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
	          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
	          ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
	        } else
	          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
	      }
	    } else {
	      assert(VA.isMemLoc());
	      ArgValue =
	          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
	    }

	    // If value is passed via pointer - do a load.
	    // The flags are read through InsIndex, not I. After a split v64i1
	    // argument, I runs ahead of InsIndex, so Ins[I] would name a later
	    // argument or lie past the end of Ins.
	    if (VA.getLocInfo() == CCValAssign::Indirect &&
	        !Ins[InsIndex].Flags.isByVal())
	      ArgValue =
	          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

	    InVals.push_back(ArgValue);
	  }

	  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
	    // Swift calling convention does not require we copy the sret argument
	    // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
	    if (CallConv == CallingConv::Swift)
	      continue;

	    // All x86 ABIs require that for returning structs by value we copy the
	    // sret argument into %rax/%eax (depending on ABI) for the return. Save
	    // the argument into a virtual register so that we can access it from the
	    // return points.
	    if (Ins[I].Flags.isSRet()) {
	      Register Reg = FuncInfo->getSRetReturnReg();
	      if (!Reg) {
	        MVT PtrTy = getPointerTy(DAG.getDataLayout());
	        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
	        FuncInfo->setSRetReturnReg(Reg);
	      }
	      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
	      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
	      break;
	    }
	  }

	  unsigned StackSize = CCInfo.getNextStackOffset();
	  // Align stack specially for tail calls.
	  if (shouldGuaranteeTCO(CallConv,
	                         MF.getTarget().Options.GuaranteedTailCallOpt))
	    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

	  if (IsVarArg)
	    VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
	        .lowerVarArgsParameters(Chain, StackSize);

	  // Some CCs need callee pop.
	  if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
	                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
	    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
	  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
	    // X86 interrupts must pop the error code (and the alignment padding) if
	    // present.
	    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
	  } else {
	    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
	    // If this is an sret function, the return should pop the hidden pointer.
	    if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
	        !Subtarget.getTargetTriple().isOSMSVCRT() &&
	        argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
	      FuncInfo->setBytesToPopOnReturn(4);
	  }

	  if (!Is64Bit) {
	    // RegSaveFrameIndex is X86-64 only.
	    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
	  }

	  FuncInfo->setArgumentStackSize(StackSize);

	  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
	    EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
	    if (Personality == EHPersonality::CoreCLR) {
	      assert(Is64Bit);
	      // TODO: Add a mechanism to frame lowering that will allow us to indicate
	      // that we'd prefer this slot be allocated towards the bottom of the frame
	      // (i.e. near the stack pointer after allocating the frame). Every
	      // funclet needs a copy of this slot in its (mostly empty) frame, and the
	      // offset from the bottom of this and each funclet's frame must be the
	      // same, so the size of funclets' (mostly empty) frames is dictated by
	      // how far this slot is from the bottom (since they allocate just enough
	      // space to accommodate holding this slot at the correct offset).
	      int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSS=*/false);
	      EHInfo->PSPSymFrameIdx = PSPSymFI;
	    }
	  }

	  // For regcall functions and functions marked no_caller_saved_registers,
	  // the live-in registers are removed from the callee-saved set.
	  if (CallConv == CallingConv::X86_RegCall ||
	      F.hasFnAttribute("no_caller_saved_registers")) {
	    MachineRegisterInfo &MRI = MF.getRegInfo();
	    for (std::pair<Register, Register> Pair : MRI.liveins())
	      MRI.disableCalleeSavedRegister(Pair.first);
	  }

	  return Chain;
	}

	/// Emit the node that places outgoing argument \p Arg into its stack slot,
	/// located at \p StackPtr plus the memory offset recorded in \p VA. Byval
	/// arguments are copied with a memcpy; everything else is written with a
	/// plain store.
	SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
	                                            SDValue Arg, const SDLoc &dl,
	                                            SelectionDAG &DAG,
	                                            const CCValAssign &VA,
	                                            ISD::ArgFlagsTy Flags,
	                                            bool isByVal) const {
	  const unsigned SlotOffset = VA.getLocMemOffset();
	  SDValue OffsetNode = DAG.getIntPtrConstant(SlotOffset, dl);
	  SDValue SlotAddr = DAG.getNode(
	      ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, OffsetNode);

	  if (!isByVal)
	    return DAG.getStore(
	        Chain, dl, Arg, SlotAddr,
	        MachinePointerInfo::getStack(DAG.getMachineFunction(), SlotOffset));

	  return CreateCopyOfByValArgument(Arg, SlotAddr, Chain, Flags, DAG, dl);
	}

	/// Emit a load of the current ("old") return address from its frame slot.
	/// The loaded value is stored in \p OutRetAddr and the output chain of the
	/// load is returned. Callers use this when tail call optimization is
	/// performed and the return address needs to be relocated.
	SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
	    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
	    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // Address of the return address stack slot.
	  SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);

	  // Load the "old" Return address.
	  OutRetAddr =
	      DAG.getLoad(PtrVT, dl, Chain, RetAddrSlot, MachinePointerInfo());

	  // Result #1 of the load is its chain.
	  return OutRetAddr.getValue(1);
	}

	/// Emit a store of the return address \p RetAddrFrIdx into a new fixed stack
	/// slot at offset FPDiff - SlotSize and return the store's chain. This is a
	/// no-op returning \p Chain unchanged when \p FPDiff is zero.
	static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
	                                        SDValue Chain, SDValue RetAddrFrIdx,
	                                        EVT PtrVT, unsigned SlotSize,
	                                        int FPDiff, const SDLoc &dl) {
	  // The return address only has to move when the frame size changes.
	  if (FPDiff == 0)
	    return Chain;

	  // Calculate the new stack slot for the return address.
	  const int64_t NewSlotOffset = static_cast<int64_t>(FPDiff) - SlotSize;
	  const int NewSlotFI =
	      MF.getFrameInfo().CreateFixedObject(SlotSize, NewSlotOffset, false);
	  SDValue NewSlotAddr = DAG.getFrameIndex(NewSlotFI, PtrVT);

	  // Store the return address to the appropriate stack slot.
	  return DAG.getStore(
	      Chain, dl, RetAddrFrIdx, NewSlotAddr,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewSlotFI));
	}

	/// Build the vector shuffle used for a movs{s|d} / movd style operation on
	/// type \p VT. Lane 0 of the result is taken from lane 0 of \p V2 (mask
	/// index NumElems), and every other lane keeps the matching lane of \p V1.
	static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
	                       SDValue V2) {
	  const unsigned LaneCount = VT.getVectorNumElements();

	  SmallVector<int, 8> ShuffleMask;
	  ShuffleMask.push_back(LaneCount);
	  unsigned Lane = 1;
	  while (Lane != LaneCount) {
	    ShuffleMask.push_back(Lane);
	    ++Lane;
	  }

	  return DAG.getVectorShuffle(VT, dl, V1, V2, ShuffleMask);
	}

	SDValue
	X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const {
	SelectionDAG &DAG = CLI.DAG;
	SDLoc &dl = CLI.DL;
	SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
	SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
	SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
	SDValue Chain = CLI.Chain;
	SDValue Callee = CLI.Callee;
	CallingConv::ID CallConv = CLI.CallConv;
	bool &isTailCall = CLI.IsTailCall;
	bool isVarArg = CLI.IsVarArg;

	MachineFunction &MF = DAG.getMachineFunction();
	bool Is64Bit = Subtarget.is64Bit();
	bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
	StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
	bool IsSibcall = false;
	bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt \|\|
	CallConv == CallingConv::Tail;
	X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
	const auto *CI = dyn_cast_or_null<CallInst>(CLI.CB);
	const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
	bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) \|\|
	(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
	const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
	bool HasNoCfCheck =
	(CI && CI->doesNoCfCheck()) \|\| (II && II->doesNoCfCheck());
	const Module *M = MF.getMMI().getModule();
	Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");

	MachineFunction::CallSiteInfo CSInfo;
	if (CallConv == CallingConv::X86_INTR)
	report_fatal_error("X86 interrupts may not be called directly");

	if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
	// If we are using a GOT, disable tail calls to external symbols with
	// default visibility. Tail calling such a symbol requires using a GOT
	// relocation, which forces early binding of the symbol. This breaks code
	// that require lazy function symbol resolution. Using musttail or
	// GuaranteedTailCallOpt will override this.
	GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
	if (!G \|\| (!G->getGlobal()->hasLocalLinkage() &&
	G->getGlobal()->hasDefaultVisibility()))
	isTailCall = false;
	}

	bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
	if (IsMustTail) {
	// Force this to be a tail call. The verifier rules are enough to ensure
	// that we can lower this successfully without moving the return address
	// around.
	isTailCall = true;
	} else if (isTailCall) {
	// Check if it's really possible to do a tail call.
	isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
	isVarArg, SR != NotStructReturn,
	MF.getFunction().hasStructRetAttr(), CLI.RetTy,
	Outs, OutVals, Ins, DAG);

	// Sibcalls are automatically detected tailcalls which do not require
	// ABI changes.
	if (!IsGuaranteeTCO && isTailCall)
	IsSibcall = true;

	if (isTailCall)
	++NumTailCalls;
	}

	assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
	"Var args not supported with calling convention fastcc, ghc or hipe");

	// Analyze operands of the call, assigning locations to each operand.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

	// Allocate shadow area for Win64.
	if (IsWin64)
	CCInfo.AllocateStack(32, Align(8));

	CCInfo.AnalyzeArguments(Outs, CC_X86);

	// In vectorcall calling convention a second pass is required for the HVA
	// types.
	if (CallingConv::X86_VectorCall == CallConv) {
	CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
	}

	// Get a count of how many bytes are to be pushed on the stack.
	unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
	if (IsSibcall)
	// This is a sibcall. The memory operands are available in caller's
	// own caller's stack.
	NumBytes = 0;
	else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
	NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

	int FPDiff = 0;
	if (isTailCall && !IsSibcall && !IsMustTail) {
	// Lower arguments at fp - stackoffset + fpdiff.
	unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

	FPDiff = NumBytesCallerPushed - NumBytes;

	// Set the delta of movement of the returnaddr stackslot.
	// But only set if delta is greater than previous delta.
	if (FPDiff < X86Info->getTCReturnAddrDelta())
	X86Info->setTCReturnAddrDelta(FPDiff);
	}

	unsigned NumBytesToPush = NumBytes;
	unsigned NumBytesToPop = NumBytes;

	// If we have an inalloca argument, all stack space has already been allocated
	// for us and be right at the top of the stack. We don't support multiple
	// arguments passed in memory when using inalloca.
	if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
	NumBytesToPush = 0;
	if (!ArgLocs.back().isMemLoc())
	report_fatal_error("cannot use inalloca attribute on a register "
	"parameter");
	if (ArgLocs.back().getLocMemOffset() != 0)
	report_fatal_error("any parameter with the inalloca attribute must be "
	"the only memory argument");
	} else if (CLI.IsPreallocated) {
	assert(ArgLocs.back().isMemLoc() &&
	"cannot use preallocated attribute on a register "
	"parameter");
	SmallVector<size_t, 4> PreallocatedOffsets;
	for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
	if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
	PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
	}
	}
	auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
	size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
	MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
	MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
	NumBytesToPush = 0;
	}

	if (!IsSibcall && !IsMustTail)
	Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
	NumBytes - NumBytesToPush, dl);

	SDValue RetAddrFrIdx;
	// Load return address for tail calls.
	if (isTailCall && FPDiff)
	Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
	Is64Bit, FPDiff, dl);

	SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
	SmallVector<SDValue, 8> MemOpChains;
	SDValue StackPtr;

	// The next loop assumes that the locations are in the same order of the
	// input arguments.
	assert(isSortedByValueNo(ArgLocs) &&
	"Argument Location list must be sorted before lowering");

	// Walk the register/memloc assignments, inserting copies/loads. In the case
	// of tail call optimization arguments are handle later.
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++OutIndex) {
	assert(OutIndex < Outs.size() && "Invalid Out index");
	// Skip inalloca/preallocated arguments, they have already been written.
	ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
	if (Flags.isInAlloca() \|\| Flags.isPreallocated())
	continue;

	CCValAssign &VA = ArgLocs[I];
	EVT RegVT = VA.getLocVT();
	SDValue Arg = OutVals[OutIndex];
	bool isByVal = Flags.isByVal();

	// Promote the value if needed.
	switch (VA.getLocInfo()) {
	default: llvm_unreachable("Unknown loc info!");
	case CCValAssign::Full: break;
	case CCValAssign::SExt:
	Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::ZExt:
	Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::AExt:
	if (Arg.getValueType().isVector() &&
	Arg.getValueType().getVectorElementType() == MVT::i1)
	Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
	else if (RegVT.is128BitVector()) {
	// Special case: passing MMX values in XMM registers.
	Arg = DAG.getBitcast(MVT::i64, Arg);
	Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
	Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
	} else
	Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::BCvt:
	Arg = DAG.getBitcast(RegVT, Arg);
	break;
	case CCValAssign::Indirect: {
	if (isByVal) {
	// Memcpy the argument to a temporary stack slot to prevent
	// the caller from seeing any modifications the callee may make
	// as guaranteed by the `byval` attribute.
	int FrameIdx = MF.getFrameInfo().CreateStackObject(
	Flags.getByValSize(),
	std::max(Align(16), Flags.getNonZeroByValAlign()), false);
	SDValue StackSlot =
	DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
	Chain =
	CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
	// From now on treat this as a regular pointer
	Arg = StackSlot;
	isByVal = false;
	} else {
	// Store the argument.
	SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
	int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
	Chain = DAG.getStore(
	Chain, dl, Arg, SpillSlot,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	Arg = SpillSlot;
	}
	break;
	}
	}

	if (VA.needsCustom()) {
	assert(VA.getValVT() == MVT::v64i1 &&
	"Currently the only custom case is when we split v64i1 to 2 regs");
	// Split v64i1 value into two registers
	Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
	} else if (VA.isRegLoc()) {
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	const TargetOptions &Options = DAG.getTarget().Options;
	if (Options.EmitCallSiteInfo)
	CSInfo.emplace_back(VA.getLocReg(), I);
	if (isVarArg && IsWin64) {
	// Win64 ABI requires argument XMM reg to be copied to the corresponding
	// shadow reg if callee is a varargs function.
	Register ShadowReg;
	switch (VA.getLocReg()) {
	case X86::XMM0: ShadowReg = X86::RCX; break;
	case X86::XMM1: ShadowReg = X86::RDX; break;
	case X86::XMM2: ShadowReg = X86::R8; break;
	case X86::XMM3: ShadowReg = X86::R9; break;
	}
	if (ShadowReg)
	RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
	}
	} else if (!IsSibcall && (!isTailCall \|\| isByVal)) {
	assert(VA.isMemLoc());
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
	getPointerTy(DAG.getDataLayout()));
	MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
	dl, DAG, VA, Flags, isByVal));
	}
	}

	if (!MemOpChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

	if (Subtarget.isPICStyleGOT()) {
	// ELF / PIC requires GOT in the EBX register before function calls via PLT
	// GOT pointer.
	if (!isTailCall) {
	RegsToPass.push_back(std::make_pair(
	Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
	getPointerTy(DAG.getDataLayout()))));
	} else {
	// If we are tail calling and generating PIC/GOT style code load the
	// address of the callee into ECX. The value in ecx is used as target of
	// the tail jump. This is done to circumvent the ebx/callee-saved problem
	// for tail calls on PIC/GOT architectures. Normally we would just put the
	// address of GOT into ebx and then call target@PLT. But for tail calls
	// ebx would be restored (since ebx is callee saved) before jumping to the
	// target@PLT.

	// Note: The actual moving to ECX is done further down.
	GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
	if (G && !G->getGlobal()->hasLocalLinkage() &&
	G->getGlobal()->hasDefaultVisibility())
	Callee = LowerGlobalAddress(Callee, DAG);
	else if (isa<ExternalSymbolSDNode>(Callee))
	Callee = LowerExternalSymbol(Callee, DAG);
	}
	}

	if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
	// From AMD64 ABI document:
	// For calls that may call functions that use varargs or stdargs
	// (prototype-less calls or calls to functions containing ellipsis (...) in
	// the declaration) %al is used as hidden argument to specify the number
	// of SSE registers used. The contents of %al do not need to match exactly
	// the number of registers, but must be an ubound on the number of SSE
	// registers used and is in the range 0 - 8 inclusive.

	// Count the number of XMM registers allocated.
	static const MCPhysReg XMMArgRegs[] = {
	X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
	X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
	};
	unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
	assert((Subtarget.hasSSE1() \|\| !NumXMMRegs)
	&& "SSE registers cannot be used when SSE is disabled");
	RegsToPass.push_back(std::make_pair(Register(X86::AL),
	DAG.getConstant(NumXMMRegs, dl,
	MVT::i8)));
	}

	if (isVarArg && IsMustTail) {
	const auto &Forwards = X86Info->getForwardedMustTailRegParms();
	for (const auto &F : Forwards) {
	SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
	RegsToPass.push_back(std::make_pair(F.PReg, Val));
	}
	}

	// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
	// don't need this because the eligibility check rejects calls that require
	// shuffling arguments passed in memory.
	if (!IsSibcall && isTailCall) {
	// Force all the incoming stack arguments to be loaded from the stack
	// before any new outgoing arguments are stored to the stack, because the
	// outgoing stack slots may alias the incoming argument stack slots, and
	// the alias isn't otherwise explicit. This is slightly more conservative
	// than necessary, because it means that each store effectively depends
	// on every argument instead of just those arguments it would clobber.
	SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

	SmallVector<SDValue, 8> MemOpChains2;
	SDValue FIN;
	int FI = 0;
	for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++OutsIndex) {
	CCValAssign &VA = ArgLocs[I];

	if (VA.isRegLoc()) {
	if (VA.needsCustom()) {
	assert((CallConv == CallingConv::X86_RegCall) &&
	"Expecting custom case only in regcall calling convention");
	// This means that we are in special case where one argument was
	// passed through two register locations - Skip the next location
	++I;
	}

	continue;
	}

	assert(VA.isMemLoc());
	SDValue Arg = OutVals[OutsIndex];
	ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
	// Skip inalloca/preallocated arguments. They don't require any work.
	if (Flags.isInAlloca() \|\| Flags.isPreallocated())
	continue;
	// Create frame index.
	int32_t Offset = VA.getLocMemOffset()+FPDiff;
	uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
	FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
	FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));

	if (Flags.isByVal()) {
	// Copy relative to framepointer.
	SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
	getPointerTy(DAG.getDataLayout()));
	Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
	StackPtr, Source);

	MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
	ArgChain,
	Flags, DAG, dl));
	} else {
	// Store relative to framepointer.
	MemOpChains2.push_back(DAG.getStore(
	ArgChain, dl, Arg, FIN,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
	}
	}

	if (!MemOpChains2.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

	// Store the return address to the appropriate stack slot.
	Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
	getPointerTy(DAG.getDataLayout()),
	RegInfo->getSlotSize(), FPDiff, dl);
	}

	// Build a sequence of copy-to-reg nodes chained together with token chain
	// and flag operands which copy the outgoing args into registers.
	SDValue InFlag;
	for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
	Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
	RegsToPass[i].second, InFlag);
	InFlag = Chain.getValue(1);
	}

	if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
	assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
	// In the 64-bit large code model, we have to make all calls
	// through a register, since the call instruction's 32-bit
	// pc-relative offset may not be large enough to hold the whole
	// address.
	} else if (Callee->getOpcode() == ISD::GlobalAddress \|\|
	Callee->getOpcode() == ISD::ExternalSymbol) {
	// Lower direct calls to global addresses and external symbols. Setting
	// ForCall to true here has the effect of removing WrapperRIP when possible
	// to allow direct calls to be selected without first materializing the
	// address into a register.
	Callee = LowerGlobalOrExternal(Callee, DAG, /ForCall=/true);
	} else if (Subtarget.isTarget64BitILP32() &&
	Callee->getValueType(0) == MVT::i32) {
	// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
	Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
	}

	// Returns a chain & a flag for retval copy to use.
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	SmallVector<SDValue, 8> Ops;

	if (!IsSibcall && isTailCall && !IsMustTail) {
	Chain = DAG.getCALLSEQ_END(Chain,
	DAG.getIntPtrConstant(NumBytesToPop, dl, true),
	DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
	InFlag = Chain.getValue(1);
	}

	Ops.push_back(Chain);
	Ops.push_back(Callee);

	if (isTailCall)
	Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));

	// Add argument registers to the end of the list so that they are known live
	// into the call.
	for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
	Ops.push_back(DAG.getRegister(RegsToPass[i].first,
	RegsToPass[i].second.getValueType()));

	// Add a register mask operand representing the call-preserved registers.
	// If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
	// set X86_INTR calling convention because it has the same CSR mask
	// (same preserved registers).
	const uint32_t *Mask = RegInfo->getCallPreservedMask(
	MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
	assert(Mask && "Missing call preserved mask for calling convention");

	// If this is an invoke in a 32-bit function using a funclet-based
	// personality, assume the function clobbers all registers. If an exception
	// is thrown, the runtime will not restore CSRs.
	// FIXME: Model this more precisely so that we can register allocate across
	// the normal edge and spill and fill across the exceptional edge.
	if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
	const Function &CallerFn = MF.getFunction();
	EHPersonality Pers =
	CallerFn.hasPersonalityFn()
	? classifyEHPersonality(CallerFn.getPersonalityFn())
	: EHPersonality::Unknown;
	if (isFuncletEHPersonality(Pers))
	Mask = RegInfo->getNoPreservedMask();
	}

	// Define a new register mask from the existing mask.
	uint32_t *RegMask = nullptr;

	// In some calling conventions we need to remove the used physical registers
	// from the reg mask.
	if (CallConv == CallingConv::X86_RegCall \|\| HasNCSR) {
	const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

	// Allocate a new Reg Mask and copy Mask.
	RegMask = MF.allocateRegMask();
	unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
	memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);

	// Make sure all sub registers of the argument registers are reset
	// in the RegMask.
	for (auto const &RegPair : RegsToPass)
	for (MCSubRegIterator SubRegs(RegPair.first, TRI, /IncludeSelf=/true);
	SubRegs.isValid(); ++SubRegs)
	RegMask[SubRegs / 32] &= ~(1u << (SubRegs % 32));

	// Create the RegMask Operand according to our updated mask.
	Ops.push_back(DAG.getRegisterMask(RegMask));
	} else {
	// Create the RegMask Operand according to the static mask.
	Ops.push_back(DAG.getRegisterMask(Mask));
	}

	if (InFlag.getNode())
	Ops.push_back(InFlag);

	if (isTailCall) {
	// We used to do:
	//// If this is the first return lowered for this function, add the regs
	//// to the liveout set for the function.
	// This isn't right, although it's probably harmless on x86; liveouts
	// should be computed from returns not tail calls. Consider a void
	// function making a tail call to a function returning int.
	MF.getFrameInfo().setHasTailCall();
	SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
	DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
	return Ret;
	}

	if (HasNoCfCheck && IsCFProtectionSupported) {
	Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
	} else {
	Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
	}
	InFlag = Chain.getValue(1);
	DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
	DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));

	// Save heapallocsite metadata.
	if (CLI.CB)
	if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
	DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);

	// Create the CALLSEQ_END node.
	unsigned NumBytesForCalleeToPop;
	if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
	DAG.getTarget().Options.GuaranteedTailCallOpt))
	NumBytesForCalleeToPop = NumBytes; // Callee pops everything
	else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
	!Subtarget.getTargetTriple().isOSMSVCRT() &&
	SR == StackStructReturn)
	// If this is a call to a struct-return function, the callee
	// pops the hidden struct pointer, so we have to push it back.
	// This is common for Darwin/X86, Linux & Mingw32 targets.
	// For MSVC Win32 targets, the caller pops the hidden struct pointer.
	NumBytesForCalleeToPop = 4;
	else
	NumBytesForCalleeToPop = 0; // Callee pops nothing.

	// Returns a flag for retval copy to use.
	if (!IsSibcall) {
	Chain = DAG.getCALLSEQ_END(Chain,
	DAG.getIntPtrConstant(NumBytesToPop, dl, true),
	DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
	true),
	InFlag, dl);
	InFlag = Chain.getValue(1);
	}

	// Handle result values, copying them out of physregs into vregs that we
	// return.
	return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
	InVals, RegMask);
	}

	//===----------------------------------------------------------------------===//
	// Fast Calling Convention (tail call) implementation
	//===----------------------------------------------------------------------===//

	// Like the stdcall convention, the callee cleans up the arguments, except that
	// ECX is reserved for storing the tail-called function address. Only 2
	// registers are free for argument passing (inreg). Tail call optimization is
	// performed provided:
	// * tailcallopt is enabled
	// * caller/callee are fastcc
	// On X86_64 architecture with GOT-style position independent code only local
	// (within module) calls are supported at the moment.
	// To keep the stack aligned according to platform abi the function
	// GetAlignedArgumentStackSize ensures that argument delta is always multiples
	// of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
	// If a tail called function callee has more arguments than the caller the
	// caller needs to make sure that there is room to move the RETADDR to. This is
	// achieved by reserving an area the size of the argument delta right after the
	// original RETADDR, but before the saved framepointer or the spilled registers
	// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
	// stack layout:
	// arg1
	// arg2
	// RETADDR
	// [ new RETADDR
	// move area ]
	// (possible EBP)
	// ESI
	// EDI
	// local1 ..

	/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
	/// requirement.
	unsigned
	X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
	SelectionDAG &DAG) const {
	const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
	const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
	assert(StackSize % SlotSize == 0 &&
	"StackSize must be a multiple of SlotSize");
	return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
	}

	/// Return true if the given stack call argument is already available in the
	/// same position (relatively) of the caller's incoming argument stack.
	///
	/// \param Arg    The outgoing argument value.
	/// \param Offset The stack offset at which the callee expects the argument.
	/// \param Flags  Argument flags; byval and zext/sext are consulted.
	/// \param MFI    Frame info used to inspect the candidate frame object.
	/// \param MRI    Register info used to find the definition of a virtual
	///               register feeding \p Arg.
	/// \param TII    Instruction info used to recognise loads from stack slots.
	/// \param VA     The location assigned to the argument.
	/// \returns true only when \p Arg resolves to a fixed frame object whose
	///          offset, size, mutability and extension flags all match.
	static bool MatchingStackOffset(SDValue Arg, unsigned Offset,
	                                ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI,
	                                const MachineRegisterInfo *MRI,
	                                const X86InstrInfo *TII,
	                                const CCValAssign &VA) {
	  unsigned Bytes = Arg.getValueSizeInBits() / 8;

	  for (;;) {
	    // Look through nodes that don't alter the bits of the incoming value.
	    unsigned Op = Arg.getOpcode();
	    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND ||
	        Op == ISD::BITCAST) {
	      Arg = Arg.getOperand(0);
	      continue;
	    }
	    // A truncate of an AssertZext to the same type is also looked through.
	    if (Op == ISD::TRUNCATE) {
	      const SDValue &TruncInput = Arg.getOperand(0);
	      if (TruncInput.getOpcode() == ISD::AssertZext &&
	          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
	              Arg.getValueType()) {
	        Arg = TruncInput.getOperand(0);
	        continue;
	      }
	    }
	    break;
	  }

	  // INT_MAX is a sentinel meaning "no frame index found yet".
	  int FI = INT_MAX;
	  if (Arg.getOpcode() == ISD::CopyFromReg) {
	    // The value comes from a virtual register: inspect its defining
	    // instruction to find the frame index it was produced from.
	    Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
	    if (!Register::isVirtualRegister(VR))
	      return false;
	    MachineInstr *Def = MRI->getVRegDef(VR);
	    if (!Def)
	      return false;
	    if (!Flags.isByVal()) {
	      if (!TII->isLoadFromStackSlot(*Def, FI))
	        return false;
	    } else {
	      // For byval the register holds an address; accept only an LEA of a
	      // frame index and compare against the byval size instead.
	      unsigned Opcode = Def->getOpcode();
	      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
	           Opcode == X86::LEA64_32r) &&
	          Def->getOperand(1).isFI()) {
	        FI = Def->getOperand(1).getIndex();
	        Bytes = Flags.getByValSize();
	      } else
	        return false;
	    }
	  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
	    if (Flags.isByVal())
	      // ByVal argument is passed in as a pointer but it's now being
	      // dereferenced. e.g.
	      // define @foo(%struct.X* %A) {
	      //   tail call @bar(%struct.X* byval %A)
	      // }
	      return false;
	    SDValue Ptr = Ld->getBasePtr();
	    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
	    if (!FINode)
	      return false;
	    FI = FINode->getIndex();
	  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
	    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
	    FI = FINode->getIndex();
	    Bytes = Flags.getByValSize();
	  } else
	    return false;

	  assert(FI != INT_MAX);
	  if (!MFI.isFixedObjectIndex(FI))
	    return false;

	  if (Offset != MFI.getObjectOffset(FI))
	    return false;

	  // If this is not byval, check that the argument stack object is immutable.
	  // inalloca and argument copy elision can create mutable argument stack
	  // objects. Byval objects can be mutated, but a byval call intends to pass
	  // the mutated memory.
	  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
	    return false;

	  if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
	    // If the argument location is wider than the argument type, check that
	    // any extension flags match.
	    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
	        Flags.isSExt() != MFI.isObjectSExt(FI)) {
	      return false;
	    }
	  }

	  return Bytes == MFI.getObjectSize(FI);
	}

	/// Check whether the call is eligible for tail call optimization. Targets
	/// that want to do tail call optimization should implement this function.
	///
	/// When guaranteed TCO is in effect (the GuaranteedTailCallOpt target option
	/// or a callee using CallingConv::Tail) the answer depends only on the
	/// calling conventions. Otherwise the call is checked against the
	/// conservative "sibcall" rules below, which require no ABI changes.
	///
	/// \param Callee            The call target.
	/// \param CalleeCC          Calling convention of the callee.
	/// \param isVarArg          Whether the call is variadic.
	/// \param isCalleeStructRet Whether the callee uses struct-return semantics.
	/// \param isCallerStructRet Whether the caller uses struct-return semantics.
	/// \param RetTy             Return type of the callee.
	/// \param Outs              Outgoing argument descriptions.
	/// \param OutVals           Outgoing argument values, parallel to \p Outs.
	/// \param Ins               Descriptions of the call's result values.
	/// \param DAG               The selection DAG being built.
	/// \returns true if the call may be lowered as a tail call.
	bool X86TargetLowering::IsEligibleForTailCallOptimization(
	    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
	    bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
	    const SmallVectorImpl<ISD::OutputArg> &Outs,
	    const SmallVectorImpl<SDValue> &OutVals,
	    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
	  if (!mayTailCallThisCC(CalleeCC))
	    return false;

	  MachineFunction &MF = DAG.getMachineFunction();
	  const Function &CallerF = MF.getFunction();

	  // If the function return type is x86_fp80 and the callee return type is
	  // not, then the FP_EXTEND of the call result is not a nop. It's not safe to
	  // perform a tailcall optimization here.
	  if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
	    return false;

	  CallingConv::ID CallerCC = CallerF.getCallingConv();
	  bool CCMatch = CallerCC == CalleeCC;
	  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
	  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
	  bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
	                        CalleeCC == CallingConv::Tail;

	  // Win64 functions have extra shadow space for argument homing. Don't do the
	  // sibcall if the caller and callee have mismatched expectations for this
	  // space.
	  if (IsCalleeWin64 != IsCallerWin64)
	    return false;

	  // If -tailcallopt is specified, make fastcc functions tail-callable. In
	  // this mode only the calling conventions decide eligibility.
	  if (IsGuaranteeTCO)
	    return canGuaranteeTCO(CalleeCC) && CCMatch;

	  // Look for obvious safe cases to perform tail call optimization that do not
	  // require ABI changes. This is what gcc calls sibcall.

	  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
	  // emit a special epilogue.
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  if (RegInfo->needsStackRealignment(MF))
	    return false;

	  // Also avoid sibcall optimization if either caller or callee uses struct
	  // return semantics.
	  if (isCalleeStructRet || isCallerStructRet)
	    return false;

	  // Do not sibcall optimize vararg calls unless all arguments are passed via
	  // registers.
	  LLVMContext &C = *DAG.getContext();
	  if (isVarArg && !Outs.empty()) {
	    // Optimizing for varargs on Win64 is unlikely to be safe without
	    // additional testing.
	    if (IsCalleeWin64 || IsCallerWin64)
	      return false;

	    SmallVector<CCValAssign, 16> ArgLocs;
	    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
	    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
	      if (!ArgLocs[i].isRegLoc())
	        return false;
	  }

	  // If the call result is in ST0 / ST1, it needs to be popped off the x87
	  // stack. Therefore, if it's not used by the call it is not safe to optimize
	  // this into a sibcall.
	  bool Unused = false;
	  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
	    if (!Ins[i].Used) {
	      Unused = true;
	      break;
	    }
	  }
	  if (Unused) {
	    SmallVector<CCValAssign, 16> RVLocs;
	    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
	    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
	    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
	      CCValAssign &VA = RVLocs[i];
	      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
	        return false;
	    }
	  }

	  // Check that the call results are passed in the same way.
	  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
	                                  RetCC_X86, RetCC_X86))
	    return false;
	  // The callee has to preserve all registers the caller needs to preserve.
	  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
	  if (!CCMatch) {
	    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
	    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
	      return false;
	  }

	  // Bytes of stack the callee's arguments occupy; stays 0 when there are no
	  // outgoing arguments.
	  unsigned StackArgsSize = 0;

	  // If the callee takes no arguments then go on to check the results of the
	  // call.
	  if (!Outs.empty()) {
	    // Check if stack adjustment is needed. For now, do not do this if any
	    // argument is passed on the stack.
	    SmallVector<CCValAssign, 16> ArgLocs;
	    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	    // Allocate shadow area for Win64
	    if (IsCalleeWin64)
	      CCInfo.AllocateStack(32, Align(8));

	    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
	    StackArgsSize = CCInfo.getNextStackOffset();

	    if (CCInfo.getNextStackOffset()) {
	      // Check if the arguments are already laid out in the right way as
	      // the caller's fixed stack objects.
	      MachineFrameInfo &MFI = MF.getFrameInfo();
	      const MachineRegisterInfo *MRI = &MF.getRegInfo();
	      const X86InstrInfo *TII = Subtarget.getInstrInfo();
	      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	        CCValAssign &VA = ArgLocs[i];
	        SDValue Arg = OutVals[i];
	        ISD::ArgFlagsTy Flags = Outs[i].Flags;
	        if (VA.getLocInfo() == CCValAssign::Indirect)
	          return false;
	        if (!VA.isRegLoc()) {
	          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
	                                   MFI, MRI, TII, VA))
	            return false;
	        }
	      }
	    }

	    bool PositionIndependent = isPositionIndependent();
	    // If the tailcall address may be in a register, then make sure it's
	    // possible to register allocate for it. In 32-bit, the call address can
	    // only target EAX, EDX, or ECX since the tail call must be scheduled after
	    // callee-saved registers are restored. These happen to be the same
	    // registers used to pass 'inreg' arguments so watch out for those.
	    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
	                                  !isa<ExternalSymbolSDNode>(Callee)) ||
	                                 PositionIndependent)) {
	      unsigned NumInRegs = 0;
	      // In PIC we need an extra register to formulate the address computation
	      // for the callee.
	      unsigned MaxInRegs = PositionIndependent ? 2 : 3;

	      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	        CCValAssign &VA = ArgLocs[i];
	        if (!VA.isRegLoc())
	          continue;
	        Register Reg = VA.getLocReg();
	        switch (Reg) {
	        default: break;
	        case X86::EAX: case X86::EDX: case X86::ECX:
	          // Reject as soon as the arguments use MaxInRegs of these registers.
	          if (++NumInRegs == MaxInRegs)
	            return false;
	          break;
	        }
	      }
	    }

	    const MachineRegisterInfo &MRI = MF.getRegInfo();
	    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
	      return false;
	  }

	  bool CalleeWillPop =
	      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
	                       MF.getTarget().Options.GuaranteedTailCallOpt);

	  if (unsigned BytesToPop =
	          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
	    // If we have bytes to pop, the callee must pop them.
	    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
	    if (!CalleePopMatches)
	      return false;
	  } else if (CalleeWillPop && StackArgsSize > 0) {
	    // If we don't have bytes to pop, make sure the callee doesn't pop any.
	    return false;
	  }

	  return true;
	}

	/// Create the FastISel instance for this target by forwarding both arguments
	/// unchanged to X86::createFastISel.
	FastISel *X86TargetLowering::createFastISel(
	    FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const {
	  FastISel *const Selector = X86::createFastISel(funcInfo, libInfo);
	  return Selector;
	}

	//===----------------------------------------------------------------------===//
	// Other Lowering Hooks
	//===----------------------------------------------------------------------===//

	/// Return true if \p Op has exactly one use and its node is a normal load
	/// (as judged by ISD::isNormalLoad).
	static bool MayFoldLoad(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  return ISD::isNormalLoad(Op.getNode());
	}

	/// Return true if \p Op has exactly one use and that single user is a normal
	/// store (as judged by ISD::isNormalStore).
	static bool MayFoldIntoStore(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  // With a single use, the first entry of the use list is the only user.
	  return ISD::isNormalStore(*Op.getNode()->use_begin());
	}

	/// Return true if \p Op has exactly one use and that single user is an
	/// ISD::ZERO_EXTEND node.
	static bool MayFoldIntoZeroExtend(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  const unsigned UserOpcode = Op.getNode()->use_begin()->getOpcode();
	  return UserOpcode == ISD::ZERO_EXTEND;
	}

	/// Return true if \p Opcode is one of the X86ISD shuffle opcodes enumerated
	/// below; every other opcode yields false.
	static bool isTargetShuffle(unsigned Opcode) {
	  switch (Opcode) {
	  case X86ISD::BLENDI:     case X86ISD::PSHUFB:    case X86ISD::PSHUFD:
	  case X86ISD::PSHUFHW:    case X86ISD::PSHUFLW:   case X86ISD::SHUFP:
	  case X86ISD::INSERTPS:   case X86ISD::EXTRQI:    case X86ISD::INSERTQI:
	  case X86ISD::VALIGN:     case X86ISD::PALIGNR:   case X86ISD::VSHLDQ:
	  case X86ISD::VSRLDQ:     case X86ISD::MOVLHPS:   case X86ISD::MOVHLPS:
	  case X86ISD::MOVSHDUP:   case X86ISD::MOVSLDUP:  case X86ISD::MOVDDUP:
	  case X86ISD::MOVSS:      case X86ISD::MOVSD:     case X86ISD::UNPCKL:
	  case X86ISD::UNPCKH:     case X86ISD::VBROADCAST:
	  case X86ISD::VPERMILPI:  case X86ISD::VPERMILPV:
	  case X86ISD::VPERM2X128: case X86ISD::SHUF128:   case X86ISD::VPERMIL2:
	  case X86ISD::VPERMI:     case X86ISD::VPPERM:    case X86ISD::VPERMV:
	  case X86ISD::VPERMV3:    case X86ISD::VZEXT_MOVL:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Return true if \p Opcode is one of the variable-mask shuffle opcodes listed
	/// below, or one of the logic opcodes the file treats as 'faux' target
	/// shuffles; every other opcode yields false.
	static bool isTargetShuffleVariableMask(unsigned Opcode) {
	  switch (Opcode) {
	  // Target Shuffles.
	  case X86ISD::PSHUFB:
	  case X86ISD::VPERMILPV:
	  case X86ISD::VPERMIL2:
	  case X86ISD::VPPERM:
	  case X86ISD::VPERMV:
	  case X86ISD::VPERMV3:
	  // 'Faux' Target Shuffles.
	  case ISD::OR:
	  case ISD::AND:
	  case X86ISD::ANDNP:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Return true if \p Op, after looking through any chain of
	/// ISD::EXTRACT_SUBVECTOR nodes (following operand 0 each time), is an
	/// X86ISD::VBROADCAST or X86ISD::VBROADCAST_LOAD node.
	static bool isTargetShuffleSplat(SDValue Op) {
	  unsigned Opcode = Op.getOpcode();
	  if (Opcode == ISD::EXTRACT_SUBVECTOR)
	    return isTargetShuffleSplat(Op.getOperand(0));
	  return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
	}

	/// Return a frame-index node for the function's return-address slot.
	///
	/// The index is cached in X86MachineFunctionInfo. A cached value of 0 is
	/// treated as "not created yet", in which case a fixed object one slot in
	/// size is created at offset -SlotSize and its index is recorded.
	SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
	  MachineFunction &Fn = DAG.getMachineFunction();
	  X86MachineFunctionInfo *X86FI = Fn.getInfo<X86MachineFunctionInfo>();

	  int RAIndex = X86FI->getRAIndex();
	  if (RAIndex == 0) {
	    // Set up a frame object for the return address.
	    const unsigned SlotBytes = Subtarget.getRegisterInfo()->getSlotSize();
	    RAIndex = Fn.getFrameInfo().CreateFixedObject(
	        SlotBytes, -(int64_t)SlotBytes, false);
	    X86FI->setRAIndex(RAIndex);
	  }

	  const auto PtrVT = getPointerTy(DAG.getDataLayout());
	  return DAG.getFrameIndex(RAIndex, PtrVT);
	}

	/// Return true if \p Offset can be used as a displacement under code model
	/// \p M.
	///
	/// \param Offset                  The displacement to check.
	/// \param M                       The code model in effect.
	/// \param hasSymbolicDisplacement Whether the address also carries a symbolic
	///                                displacement; if not, any offset that fits
	///                                in a signed 32-bit immediate is accepted.
	/// \returns true if the offset is acceptable, false otherwise.
	bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
	                                       bool hasSymbolicDisplacement) {
	  // Offset should fit into 32 bit immediate field.
	  if (!isInt<32>(Offset))
	    return false;

	  // If we don't have a symbolic displacement - we don't have any extra
	  // restrictions.
	  if (!hasSymbolicDisplacement)
	    return true;

	  // FIXME: Some tweaks might be needed for medium code model.
	  if (M != CodeModel::Small && M != CodeModel::Kernel)
	    return false;

	  // For small code model we assume that latest object is 16MB before end of 31
	  // bits boundary. We may also accept pretty large negative constants knowing
	  // that all objects are in the positive half of address space.
	  if (M == CodeModel::Small && Offset < 16 * 1024 * 1024)
	    return true;

	  // For kernel code model we know that all objects reside in the negative half
	  // of 32bits address space. We may not accept negative offsets, since they may
	  // be just off and we may accept pretty large positive ones.
	  if (M == CodeModel::Kernel && Offset >= 0)
	    return true;

	  return false;
	}

	/// Decide whether a callee using the given calling convention pops its own
	/// arguments. Callee pop is necessary to support tail calls.
	///
	/// \param CallingConv  The calling convention being queried.
	/// \param is64Bit      Whether the target is 64-bit.
	/// \param IsVarArg     Whether the function is variadic.
	/// \param GuaranteeTCO Whether guaranteed tail call optimization is requested.
	/// \returns true if the callee pops its arguments.
	bool X86::isCalleePop(CallingConv::ID CallingConv,
	                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
	  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
	  // can guarantee TCO. Variadic functions are never forced.
	  const bool ForcedForTCO =
	      !IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO);
	  if (ForcedForTCO)
	    return true;

	  switch (CallingConv) {
	  case CallingConv::X86_StdCall:
	  case CallingConv::X86_FastCall:
	  case CallingConv::X86_ThisCall:
	  case CallingConv::X86_VectorCall:
	    // These conventions are callee-pop only when not compiling for 64-bit.
	    return !is64Bit;
	  default:
	    return false;
	  }
	}

	/// Classify an integer X86 condition code by signedness.
	///
	/// \returns true for the signed orderings (G, GE, L, LE) and false for
	/// E, NE, B, A, BE and AE. Any other condition code is treated as
	/// unreachable.
	static bool isX86CCSigned(unsigned X86CC) {
	  switch (X86CC) {
	  case X86::COND_L:
	  case X86::COND_LE:
	  case X86::COND_G:
	  case X86::COND_GE:
	    return true;
	  case X86::COND_B:
	  case X86::COND_BE:
	  case X86::COND_A:
	  case X86::COND_AE:
	  case X86::COND_E:
	  case X86::COND_NE:
	    return false;
	  default:
	    llvm_unreachable("Invalid integer condition!");
	  }
	}

	/// Map an integer ISD::CondCode onto the corresponding X86 condition code.
	/// Condition codes outside the ten integer predicates are unreachable.
	static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
	  switch (SetCCOpcode) {
	  // Equality.
	  case ISD::SETEQ:
	    return X86::COND_E;
	  case ISD::SETNE:
	    return X86::COND_NE;
	  // Signed orderings.
	  case ISD::SETLT:
	    return X86::COND_L;
	  case ISD::SETLE:
	    return X86::COND_LE;
	  case ISD::SETGT:
	    return X86::COND_G;
	  case ISD::SETGE:
	    return X86::COND_GE;
	  // Unsigned orderings.
	  case ISD::SETULT:
	    return X86::COND_B;
	  case ISD::SETULE:
	    return X86::COND_BE;
	  case ISD::SETUGT:
	    return X86::COND_A;
	  case ISD::SETUGE:
	    return X86::COND_AE;
	  default:
	    llvm_unreachable("Invalid integer condition!");
	  }
	}

	/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
	/// condition code, returning the condition code and the LHS/RHS of the
	/// comparison to make.
	///
	/// \p LHS and \p RHS are in/out: the integer path may replace \p RHS with a
	/// zero constant, and the floating-point path may swap the two operands.
	/// \p isFP selects the floating-point path. For floating point, SETOEQ and
	/// SETUNE yield X86::COND_INVALID.
	static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
	bool isFP, SDValue &LHS, SDValue &RHS,
	SelectionDAG &DAG) {
	if (!isFP) {
	// Integer compares against a few special constants can be answered from the
	// sign flag or rewritten as a compare against zero.
	if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
	if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
	// X > -1 -> compare X against 0 and use the "not sign" condition.
	RHS = DAG.getConstant(0, DL, RHS.getValueType());
	return X86::COND_NS;
	}
	if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
	// X < 0 -> RHS is already 0; use the "sign" condition.
	return X86::COND_S;
	}
	if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
	// X >= 0 -> RHS is already 0; use the "not sign" condition.
	return X86::COND_NS;
	}
	if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
	// X < 1 -> X <= 0
	RHS = DAG.getConstant(0, DL, RHS.getValueType());
	return X86::COND_LE;
	}
	}

	return TranslateIntegerX86CC(SetCCOpcode);
	}

	// First determine if it is required or is profitable to flip the operands.

	// If LHS is a foldable load, but RHS is not, flip the condition.
	if (ISD::isNON_EXTLoad(LHS.getNode()) &&
	!ISD::isNON_EXTLoad(RHS.getNode())) {
	SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
	std::swap(LHS, RHS);
	}

	// These predicates are handled by swapping the operands; they appear as the
	// "flipped" entries in the translation switch below.
	switch (SetCCOpcode) {
	default: break;
	case ISD::SETOLT:
	case ISD::SETOLE:
	case ISD::SETUGT:
	case ISD::SETUGE:
	std::swap(LHS, RHS);
	break;
	}

	// On a floating point condition, the flags are set as follows:
	// ZF | PF | CF | op
	//  0 |  0 |  0 | X > Y
	//  0 |  0 |  1 | X < Y
	//  1 |  0 |  0 | X == Y
	//  1 |  1 |  1 | unordered
	switch (SetCCOpcode) {
	default: llvm_unreachable("Condcode should be pre-legalized away");
	case ISD::SETUEQ:
	case ISD::SETEQ: return X86::COND_E;
	case ISD::SETOLT: // flipped
	case ISD::SETOGT:
	case ISD::SETGT: return X86::COND_A;
	case ISD::SETOLE: // flipped
	case ISD::SETOGE:
	case ISD::SETGE: return X86::COND_AE;
	case ISD::SETUGT: // flipped
	case ISD::SETULT:
	case ISD::SETLT: return X86::COND_B;
	case ISD::SETUGE: // flipped
	case ISD::SETULE:
	case ISD::SETLE: return X86::COND_BE;
	case ISD::SETONE:
	case ISD::SETNE: return X86::COND_NE;
	case ISD::SETUO: return X86::COND_P;
	case ISD::SETO: return X86::COND_NP;
	// No single condition code is returned for these two predicates.
	case ISD::SETOEQ:
	case ISD::SETUNE: return X86::COND_INVALID;
	}
	}

	/// Is there a floating point cmov for the specific X86 condition code?
	/// Current x86 isa includes the following FP cmov instructions:
	/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
	///
	/// \returns true for COND_B, COND_BE, COND_E, COND_P, COND_AE, COND_A,
	/// COND_NE and COND_NP; false for every other condition code.
	static bool hasFPCMov(unsigned X86CC) {
	  switch (X86CC) {
	  case X86::COND_E:
	  case X86::COND_NE:
	  case X86::COND_B:
	  case X86::COND_BE:
	  case X86::COND_A:
	  case X86::COND_AE:
	  case X86::COND_P:
	  case X86::COND_NP:
	    return true;
	  default:
	    return false;
	  }
	}


	/// Describe the memory access performed by an X86 intrinsic.
	///
	/// Fills in \p Info (opcode, pointer value, memory VT, alignment and
	/// load/store flag) for the truncating-store, gather and scatter intrinsic
	/// kinds.
	///
	/// \returns true if \p Info was populated; false if getIntrinsicWithChain()
	/// has no entry for \p Intrinsic or its kind is not one handled here.
	bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	                                           const CallInst &I,
	                                           MachineFunction &MF,
	                                           unsigned Intrinsic) const {
	  const IntrinsicData *Entry = getIntrinsicWithChain(Intrinsic);
	  if (!Entry)
	    return false;

	  // Defaults; note these are written even if the kind is not handled below.
	  Info.flags = MachineMemOperand::MONone;
	  Info.offset = 0;

	  switch (Entry->Type) {
	  default:
	    return false;

	  case TRUNCATE_TO_MEM_VI8:
	  case TRUNCATE_TO_MEM_VI16:
	  case TRUNCATE_TO_MEM_VI32: {
	    // Truncating store: the destination pointer is operand 0 and the source
	    // vector is operand 1. The memory type keeps the source element count
	    // but uses the narrower element type named by the intrinsic kind.
	    MVT StoredEltVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
	    switch (Entry->Type) {
	    case TRUNCATE_TO_MEM_VI8:
	      StoredEltVT = MVT::i8;
	      break;
	    case TRUNCATE_TO_MEM_VI16:
	      StoredEltVT = MVT::i16;
	      break;
	    case TRUNCATE_TO_MEM_VI32:
	      StoredEltVT = MVT::i32;
	      break;
	    default:
	      break;
	    }
	    const MVT SrcVT = MVT::getVT(I.getArgOperand(1)->getType());

	    Info.opc = ISD::INTRINSIC_VOID;
	    Info.ptrVal = I.getArgOperand(0);
	    Info.memVT = MVT::getVectorVT(StoredEltVT, SrcVT.getVectorNumElements());
	    Info.align = Align(1);
	    Info.flags |= MachineMemOperand::MOStore;
	    break;
	  }

	  case GATHER:
	  case GATHER_AVX2: {
	    // Gather: the loaded data type is the call's result type; the number of
	    // elements accessed is the smaller of the data and index element counts.
	    const MVT ResultVT = MVT::getVT(I.getType());
	    const MVT IdxVT = MVT::getVT(I.getArgOperand(2)->getType());
	    const unsigned EltCount = std::min(ResultVT.getVectorNumElements(),
	                                       IdxVT.getVectorNumElements());

	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.ptrVal = nullptr;
	    Info.memVT = MVT::getVectorVT(ResultVT.getVectorElementType(), EltCount);
	    Info.align = Align(1);
	    Info.flags |= MachineMemOperand::MOLoad;
	    break;
	  }

	  case SCATTER: {
	    // Scatter: the stored data is operand 3 and the index vector operand 2.
	    const MVT StoredVT = MVT::getVT(I.getArgOperand(3)->getType());
	    const MVT IdxVT = MVT::getVT(I.getArgOperand(2)->getType());
	    const unsigned EltCount = std::min(StoredVT.getVectorNumElements(),
	                                       IdxVT.getVectorNumElements());

	    Info.opc = ISD::INTRINSIC_VOID;
	    Info.ptrVal = nullptr;
	    Info.memVT = MVT::getVectorVT(StoredVT.getVectorElementType(), EltCount);
	    Info.align = Align(1);
	    Info.flags |= MachineMemOperand::MOStore;
	    break;
	  }
	  }

	  return true;
	}

	/// Returns true if the target can instruction select the
	/// specified FP immediate natively. If false, the legalizer will
	/// materialize the FP immediate as a load from a constant pool.
	bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
	bool ForCodeSize) const {
	for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
	if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
	return true;
	}
	return false;
	}

	/// Decide whether narrowing \p Load is worthwhile.
	///
	/// \returns false when the load must or should keep its width (a
	/// RIP-relative GOTTPOFF access, or a multi-use 256/512-bit vector load whose
	/// value uses are all extract_subvector feeding a single store); true
	/// otherwise.
	bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
	                                              ISD::LoadExtType ExtTy,
	                                              EVT NewVT) const {
	  const auto *LN = cast<LoadSDNode>(Load);
	  assert(LN->isSimple() && "illegal to narrow");

	  // "ELF Handling for Thread-Local Storage" specifies that the
	  // R_X86_64_GOTTPOFF relocation targets a movq or addq instruction, so such
	  // a load must not shrink.
	  const SDValue Base = LN->getBasePtr();
	  if (Base.getOpcode() == X86ISD::WrapperRIP) {
	    if (const auto *Global = dyn_cast<GlobalAddressSDNode>(Base.getOperand(0)))
	      return Global->getTargetFlags() != X86II::MO_GOTTPOFF;
	  }

	  // Only multi-use 256/512-bit vector loads need the closer look below; for
	  // everything else narrowing is fine.
	  const EVT LoadVT = Load->getValueType(0);
	  const bool IsWideVector = LoadVT.is256BitVector() || LoadVT.is512BitVector();
	  if (!IsWideVector || Load->hasOneUse())
	    return true;

	  // If every use of the loaded value is an extract that goes straight into a
	  // store, the extract + store can be store-folded, so splitting the load is
	  // probably not worth it.
	  for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
	    // Result 0 is the loaded value; other results (the chain) are ignored.
	    if (UI.getUse().getResNo() != 0)
	      continue;

	    const bool IsExtractIntoStore =
	        UI->getOpcode() == ISD::EXTRACT_SUBVECTOR && UI->hasOneUse() &&
	        UI->use_begin()->getOpcode() == ISD::STORE;
	    if (!IsExtractIntoStore)
	      return true;
	  }

	  // All non-chain uses are extract + store.
	  return false;
	}

	/// Returns true if it is beneficial to convert a load of a constant to just
	/// the constant itself.
	///
	/// That is the case for any integer type whose size is between 1 and 64 bits
	/// inclusive; \p Imm itself is not inspected.
	bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
	                                                          Type *Ty) const {
	  assert(Ty->isIntegerTy());

	  const unsigned NumBits = Ty->getPrimitiveSizeInBits();
	  return NumBits != 0 && NumBits <= 64;
	}

	/// Decide whether a select of FP constant loads should be reduced.
	///
	/// If we are using XMM registers in the ABI, the select condition is a
	/// floating-point compare, and we have blendv or conditional move, then it is
	/// cheaper to select than to do a cross-register move and create a load that
	/// depends on the compare result.
	///
	/// \returns false only when \p CmpOpVT is a floating-point type other than
	/// f128 and the subtarget is 64-bit LP64 with AVX.
	bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
	  // Non-FP compares (and f128) always allow the reduction.
	  if (!CmpOpVT.isFloatingPoint() || CmpOpVT == MVT::f128)
	    return true;

	  return !(Subtarget.isTarget64BitLP64() && Subtarget.hasAVX());
	}

	/// Decide whether a select of constants of type \p VT may be turned into
	/// math.
	///
	/// \returns false for vector types on AVX512 subtargets, true otherwise.
	bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
	  // TODO: It might be a win to ease or lift this restriction, but the generic
	  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
	  const bool IsAVX512Vector = VT.isVector() && Subtarget.hasAVX512();
	  return !IsAVX512Vector;
	}

	/// Decide whether a vector multiply by the constant splat \p C should be
	/// decomposed into shift and add/sub operations.
	///
	/// \returns true only if \p C is a constant splat, vector multiply is not
	/// legal for the type \p VT legalizes to, and the splat value has one of the
	/// forms 2^N - 1, 2^N + 1, 1 - 2^N or -(2^N + 1).
	bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
	                                               SDValue C) const {
	  // TODO: We handle scalars using custom code, but generic combining could
	  // make that unnecessary.
	  APInt SplatC;
	  if (!ISD::isConstantSplatVector(C.getNode(), SplatC))
	    return false;

	  // Find the type this will be legalized to. Otherwise we might prematurely
	  // convert this to shl+add/sub and then still have to type legalize those
	  // ops. Another choice would be to defer the decision for illegal types
	  // until after type legalization. But constant splat vectors of i64 can't
	  // make it through type legalization on 32-bit targets so we would need to
	  // special case vXi64.
	  while (getTypeAction(Context, VT) != TypeLegal) {
	    VT = getTypeToTransformTo(Context, VT);
	  }

	  // If vector multiply is legal, assume that's faster than shl + add/sub.
	  // TODO: Multiply is a complex op with higher latency and lower throughput
	  // in most implementations, so this check could be loosened based on type
	  // and/or a CPU attribute.
	  if (isOperationLegal(ISD::MUL, VT))
	    return false;

	  // shl+add, shl+sub
	  if ((SplatC + 1).isPowerOf2() || (SplatC - 1).isPowerOf2())
	    return true;

	  // shl+sub (negated), shl+add+neg
	  return (1 - SplatC).isPowerOf2() || (-(SplatC + 1)).isPowerOf2();
	}

	/// Decide whether extracting a \p ResVT subvector from \p SrcVT starting at
	/// element \p Index is cheap.
	///
	/// \returns false if EXTRACT_SUBVECTOR is neither legal nor custom for
	/// \p ResVT. For non-mask vectors the extract is cheap when \p Index is a
	/// multiple of the result element count.
	bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
	                                                unsigned Index) const {
	  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
	    return false;

	  const unsigned NumResElts = ResVT.getVectorNumElements();

	  // Ordinary vectors: the extract must start on a result-sized boundary.
	  if (ResVT.getVectorElementType() != MVT::i1)
	    return (Index % NumResElts) == 0;

	  // Mask vectors support all subregister combinations and operations that
	  // extract half of vector.
	  if (Index == 0)
	    return true;

	  // NOTE(review): this compares the result size against twice the source
	  // size, although the comment above talks about extracting half of the
	  // source; confirm the operands are the intended way round.
	  return ResVT.getSizeInBits() == SrcVT.getSizeInBits() * 2 &&
	         Index == NumResElts;
	}

	/// Decide whether the vector binary operation \p VecOp should be scalarized.
	///
	/// \returns false for target-specific opcodes; otherwise true if the vector
	/// form is unsupported, or if the scalar form is supported.
	bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
	  const unsigned Opcode = VecOp.getOpcode();

	  // Assume target opcodes can't be scalarized.
	  // TODO - do we have any exceptions?
	  if (Opcode >= ISD::BUILTIN_OP_END)
	    return false;

	  const EVT VT = VecOp.getValueType();

	  // An unsupported vector op is always worth converting to scalar. If the
	  // vector op is supported, the transform is only worthwhile when the scalar
	  // op is supported too.
	  return !isOperationLegalOrCustomOrPromote(Opcode, VT) ||
	         isOperationLegalOrCustomOrPromote(Opcode, VT.getScalarType());
	}

	/// Decide whether an overflow-reporting node should be formed for \p Opcode
	/// on type \p VT.
	///
	/// \returns false for vectors; for scalars, true if \p VT is a simple type
	/// or the operation is not marked Expand. The trailing bool is ignored.
	bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
	                                             bool) const {
	  // TODO: Allow vectors?
	  return !VT.isVector() && (VT.isSimple() || !isOperationExpand(Opcode, VT));
	}

	/// Report whether count-trailing-zeros is cheap enough to speculate.
	///
	/// \returns true only when the subtarget has BMI, so TZCNT can be used
	/// directly.
	bool X86TargetLowering::isCheapToSpeculateCttz() const {
	  const bool CanUseTZCNT = Subtarget.hasBMI();
	  return CanUseTZCNT;
	}

	/// Report whether count-leading-zeros is cheap enough to speculate.
	///
	/// \returns true only when the subtarget has LZCNT, so it can be used
	/// directly.
	bool X86TargetLowering::isCheapToSpeculateCtlz() const {
	  const bool CanUseLZCNT = Subtarget.hasLZCNT();
	  return CanUseLZCNT;
	}

	/// Decide whether folding a bitcast into a load (loading \p BitcastVT
	/// instead of \p LoadVT) is beneficial.
	///
	/// Rejects scalar-to-vXi1 without AVX512 and i8-to-v8i1 without DQI, accepts
	/// any pair of legal vector types, and otherwise defers to the generic
	/// TargetLowering implementation.
	bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
	                                                const SelectionDAG &DAG,
	                                                const MachineMemOperand &MMO) const {
	  const bool LoadIsVector = LoadVT.isVector();
	  const bool CastIsVector = BitcastVT.isVector();

	  // Scalar load reinterpreted as a mask vector, without AVX512.
	  if (!Subtarget.hasAVX512() && !LoadIsVector && CastIsVector &&
	      BitcastVT.getVectorElementType() == MVT::i1)
	    return false;

	  // i8 load reinterpreted as v8i1, without DQI.
	  if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
	    return false;

	  // If both types are legal vectors, it's always ok to convert them.
	  if (LoadIsVector && CastIsVector && isTypeLegal(LoadVT) &&
	      isTypeLegal(BitcastVT))
	    return true;

	  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
	}

	/// Decide whether stores may be merged into a single store of type
	/// \p MemVT.
	///
	/// \returns true if \p MemVT is no wider than the applicable limit: the
	/// native integer width (64 or 32 bits) when the function carries the
	/// NoImplicitFloat attribute, otherwise the subtarget's preferred vector
	/// width. \p AddressSpace is not consulted.
	bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
	                                         const SelectionDAG &DAG) const {
	  const auto &Fn = DAG.getMachineFunction().getFunction();

	  // With the no-implicit-float attribute set, do not merge beyond integer
	  // register size.
	  if (Fn.hasFnAttribute(Attribute::NoImplicitFloat)) {
	    const unsigned IntRegBits = Subtarget.is64Bit() ? 64 : 32;
	    return (MemVT.getSizeInBits() <= IntRegBits);
	  }

	  // Make sure we don't merge greater than our preferred vector width.
	  return !(MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth());
	}

	/// Report whether count-leading-zeros is fast on this subtarget, i.e.
	/// whether the subtarget reports a fast LZCNT.
	bool X86TargetLowering::isCtlzFast() const {
	  const bool HasFastLZCNT = Subtarget.hasFastLZCNT();
	  return HasFastLZCNT;
	}

	/// Report whether folding a mask-and-compare-with-zero is beneficial.
	///
	/// \returns true unconditionally; \p AndI is not inspected.
	bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
	    const Instruction &AndI) const {
	  constexpr bool AlwaysBeneficial = true;
	  return AlwaysBeneficial;
	}

	/// Report whether an and-not style compare is available for \p Y.
	///
	/// \returns true only for a non-constant i32 or i64 scalar on a subtarget
	/// with BMI.
	bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
	  const EVT VT = Y.getValueType();

	  // Scalars only, and only with BMI available.
	  if (VT.isVector() || !Subtarget.hasBMI())
	    return false;

	  // There are only 32-bit and 64-bit forms for 'andn'.
	  const bool HasAndnForm = VT == MVT::i32 || VT == MVT::i64;
	  if (!HasAndnForm)
	    return false;

	  return !isa<ConstantSDNode>(Y);
	}

	/// Report whether an and-not operation is available for \p Y.
	///
	/// Scalars defer to hasAndNotCompare(). Vectors need SSE1 and at least 128
	/// bits; v4i32 is then accepted outright and every other type additionally
	/// needs SSE2.
	bool X86TargetLowering::hasAndNot(SDValue Y) const {
	  const EVT VT = Y.getValueType();

	  // Scalar.
	  if (!VT.isVector())
	    return hasAndNotCompare(Y);

	  // Vector.
	  if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
	    return false;

	  return VT == MVT::v4i32 || Subtarget.hasSSE2();
	}

	/// Report whether a bit-test instruction ('bt') is available for \p X.
	///
	/// \returns true when \p X has a scalar integer type; \p Y is not inspected.
	bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
	  const EVT VT = X.getValueType();
	  return VT.isScalarInteger(); // 'bt'
	}

	bool X86TargetLowering::
	shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
	SDValue X, ConstantSDNode XC, ConstantSDNode CC, SDValue Y,
	unsigned OldShiftOpcode, unsigned NewShiftOpcode,
	SelectionDAG &DAG) const {
	// Does baseline recommend not to perform the fold by default?
	if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
	X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
	return false;
	// For scalars this transform is always beneficial.
	if (X.getValueType().isScalarInteger())
	return true;
	// If all the shift amounts are identical, then transform is beneficial even
	// with rudimentary SSE2 shifts.
	if (DAG.isSplatValue(Y, /AllowUndefs=/true))
	return true;
	// If we have AVX2 with it's powerful shift operations, then it's also good.
	if (Subtarget.hasAVX2())
	return true;
	// Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
	return NewShiftOpcode == ISD::SHL;
	}

	bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
	const SDNode *N, CombineLevel Level) const {
	assert(((N->getOpcode() == ISD::SHL &&
	N->getOperand(0).getOpcode() == ISD::SRL) \|\|
	(N->getOpcode() == ISD::SRL &&
	N->getOperand(0).getOpcode() == ISD::SHL)) &&
	"Expected shift-shift mask");
	EVT VT = N->getValueType(0);
	if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) \|\|
	(Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
	// Only fold if the shift values are equal - so it folds to AND.
	// TODO - we should fold if either is a non-uniform vector but we don't do
	// the fold for non-splats yet.
	return N->getOperand(1) == N->getOperand(0).getOperand(1);
	}
	return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
	}

	/// Decide whether a mask should be folded into a pair of variable shifts.
	///
	/// \returns false for vectors and for i64 on non-64-bit subtargets; true
	/// otherwise.
	bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
	  const EVT VT = Y.getValueType();

	  // For vectors, we don't have a preference, but we probably want a mask.
	  const bool IsVector = VT.isVector();

	  // 64-bit shifts on 32-bit targets produce really bad bloated code.
	  const bool IsI64On32Bit = VT == MVT::i64 && !Subtarget.is64Bit();

	  return !IsVector && !IsI64On32Bit;
	}

	/// Decide whether a shift should be expanded.
	///
	/// \returns false only when the current function is marked minsize and the
	/// target OS is not Windows; true otherwise. \p N is not inspected.
	bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
	                                          SDNode *N) const {
	  const bool OptForMinSize =
	      DAG.getMachineFunction().getFunction().hasMinSize();
	  return !OptForMinSize || Subtarget.isOSWindows();
	}

	/// Decide whether an insert-element with a variable index should be handled
	/// by splatting.
	///
	/// \returns true for any legal type: a legal vector type can be splatted
	/// more efficiently than loading/spilling from memory.
	bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
	  const bool IsLegalVT = isTypeLegal(VT);
	  return IsLegalVT;
	}

	/// Pick a type in which a \p NumBits wide equality comparison can be done
	/// quickly.
	///
	/// \returns the integer type of that width if it is legal; else v16i8 for
	/// 128 bits or v32i8 for 256 bits when those vector types are legal; else
	/// MVT::INVALID_SIMPLE_VALUE_TYPE.
	MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
	  const MVT IntVT = MVT::getIntegerVT(NumBits);
	  if (isTypeLegal(IntVT))
	    return IntVT;

	  switch (NumBits) {
	  case 128:
	    // PMOVMSKB can handle this.
	    if (isTypeLegal(MVT::v16i8))
	      return MVT::v16i8;
	    break;
	  case 256:
	    // VPMOVMSKB can handle this.
	    if (isTypeLegal(MVT::v32i8))
	      return MVT::v32i8;
	    break;
	  default:
	    break;
	  }

	  // TODO: Allow 64-bit type for 32-bit target.
	  // TODO: 512-bit types should be allowed, but make sure that those
	  // cases are handled in combineVectorSizedSetCCEquality().

	  return MVT::INVALID_SIMPLE_VALUE_TYPE;
	}

	/// Return true if \p Val is the undef sentinel or equals \p CmpVal.
	static bool isUndefOrEqual(int Val, int CmpVal) {
	  if (Val == SM_SentinelUndef)
	    return true;
	  return Val == CmpVal;
	}

	/// Return true if \p Val is either the undef or the zero sentinel value.
	static bool isUndefOrZero(int Val) {
	  if (Val == SM_SentinelUndef)
	    return true;
	  return Val == SM_SentinelZero;
	}

	/// Return true if each of the \p Size elements of \p Mask starting at
	/// position \p Pos is the undef sentinel value.
	static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
	  const auto IsUndefSentinel = [](int Elt) { return Elt == SM_SentinelUndef; };
	  const ArrayRef<int> Window = Mask.slice(Pos, Size);
	  return llvm::all_of(Window, IsUndefSentinel);
	}

	/// Return true if the first half of \p Mask is entirely undef, i.e. the mask
	/// creates a vector whose lower half is undefined.
	static bool isUndefLowerHalf(ArrayRef<int> Mask) {
	  const unsigned HalfSize = Mask.size() / 2;
	  return isUndefInRange(Mask, /*Pos=*/0, HalfSize);
	}

	/// Return true if the second half of \p Mask is entirely undef, i.e. the mask
	/// creates a vector whose upper half is undefined.
	static bool isUndefUpperHalf(ArrayRef<int> Mask) {
	  const unsigned HalfSize = Mask.size() / 2;
	  return isUndefInRange(Mask, /*Pos=*/HalfSize, HalfSize);
	}

	/// Return true if \p Val falls within the half-open range [Low, Hi), i.e.
	/// \p Low is included and \p Hi is excluded.
	static bool isInRange(int Val, int Low, int Hi) {
	  if (Val < Low)
	    return false;
	  return Val < Hi;
	}

	/// Return true if at least one element of \p Mask satisfies
	/// isInRange(Elt, Low, Hi).
	static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
	  const auto InRange = [Low, Hi](int Elt) { return isInRange(Elt, Low, Hi); };
	  return llvm::any_of(Mask, InRange);
	}

	/// Return true if at least one element of \p Mask is the zero sentinel
	/// value.
	static bool isAnyZero(ArrayRef<int> Mask) {
	  const auto IsZeroSentinel = [](int Elt) { return Elt == SM_SentinelZero; };
	  return llvm::any_of(Mask, IsZeroSentinel);
	}

	/// Return true if at least one element of \p Mask is the zero sentinel or
	/// the undef sentinel.
	static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
	  const auto IsZeroOrUndefSentinel = [](int Elt) {
	    return Elt == SM_SentinelZero || Elt == SM_SentinelUndef;
	  };
	  return llvm::any_of(Mask, IsZeroOrUndefSentinel);
	}

	/// Return true if \p Val is the undef sentinel or satisfies
	/// isInRange(Val, Low, Hi).
	static bool isUndefOrInRange(int Val, int Low, int Hi) {
	  if (Val == SM_SentinelUndef)
	    return true;
	  return isInRange(Val, Low, Hi);
	}

	/// Return true if every element of \p Mask is the undef sentinel or
	/// satisfies isInRange(Elt, Low, Hi).
	static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
	  const auto ElementOk = [Low, Hi](int Elt) {
	    return isUndefOrInRange(Elt, Low, Hi);
	  };
	  return llvm::all_of(Mask, ElementOk);
	}

	/// Return true if \p Val is the undef or zero sentinel, or satisfies
	/// isInRange(Val, Low, Hi).
	static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
	  if (isUndefOrZero(Val))
	    return true;
	  return isInRange(Val, Low, Hi);
	}

	/// Return true if every element of \p Mask is the undef or zero sentinel, or
	/// satisfies isInRange(Elt, Low, Hi).
	static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
	  const auto ElementOk = [Low, Hi](int Elt) {
	    return isUndefOrZeroOrInRange(Elt, Low, Hi);
	  };
	  return llvm::all_of(Mask, ElementOk);
	}

	/// Return true if the \p Size elements of \p Mask starting at position
	/// \p Pos each either are undef or match the corresponding term of the
	/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step).
	static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
	                                       unsigned Size, int Low, int Step = 1) {
	  int Expected = Low;
	  for (unsigned Offset = 0; Offset != Size; ++Offset, Expected += Step) {
	    if (isUndefOrEqual(Mask[Pos + Offset], Expected))
	      continue;
	    return false;
	  }
	  return true;
	}

	/// Return true if the \p Size elements of \p Mask starting at position
	/// \p Pos each either are undef, are zero, or match the corresponding term
	/// of the sequence (Low, Low + Step, ..., Low + (Size - 1) * Step).
	static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
	                                             unsigned Size, int Low,
	                                             int Step = 1) {
	  int Expected = Low;
	  for (unsigned Offset = 0; Offset != Size; ++Offset, Expected += Step) {
	    const int Elt = Mask[Pos + Offset];
	    if (isUndefOrZero(Elt) || Elt == Expected)
	      continue;
	    return false;
	  }
	  return true;
	}

	/// Return true if each of the \p Size elements of \p Mask starting at
	/// position \p Pos is the undef or the zero sentinel.
	static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
	                                 unsigned Size) {
	  const ArrayRef<int> Window = Mask.slice(Pos, Size);
	  const auto IsUndefOrZeroElt = [](int Elt) { return isUndefOrZero(Elt); };
	  return llvm::all_of(Window, IsUndefOrZeroElt);
	}

	/// Test whether a shuffle mask can be simplified by widening the elements
	/// being shuffled, i.e. by treating each adjacent pair of mask entries as
	/// one entry of twice the width.
	///
	/// On success \p WidenedMask holds Mask.size() / 2 entries describing the
	/// widened shuffle; on failure its contents are unspecified.
	///
	/// NOTE: This must handle normal vector shuffle masks and target vector
	/// shuffle masks. The latter have the special property of a '-2' representing
	/// a zero-ed lane of a vector.
	static bool canWidenShuffleElements(ArrayRef<int> Mask,
	                                    SmallVectorImpl<int> &WidenedMask) {
	  WidenedMask.assign(Mask.size() / 2, 0);

	  for (int Idx = 0, NumElts = Mask.size(); Idx < NumElts; Idx += 2) {
	    const int Lo = Mask[Idx];
	    const int Hi = Mask[Idx + 1];
	    const bool LoUndef = Lo == SM_SentinelUndef;
	    const bool HiUndef = Hi == SM_SentinelUndef;
	    const bool LoZero = Lo == SM_SentinelZero;
	    const bool HiZero = Hi == SM_SentinelZero;
	    int &Wide = WidenedMask[Idx / 2];

	    if (LoUndef && HiUndef) {
	      // Both halves undef: the wide element is undef too.
	      Wide = SM_SentinelUndef;
	    } else if (LoUndef && Hi >= 0 && (Hi % 2) == 1) {
	      // Low half undef and high half is the odd member of an aligned pair:
	      // use the pair the high half belongs to.
	      Wide = Hi / 2;
	    } else if (HiUndef && Lo >= 0 && (Lo % 2) == 0) {
	      // High half undef and low half is the even member of an aligned pair.
	      Wide = Lo / 2;
	    } else if (LoZero || HiZero) {
	      // When zeroing, the zeroing has to cover both halves (undef counts as
	      // coverable) for the wide element to be zero.
	      if (!((LoZero || LoUndef) && (HiZero || HiUndef)))
	        return false;
	      Wide = SM_SentinelZero;
	    } else if (!LoUndef && (Lo % 2) == 0 && (Lo + 1) == Hi) {
	      // The two halves are adjacent and aligned with a pair.
	      Wide = Lo / 2;
	    } else {
	      // Otherwise we can't safely widen the elements used in this shuffle.
	      return false;
	    }
	  }

	  assert(WidenedMask.size() == Mask.size() / 2 &&
	         "Incorrect size of mask after widening the elements!");

	  return true;
	}

	/// Variant of canWidenShuffleElements() that takes zeroable-element
	/// information into account.
	///
	/// When \p V2IsZero is set, every non-undef mask element whose bit is set in
	/// \p Zeroable is treated as a zero sentinel before attempting to widen.
	/// Undef elements are never marked as zero here.
	static bool canWidenShuffleElements(ArrayRef<int> Mask,
	                                    const APInt &Zeroable,
	                                    bool V2IsZero,
	                                    SmallVectorImpl<int> &WidenedMask) {
	  // Work on a copy of the mask so the zeroable elements can be rewritten.
	  SmallVector<int, 64> AdjustedMask(Mask.begin(), Mask.end());

	  if (V2IsZero) {
	    assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
	    for (int Idx = 0, NumElts = Mask.size(); Idx != NumElts; ++Idx) {
	      const bool IsDefinedAndZeroable =
	          Mask[Idx] != SM_SentinelUndef && Zeroable[Idx];
	      if (IsDefinedAndZeroable)
	        AdjustedMask[Idx] = SM_SentinelZero;
	    }
	  }

	  return canWidenShuffleElements(AdjustedMask, WidenedMask);
	}

	/// Convenience overload: report whether \p Mask can be widened, discarding
	/// the widened mask itself.
	static bool canWidenShuffleElements(ArrayRef<int> Mask) {
	  SmallVector<int, 32> Discarded;
	  const bool CanWiden = canWidenShuffleElements(Mask, Discarded);
	  return CanWiden;
	}

	// Attempt to narrow/widen a shuffle mask until it has NumDstElts elements.
	// One of the two element counts must be a multiple of the other. Returns
	// true and fills ScaledMask on success; returns false if a required widening
	// step is not possible.
	static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
	                                 SmallVectorImpl<int> &ScaledMask) {
	  const unsigned NumSrcElts = Mask.size();
	  assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
	         "Illegal shuffle scale factor");

	  // Narrowing is guaranteed to work.
	  if (NumDstElts >= NumSrcElts) {
	    const int Factor = NumDstElts / NumSrcElts;
	    llvm::narrowShuffleMaskElts(Factor, Mask, ScaledMask);
	    return true;
	  }

	  // Widening has to be repeated until the target size is reached. The first
	  // step is done separately because it sets up ScaledMask for the loop.
	  if (!canWidenShuffleElements(Mask, ScaledMask))
	    return false;

	  while (ScaledMask.size() > NumDstElts) {
	    SmallVector<int, 16> Wider;
	    if (!canWidenShuffleElements(ScaledMask, Wider))
	      return false;
	    ScaledMask = std::move(Wider);
	  }

	  return true;
	}

	/// Returns true if \p Elt is a constant zero or a floating point constant
	/// +0.0.
	bool X86::isZeroNode(SDValue Elt) {
	  if (isNullConstant(Elt))
	    return true;
	  return isNullFPConstant(Elt);
	}

	// Build a vector of constants of type VT from Values.
	// When IsMask is set, any negative value becomes an UNDEF element.
	// If i64 is not a legal type and VT has i64 elements, each element is built
	// as two i32 halves (value, then 0) and the result is bitcast back to VT.
	static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
	                              const SDLoc &dl, bool IsMask = false) {
	  const unsigned NumElts = VT.getVectorNumElements();
	  const bool HasLegalI64 = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
	  const bool SplitI64 = !HasLegalI64 && VT.getVectorElementType() == MVT::i64;

	  // Type the build_vector is actually created in.
	  const MVT BuildVT = SplitI64 ? MVT::getVectorVT(MVT::i32, NumElts * 2) : VT;
	  const MVT EltVT = BuildVT.getVectorElementType();

	  SmallVector<SDValue, 32> Ops;
	  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
	    const int Val = Values[Idx];

	    if (IsMask && Val < 0) {
	      // Undef element (both halves when splitting).
	      Ops.append(SplitI64 ? 2 : 1, DAG.getUNDEF(EltVT));
	      continue;
	    }

	    Ops.push_back(DAG.getConstant(Val, dl, EltVT));
	    if (SplitI64)
	      Ops.push_back(DAG.getConstant(0, dl, EltVT));
	  }

	  const SDValue Built = DAG.getBuildVector(BuildVT, dl, Ops);
	  return SplitI64 ? DAG.getBitcast(VT, Built) : Built;
	}

	/// Build a constant vector of type \p VT from per-element bit patterns.
	///
	/// \param Bits one APInt per element, each as wide as a scalar of \p VT.
	/// \param Undefs bitmask with one bit per element; a set bit makes that
	/// element UNDEF.
	///
	/// If i64 is not legal and \p VT has i64 elements, each element is emitted
	/// as its low and high i32 halves. f32/f64 elements are emitted as FP
	/// constants with the given bit pattern. The result is bitcast to \p VT.
	static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
	                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
	  assert(Bits.size() == Undefs.getBitWidth() &&
	         "Unequal constant and undef arrays");

	  const unsigned NumElts = VT.getVectorNumElements();
	  const bool HasLegalI64 = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
	  const bool SplitI64 = !HasLegalI64 && VT.getVectorElementType() == MVT::i64;

	  // Type the build_vector is actually created in.
	  const MVT BuildVT = SplitI64 ? MVT::getVectorVT(MVT::i32, NumElts * 2) : VT;
	  const MVT EltVT = BuildVT.getVectorElementType();

	  SmallVector<SDValue, 32> Ops;
	  for (unsigned Idx = 0, NumBits = Bits.size(); Idx != NumBits; ++Idx) {
	    if (Undefs[Idx]) {
	      Ops.append(SplitI64 ? 2 : 1, DAG.getUNDEF(EltVT));
	      continue;
	    }

	    const APInt &Elt = Bits[Idx];
	    assert(Elt.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");

	    if (SplitI64) {
	      // Low half first, then high half.
	      Ops.push_back(DAG.getConstant(Elt.trunc(32), dl, EltVT));
	      Ops.push_back(DAG.getConstant(Elt.lshr(32).trunc(32), dl, EltVT));
	      continue;
	    }

	    if (EltVT == MVT::f32)
	      Ops.push_back(
	          DAG.getConstantFP(APFloat(APFloat::IEEEsingle(), Elt), dl, EltVT));
	    else if (EltVT == MVT::f64)
	      Ops.push_back(
	          DAG.getConstantFP(APFloat(APFloat::IEEEdouble(), Elt), dl, EltVT));
	    else
	      Ops.push_back(DAG.getConstant(Elt, dl, EltVT));
	  }

	  return DAG.getBitcast(VT, DAG.getBuildVector(BuildVT, dl, Ops));
	}

	/// Returns a vector of the specified type with all zero elements.
	///
	/// SSE/AVX zero vectors are built as <N x i32> bitcast to the destination
	/// type, which ensures they get CSE'd. If the integer type is not available
	/// (128-bit vector without SSE2), a floating-point +0.0 v4f32 is used
	/// instead. FP vectors are built as +0.0 and i1 vectors as integer zero in
	/// their own type.
	static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG, const SDLoc &dl) {
	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
	          VT.getVectorElementType() == MVT::i1) &&
	         "Unexpected vector type");

	  // No SSE2: 128-bit integer vectors are unavailable, use v4f32.
	  if (!Subtarget.hasSSE2() && VT.is128BitVector())
	    return DAG.getBitcast(VT, DAG.getConstantFP(+0.0, dl, MVT::v4f32));

	  if (VT.isFloatingPoint())
	    return DAG.getBitcast(VT, DAG.getConstantFP(+0.0, dl, VT));

	  if (VT.getVectorElementType() == MVT::i1) {
	    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
	           "Unexpected vector type");
	    return DAG.getBitcast(VT, DAG.getConstant(0, dl, VT));
	  }

	  // Integer vectors: canonical <N x i32> zero.
	  const unsigned NumI32Elts = VT.getSizeInBits() / 32;
	  const MVT I32VecVT = MVT::getVectorVT(MVT::i32, NumI32Elts);
	  return DAG.getBitcast(VT, DAG.getConstant(0, dl, I32VecVT));
	}

	/// Extract the \p vectorWidth-bit chunk of \p Vec that contains element
	/// \p IdxVal.
	///
	/// \p IdxVal is rounded down to the first element of its chunk. A
	/// BUILD_VECTOR input is sliced directly into a smaller BUILD_VECTOR;
	/// anything else becomes an EXTRACT_SUBVECTOR node.
	static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
	                                const SDLoc &dl, unsigned vectorWidth) {
	  const EVT SrcVT = Vec.getValueType();
	  const EVT EltVT = SrcVT.getVectorElementType();

	  // The result keeps the element type and has 1/NumChunks of the elements.
	  const unsigned NumChunks = SrcVT.getSizeInBits() / vectorWidth;
	  const EVT ChunkVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
	                                       SrcVT.getVectorNumElements() / NumChunks);

	  const unsigned EltsPerChunk = vectorWidth / EltVT.getSizeInBits();
	  assert(isPowerOf2_32(EltsPerChunk) && "Elements per chunk not power of 2");

	  // Index of the first element of the chunk we want. Since EltsPerChunk is a
	  // power of 2 it is enough to clear the low bits.
	  const unsigned FirstElt = IdxVal & ~(EltsPerChunk - 1);

	  if (Vec.getOpcode() != ISD::BUILD_VECTOR)
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ChunkVT, Vec,
	                       DAG.getIntPtrConstant(FirstElt, dl));

	  // If the input is a buildvector just emit a smaller one.
	  return DAG.getBuildVector(ChunkVT, dl,
	                            Vec->ops().slice(FirstElt, EltsPerChunk));
	}

	/// Generate a DAG to grab 128 bits from a 256-bit or 512-bit vector. This
	/// sets things up to match an AVX VEXTRACTF128 / VEXTRACTI128 or AVX-512
	/// VEXTRACTF32x4 / VEXTRACTI32x4 instruction, or a simple subregister
	/// reference. \p IdxVal is an element index inside the 128 bits we want; it
	/// need not be aligned to a 128-bit boundary, which makes lowering
	/// EXTRACT_VECTOR_ELT operations easier.
	static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
	                                   SelectionDAG &DAG, const SDLoc &dl) {
	  const EVT SrcVT = Vec.getValueType();
	  assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
	         "Unexpected vector size!");
	  (void)SrcVT;
	  return extractSubVector(Vec, IdxVal, DAG, dl, /*vectorWidth=*/128);
	}

	/// Generate a DAG to grab the 256 bits containing element \p IdxVal from a
	/// 512-bit vector.
	static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
	                                   SelectionDAG &DAG, const SDLoc &dl) {
	  const EVT SrcVT = Vec.getValueType();
	  assert(SrcVT.is512BitVector() && "Unexpected vector size!");
	  (void)SrcVT;
	  return extractSubVector(Vec, IdxVal, DAG, dl, /*vectorWidth=*/256);
	}

	/// Insert the \p vectorWidth-bit vector \p Vec into \p Result at the chunk
	/// that contains element \p IdxVal.
	///
	/// \p vectorWidth must be 128 or 256. \p IdxVal is rounded down to the first
	/// element of its chunk. Inserting an undef \p Vec returns \p Result
	/// unchanged.
	static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                               SelectionDAG &DAG, const SDLoc &dl,
	                               unsigned vectorWidth) {
	  assert((vectorWidth == 128 || vectorWidth == 256) &&
	         "Unsupported vector width");

	  // Inserting UNDEF is Result
	  if (Vec.isUndef())
	    return Result;

	  const EVT EltVT = Vec.getValueType().getVectorElementType();
	  const unsigned EltsPerChunk = vectorWidth / EltVT.getSizeInBits();
	  assert(isPowerOf2_32(EltsPerChunk) && "Elements per chunk not power of 2");

	  // Index of the first element of the chunk we want. Since EltsPerChunk is a
	  // power of 2 it is enough to clear the low bits.
	  const unsigned FirstElt = IdxVal & ~(EltsPerChunk - 1);

	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, Result.getValueType(), Result,
	                     Vec, DAG.getIntPtrConstant(FirstElt, dl));
	}

	/// Generate a DAG to put a 128-bit vector into a wider vector. This sets
	/// things up to match an AVX VINSERTF128 / VINSERTI128 or AVX-512
	/// VINSERTF32x4 / VINSERTI32x4 instruction, or a simple superregister
	/// reference. \p IdxVal is an element index inside the 128 bits we want; it
	/// need not be aligned to a 128-bit boundary, which makes lowering
	/// INSERT_VECTOR_ELT operations easier.
	static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                                  SelectionDAG &DAG, const SDLoc &dl) {
	  const EVT SubVT = Vec.getValueType();
	  assert(SubVT.is128BitVector() && "Unexpected vector size!");
	  (void)SubVT;
	  return insertSubVector(Result, Vec, IdxVal, DAG, dl, /*vectorWidth=*/128);
	}

	/// Widen \p Vec to the larger type \p VT, which must have the same scalar
	/// type. \p Vec is placed at element 0; the remaining elements are zero when
	/// \p ZeroNewElements is set and undef otherwise.
	static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
	                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
	                              const SDLoc &dl) {
	  assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
	         Vec.getValueType().getScalarType() == VT.getScalarType() &&
	         "Unsupported vector widening type");

	  // Background vector the original is inserted into.
	  SDValue Base;
	  if (ZeroNewElements)
	    Base = getZeroVector(VT, Subtarget, DAG, dl);
	  else
	    Base = DAG.getUNDEF(VT);

	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Base, Vec,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Widen \p Vec to \p WideSizeInBits bits while keeping its scalar type. The
	/// additional elements are zero when \p ZeroNewElements is set and undef
	/// otherwise. The wide type is derived here and the work is forwarded to the
	/// MVT-taking overload.
	static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
	                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
	                              const SDLoc &dl, unsigned WideSizeInBits) {
	  assert(Vec.getValueSizeInBits() < WideSizeInBits &&
	         (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
	         "Unsupported vector widening type");

	  // The wide vector holds as many scalars of the source type as fit.
	  const unsigned WideEltCount =
	      WideSizeInBits / Vec.getScalarValueSizeInBits();
	  MVT ScalarVT = Vec.getSimpleValueType().getScalarType();
	  MVT WideVT = MVT::getVectorVT(ScalarVT, WideEltCount);
	  return widenSubVector(WideVT, Vec, ZeroNewElements, Subtarget, DAG, dl);
	}

	// Gather the subvectors that \p N concatenates, whether it is an
	// ISD::CONCAT_VECTORS node or one of the recognised ISD::INSERT_SUBVECTOR
	// patterns. On success the subvectors are appended to \p Ops (which must be
	// empty on entry) and true is returned; otherwise \p Ops is left untouched
	// and false is returned.
	// The subvectors in Ops are guaranteed to be the same type.
	static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
	  assert(Ops.empty() && "Expected an empty ops vector");

	  const unsigned Opc = N->getOpcode();

	  // A plain concat: its operands are exactly what we are looking for.
	  if (Opc == ISD::CONCAT_VECTORS) {
	    Ops.append(N->op_begin(), N->op_end());
	    return true;
	  }

	  if (Opc != ISD::INSERT_SUBVECTOR)
	    return false;

	  SDValue Base = N->getOperand(0);
	  SDValue Inserted = N->getOperand(1);
	  const APInt &InsertIdx = N->getConstantOperandAPInt(2);
	  EVT BaseVT = Base.getValueType();
	  EVT InsertedVT = Inserted.getValueType();

	  // TODO - Handle more general insert_subvector chains.
	  // Only an insertion of a half-width vector into the upper half is handled.
	  const bool IsUpperHalfInsert =
	      BaseVT.getSizeInBits() == (InsertedVT.getSizeInBits() * 2) &&
	      InsertIdx == (BaseVT.getVectorNumElements() / 2);
	  if (!IsUpperHalfInsert)
	    return false;

	  // insert_subvector(insert_subvector(undef, x, lo), y, hi)
	  if (Base.getOpcode() == ISD::INSERT_SUBVECTOR &&
	      Base.getOperand(1).getValueType() == InsertedVT &&
	      isNullConstant(Base.getOperand(2))) {
	    Ops.push_back(Base.getOperand(1));
	    Ops.push_back(Inserted);
	    return true;
	  }

	  // insert_subvector(x, extract_subvector(x, lo), hi)
	  if (Inserted.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	      Inserted.getOperand(0) == Base &&
	      isNullConstant(Inserted.getOperand(1))) {
	    Ops.push_back(Inserted);
	    Ops.push_back(Inserted);
	    return true;
	  }

	  return false;
	}

	// Split \p Op into its low and high halves, returned as {Lo, Hi}. Both the
	// element count and the bit width of \p Op must be even.
	static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
	                                               const SDLoc &dl) {
	  EVT OpVT = Op.getValueType();
	  unsigned EltCount = OpVT.getVectorNumElements();
	  unsigned TotalBits = OpVT.getSizeInBits();
	  assert((EltCount % 2) == 0 && (TotalBits % 2) == 0 &&
	         "Can't split odd sized vector");

	  const unsigned HalfBits = TotalBits / 2;
	  const unsigned HalfElts = EltCount / 2;

	  // The low half starts at element 0, the high half at the midpoint.
	  SDValue LoHalf = extractSubVector(Op, 0, DAG, dl, HalfBits);
	  SDValue HiHalf = extractSubVector(Op, HalfElts, DAG, dl, HalfBits);
	  return std::make_pair(LoHalf, HiHalf);
	}

	// Split a unary integer op into 2 half sized ops and concatenate the two
	// results back into a vector of the original result type.
	//
	// Both the operand type and the result type must be 256- or 512-bit
	// vectors with the same number of elements.
	static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
	  EVT VT = Op.getValueType();

	  // Make sure we only try to split 256/512-bit types to avoid creating
	  // narrow vectors.
	  assert((Op.getOperand(0).getValueType().is256BitVector() ||
	          Op.getOperand(0).getValueType().is512BitVector()) &&
	         (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
	  assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
	             VT.getVectorNumElements() &&
	         "Unexpected VTs!");

	  SDLoc dl(Op);

	  // Extract the Lo/Hi vectors
	  SDValue Lo, Hi;
	  std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);

	  // Apply the same opcode to each half using the matching half result type.
	  EVT LoVT, HiVT;
	  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
	                     DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
	                     DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
	}

	/// Break a binary integer operation into 2 half sized ops and then
	/// concatenate the result back.
	///
	/// Both operands must have the result type, which must be a 256- or 512-bit
	/// vector.
	static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
	  EVT VT = Op.getValueType();

	  // Check that all the types match.
	  assert(Op.getOperand(0).getValueType() == VT &&
	         Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
	  assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");

	  SDLoc dl(Op);

	  // Extract the LHS Lo/Hi vectors
	  SDValue LHS1, LHS2;
	  std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);

	  // Extract the RHS Lo/Hi vectors
	  SDValue RHS1, RHS2;
	  std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);

	  // Apply the same opcode to the matching halves, then rejoin them.
	  EVT LoVT, HiVT;
	  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
	                     DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
	                     DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
	}

	// Helper for splitting operands of an operation to legal target size and
	// apply a function on each part.
	// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
	// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
	// deciding if/how to split Ops. Ops elements do not have to be of type VT.
	// The argument Builder is a function that will be applied on each split part:
	// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
	//
	// When CheckBWI is true the 512-bit width is only used if
	// Subtarget.useBWIRegs() holds; otherwise Subtarget.useAVX512Regs() decides.
	// If VT already fits the chosen width, Builder is called once on the
	// unsplit Ops; otherwise the per-part results are concatenated back to VT.
	template <typename F>
	SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
	                         const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
	                         F Builder, bool CheckBWI = true) {
	  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");

	  // Widest vector width the operation may use on this subtarget.
	  unsigned MaxLegalBits = 128;
	  if ((CheckBWI && Subtarget.useBWIRegs()) ||
	      (!CheckBWI && Subtarget.useAVX512Regs()))
	    MaxLegalBits = 512;
	  else if (Subtarget.hasAVX2())
	    MaxLegalBits = 256;

	  // Number of legal-width pieces VT has to be cut into.
	  unsigned NumSubs = 1;
	  if (VT.getSizeInBits() > MaxLegalBits) {
	    NumSubs = VT.getSizeInBits() / MaxLegalBits;
	    assert((VT.getSizeInBits() % MaxLegalBits) == 0 && "Illegal vector size");
	  }

	  if (NumSubs == 1)
	    return Builder(DAG, DL, Ops);

	  SmallVector<SDValue, 4> Subs;
	  for (unsigned i = 0; i != NumSubs; ++i) {
	    // Take the i-th slice of every operand; each operand is sliced according
	    // to its own type, which need not be VT.
	    SmallVector<SDValue, 2> SubOps;
	    for (SDValue Op : Ops) {
	      EVT OpVT = Op.getValueType();
	      unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
	      unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
	      SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
	    }
	    Subs.push_back(Builder(DAG, DL, SubOps));
	  }
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
	}

	/// Insert i1-subvector to i1-vector.
	///
	/// \p Op is the insertion node: operand 0 is the destination vector,
	/// operand 1 the subvector and operand 2 the constant element index. The
	/// insertion is rewritten in terms of X86ISD::KSHIFTL / X86ISD::KSHIFTR,
	/// AND/OR and zero-extending INSERT_SUBVECTOR nodes, working in a widened
	/// mask type when needed and extracting the original type at the end.
	/// \p Op itself is returned when the operation is already legal.
	static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
	                                const X86Subtarget &Subtarget) {

	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  SDValue SubVec = Op.getOperand(1);
	  SDValue Idx = Op.getOperand(2);
	  unsigned IdxVal = Op.getConstantOperandVal(2);

	  // Inserting undef is a nop. We can just return the original vector.
	  if (SubVec.isUndef())
	    return Vec;

	  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
	    return Op;

	  MVT OpVT = Op.getSimpleValueType();
	  unsigned NumElems = OpVT.getVectorNumElements();
	  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);

	  // Extend to natively supported kshift.
	  MVT WideOpVT = OpVT;
	  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
	    WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

	  // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
	  // if necessary.
	  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
	    // May need to promote to a legal type.
	    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                     DAG.getConstant(0, dl, WideOpVT),
	                     SubVec, Idx);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	  }

	  MVT SubVecVT = SubVec.getSimpleValueType();
	  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
	  assert(IdxVal + SubVecNumElems <= NumElems &&
	         IdxVal % SubVecVT.getSizeInBits() == 0 &&
	         "Unexpected index value in INSERT_SUBVECTOR");

	  SDValue Undef = DAG.getUNDEF(WideOpVT);

	  if (IdxVal == 0) {
	    // Zero lower bits of the Vec
	    SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
	                      ZeroIdx);
	    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
	    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
	    // Merge them together, SubVec should be zero extended.
	    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                         DAG.getConstant(0, dl, WideOpVT),
	                         SubVec, ZeroIdx);
	    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	  }

	  // From here on the subvector lives in the low elements of the wide type.
	  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                       Undef, SubVec, ZeroIdx);

	  if (Vec.isUndef()) {
	    assert(IdxVal != 0 && "Unexpected index");
	    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	                         DAG.getTargetConstant(IdxVal, dl, MVT::i8));
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
	  }

	  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
	    assert(IdxVal != 0 && "Unexpected index");
	    NumElems = WideOpVT.getVectorNumElements();
	    // Shift left to push out everything above the subvector, then shift
	    // right to bring it down to its insert position.
	    unsigned ShiftLeft = NumElems - SubVecNumElems;
	    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
	    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	                         DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
	    if (ShiftRight != 0)
	      SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
	                           DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
	  }

	  // Simple case when we put subvector in the upper part
	  if (IdxVal + SubVecNumElems == NumElems) {
	    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	                         DAG.getTargetConstant(IdxVal, dl, MVT::i8));
	    if (SubVecNumElems * 2 == NumElems) {
	      // Special case, use legal zero extending insert_subvector. This allows
	      // isel to optimize when bits are known zero.
	      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
	      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                        DAG.getConstant(0, dl, WideOpVT),
	                        Vec, ZeroIdx);
	    } else {
	      // Otherwise use explicit shifts to zero the bits.
	      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                        Undef, Vec, ZeroIdx);
	      NumElems = WideOpVT.getVectorNumElements();
	      SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
	      Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
	      Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
	    }
	    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	  }

	  // Inserting into the middle is more complicated.

	  NumElems = WideOpVT.getVectorNumElements();

	  // Widen the vector if needed.
	  Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);

	  unsigned ShiftLeft = NumElems - SubVecNumElems;
	  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;

	  // Do an optimization for the most frequently used types.
	  if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
	    // Mask0 has every bit set except those of the insertion window, so the
	    // AND clears the window in Vec before the subvector is ORed in.
	    APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
	    Mask0.flipAllBits();
	    SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
	    SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
	    Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
	    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	                         DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
	    SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
	                         DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
	    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);

	    // Reduce to original width if needed.
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	  }

	  // Clear the upper bits of the subvector and move it to its insert position.
	  SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	                       DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
	  SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
	                       DAG.getTargetConstant(ShiftRight, dl, MVT::i8));

	  // Isolate the bits below the insertion point.
	  unsigned LowShift = NumElems - IdxVal;
	  SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
	                            DAG.getTargetConstant(LowShift, dl, MVT::i8));
	  Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
	                    DAG.getTargetConstant(LowShift, dl, MVT::i8));

	  // Isolate the bits after the last inserted bit.
	  unsigned HighShift = IdxVal + SubVecNumElems;
	  SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
	                             DAG.getTargetConstant(HighShift, dl, MVT::i8));
	  High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
	                     DAG.getTargetConstant(HighShift, dl, MVT::i8));

	  // Now OR all 3 pieces together.
	  Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
	  SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);

	  // Reduce to original width if needed.
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
	}

	/// Concatenate two subvectors of identical type into a vector with twice as
	/// many elements: \p V1 becomes the low half and \p V2 the high half.
	///
	/// The result is built with two insertSubVector calls into an undef vector,
	/// so the subvector width must be one that insertSubVector accepts.
	static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
	                                const SDLoc &dl) {
	  assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
	  EVT SubVT = V1.getValueType();
	  EVT SubSVT = SubVT.getScalarType();
	  unsigned SubNumElts = SubVT.getVectorNumElements();
	  unsigned SubVectorWidth = SubVT.getSizeInBits();
	  // Same scalar type, double the element count.
	  EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
	  SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
	  return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
	}

	/// Returns a vector of specified type with all bits set.
	/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
	/// Then bitcast to their original type, ensuring they get CSE'd.
	/// \p VT must be a 128-, 256- or 512-bit vector type.
	static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
	         "Expected a 128/256/512-bit vector type");

	  APInt Ones = APInt::getAllOnesValue(32);
	  // Number of i32 lanes needed to cover the whole of VT.
	  unsigned NumElts = VT.getSizeInBits() / 32;
	  SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
	  return DAG.getBitcast(VT, Vec);
	}

	// Map an extension opcode onto its *_EXTEND_VECTOR_INREG counterpart. An
	// opcode that already is the *_VECTOR_INREG form maps to itself; any other
	// opcode is a caller error.
	static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
	  if (Opcode == ISD::ANY_EXTEND || Opcode == ISD::ANY_EXTEND_VECTOR_INREG)
	    return ISD::ANY_EXTEND_VECTOR_INREG;

	  if (Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ZERO_EXTEND_VECTOR_INREG)
	    return ISD::ZERO_EXTEND_VECTOR_INREG;

	  if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
	    return ISD::SIGN_EXTEND_VECTOR_INREG;

	  llvm_unreachable("Unknown opcode");
	}

	/// Build an any/sign/zero extension of vector \p In to vector type \p VT.
	///
	/// \p Opcode must be ISD::ANY_EXTEND, ISD::SIGN_EXTEND or ISD::ZERO_EXTEND.
	/// Inputs wider than 128 bits must have the same total size as \p VT; only
	/// their low part is used. If the (possibly narrowed) input has a different
	/// element count than \p VT, the matching *_EXTEND_VECTOR_INREG opcode is
	/// emitted instead of \p Opcode.
	static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
	                              SDValue In, SelectionDAG &DAG) {
	  EVT InVT = In.getValueType();
	  assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
	  assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
	          ISD::ZERO_EXTEND == Opcode) &&
	         "Unknown extension opcode");

	  // For 256-bit vectors, we only need the lower (128-bit) input half.
	  // For 512-bit vectors, we only need the lower input half or quarter.
	  if (InVT.getSizeInBits() > 128) {
	    assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
	           "Expected VTs to be the same size!");
	    // Scale is the widening factor of one element; the input bits actually
	    // consumed are VT's size divided by it, but never less than 128.
	    unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
	    In = extractSubVector(In, 0, DAG, DL,
	                          std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
	    InVT = In.getValueType();
	  }

	  if (VT.getVectorNumElements() != InVT.getVectorNumElements())
	    Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);

	  return DAG.getNode(Opcode, DL, VT, In);
	}

	// Match (xor X, -1) -> X.
	// Match extract_subvector(xor X, -1) -> extract_subvector(X).
	// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
	//
	// Returns the un-negated value on a match and an empty SDValue otherwise.
	// Bitcasts on V are looked through first; when OneUse is set only one-use
	// bitcasts are peeked through. New nodes may be created for the
	// extract_subvector and concat_vectors forms.
	static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) {
	  V = OneUse ? peekThroughOneUseBitcasts(V) : peekThroughBitcasts(V);
	  if (V.getOpcode() == ISD::XOR &&
	      ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
	    return V.getOperand(0);
	  if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	      (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
	    if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
	      // Cast back to the extract's source type before re-extracting.
	      Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
	                         Not, V.getOperand(1));
	    }
	  }
	  SmallVector<SDValue, 2> CatOps;
	  if (collectConcatOps(V.getNode(), CatOps)) {
	    // Every concatenated piece has to be a NOT for the whole to match.
	    for (SDValue &CatOp : CatOps) {
	      SDValue NotCat = IsNOT(CatOp, DAG);
	      if (!NotCat) return SDValue();
	      CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
	    }
	    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
	  }
	  return SDValue();
	}

	// Append to \p Mask (which must be empty) one shuffle index per element of
	// \p VT. Indices are computed per 128-bit lane: consecutive output pairs
	// read the same in-lane source position, \p Lo selects the lower or upper
	// half of each lane, and unless \p Unary is set every odd output element is
	// offset by the element count of \p VT.
	void llvm::createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
	                                   bool Lo, bool Unary) {
	  assert(Mask.empty() && "Expected an empty shuffle mask vector");
	  const int EltCount = VT.getVectorNumElements();
	  const int EltsPerLane = 128 / VT.getScalarSizeInBits();
	  const int HalfLaneOffset = Lo ? 0 : EltsPerLane / 2;

	  for (int Elt = 0; Elt < EltCount; ++Elt) {
	    // First element of the 128-bit lane that Elt belongs to.
	    const int LaneBase = (Elt / EltsPerLane) * EltsPerLane;
	    int SrcIdx = (Elt % EltsPerLane) / 2 + LaneBase;
	    // Odd outputs are offset by the full element count unless unary.
	    if (!Unary)
	      SrcIdx += EltCount * (Elt % 2);
	    SrcIdx += HalfLaneOffset;
	    Mask.push_back(SrcIdx);
	  }
	}

	/// Unary counterpart of unpacklo/unpackhi that ignores the 128-bit lane
	/// limitation imposed by AVX: every source element of the selected half is
	/// repeated twice. Example:
	/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
	/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
	void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
	                                   bool Lo) {
	  assert(Mask.empty() && "Expected an empty shuffle mask vector");
	  const int EltCount = VT.getVectorNumElements();
	  // The high variant starts reading at the middle of the vector.
	  const int FirstSrc = Lo ? 0 : EltCount / 2;
	  for (int Elt = 0; Elt < EltCount; ++Elt)
	    Mask.push_back(Elt / 2 + FirstSrc);
	}

	/// Returns a vector_shuffle node for an unpackl operation.
	/// The mask is the binary (two-input) low-half unpack mask for \p VT.
	static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
	                          SDValue V1, SDValue V2) {
	  SmallVector<int, 8> Mask;
	  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
	  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
	}

	/// Returns a vector_shuffle node for an unpackh operation.
	/// The mask is the binary (two-input) high-half unpack mask for \p VT.
	static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
	                          SDValue V1, SDValue V2) {
	  SmallVector<int, 8> Mask;
	  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
	  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
	}

	/// Shuffle the low element of \p V2 into position \p Idx of a vector that is
	/// otherwise all zero (\p IsZero) or undef. The remaining lanes keep their
	/// identity index into that zero/undef vector, which gives masks such as
	/// 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
	static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
	                                           bool IsZero,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  MVT VT = V2.getSimpleValueType();
	  SDLoc DL(V2);

	  // First shuffle input: the zero or undef background.
	  SDValue Background;
	  if (IsZero)
	    Background = getZeroVector(VT, Subtarget, DAG, DL);
	  else
	    Background = DAG.getUNDEF(VT);

	  // Start from the identity mask over the background ...
	  const int EltCount = VT.getVectorNumElements();
	  SmallVector<int, 16> ShuffleMask(EltCount);
	  for (int Lane = 0; Lane != EltCount; ++Lane)
	    ShuffleMask[Lane] = Lane;

	  // ... and redirect the insertion lane to the low element of V2.
	  if (0 <= Idx && Idx < EltCount)
	    ShuffleMask[Idx] = EltCount;

	  return DAG.getVectorShuffle(VT, DL, Background, V2, ShuffleMask);
	}

	/// Return the IR constant that \p Ptr addresses in the constant pool, or
	/// nullptr if \p Ptr (after stripping an X86ISD::Wrapper / WrapperRIP node)
	/// is not a constant-pool node, is a machine constant-pool entry, or has a
	/// non-zero offset.
	static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
	  if (Ptr.getOpcode() == X86ISD::Wrapper ||
	      Ptr.getOpcode() == X86ISD::WrapperRIP)
	    Ptr = Ptr.getOperand(0);

	  auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
	  if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
	    return nullptr;

	  return CNode->getConstVal();
	}

	/// Return the constant-pool constant read by \p Load, or nullptr when
	/// \p Load is null, is not a normal load, or its base pointer does not
	/// resolve to a constant-pool entry.
	static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
	  if (!Load || !ISD::isNormalLoad(Load))
	    return nullptr;
	  return getTargetConstantFromBasePtr(Load->getBasePtr());
	}

	// SDValue flavour: look through any bitcasts on \p Op and, if what remains
	// is a load, defer to the LoadSDNode overload (which also copes with the
	// null pointer produced when it is not a load).
	static const Constant *getTargetConstantFromNode(SDValue Op) {
	  SDValue Stripped = peekThroughBitcasts(Op);
	  auto *MaybeLoad = dyn_cast<LoadSDNode>(Stripped);
	  return getTargetConstantFromNode(MaybeLoad);
	}

	// Member entry point that exposes the file-local constant-pool lookup for a
	// load node. \p LD must not be null.
	const Constant *
	X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
	  assert(LD && "Unexpected null LoadSDNode");
	  const Constant *PoolCst = getTargetConstantFromNode(LD);
	  return PoolCst;
	}

	// Extract raw constant bits from constant pools.
	//
	// Tries to view Op (after looking through bitcasts) as a constant split
	// into elements of EltSizeInBits bits each. Handled sources are: undef,
	// scalar integer/FP constants, constant build vectors, constant pool loads,
	// X86ISD::VBROADCAST_LOAD, X86ISD::SUBV_BROADCAST, X86ISD::VZEXT_MOVL of a
	// constant scalar, insert_subvector, extract_subvector and vector shuffles.
	//
	// On success returns true, EltBits holds one APInt per element and
	// UndefElts has bit i set when element i is undef. EltBits must be empty on
	// entry. AllowWholeUndefs permits entirely-undef elements;
	// AllowPartialUndefs permits elements with only some undef bits, which are
	// then treated as zero. On failure returns false; the outputs may already
	// have been partially written.
	static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
	                                          APInt &UndefElts,
	                                          SmallVectorImpl<APInt> &EltBits,
	                                          bool AllowWholeUndefs = true,
	                                          bool AllowPartialUndefs = true) {
	  assert(EltBits.empty() && "Expected an empty EltBits vector");

	  Op = peekThroughBitcasts(Op);

	  EVT VT = Op.getValueType();
	  unsigned SizeInBits = VT.getSizeInBits();
	  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
	  unsigned NumElts = SizeInBits / EltSizeInBits;

	  // Bitcast a source array of element bits to the target size.
	  auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
	    unsigned NumSrcElts = UndefSrcElts.getBitWidth();
	    unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
	    assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
	           "Constant bit sizes don't match");

	    // Don't split if we don't allow undef bits.
	    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
	    if (UndefSrcElts.getBoolValue() && !AllowUndefs)
	      return false;

	    // If we're already the right size, don't bother bitcasting.
	    if (NumSrcElts == NumElts) {
	      UndefElts = UndefSrcElts;
	      EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
	      return true;
	    }

	    // Extract all the undef/constant element data and pack into single bitsets.
	    APInt UndefBits(SizeInBits, 0);
	    APInt MaskBits(SizeInBits, 0);

	    for (unsigned i = 0; i != NumSrcElts; ++i) {
	      unsigned BitOffset = i * SrcEltSizeInBits;
	      if (UndefSrcElts[i])
	        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
	      MaskBits.insertBits(SrcEltBits[i], BitOffset);
	    }

	    // Split the undef/constant single bitset data into the target elements.
	    UndefElts = APInt(NumElts, 0);
	    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

	    for (unsigned i = 0; i != NumElts; ++i) {
	      unsigned BitOffset = i * EltSizeInBits;
	      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);

	      // Only treat an element as UNDEF if all bits are UNDEF.
	      if (UndefEltBits.isAllOnesValue()) {
	        if (!AllowWholeUndefs)
	          return false;
	        UndefElts.setBit(i);
	        continue;
	      }

	      // If only some bits are UNDEF then treat them as zero (or bail if not
	      // supported).
	      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
	        return false;

	      EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
	    }
	    return true;
	  };

	  // Collect constant bits and insert into mask/undef bit masks.
	  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
	                                unsigned UndefBitIndex) {
	    if (!Cst)
	      return false;
	    if (isa<UndefValue>(Cst)) {
	      Undefs.setBit(UndefBitIndex);
	      return true;
	    }
	    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
	      Mask = CInt->getValue();
	      return true;
	    }
	    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
	      Mask = CFP->getValueAPF().bitcastToAPInt();
	      return true;
	    }
	    return false;
	  };

	  // Handle UNDEFs.
	  if (Op.isUndef()) {
	    APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
	    SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Extract scalar constant bits.
	  if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
	    APInt UndefSrcElts = APInt::getNullValue(1);
	    SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }
	  if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
	    APInt UndefSrcElts = APInt::getNullValue(1);
	    APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
	    SmallVector<APInt, 64> SrcEltBits(1, RawBits);
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Extract constant bits from build vector.
	  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
	    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
	    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	    APInt UndefSrcElts(NumSrcElts, 0);
	    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
	    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
	      const SDValue &Src = Op.getOperand(i);
	      if (Src.isUndef()) {
	        UndefSrcElts.setBit(i);
	        continue;
	      }
	      auto *Cst = cast<ConstantSDNode>(Src);
	      SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
	    }
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }
	  if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
	    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
	    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	    APInt UndefSrcElts(NumSrcElts, 0);
	    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
	    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
	      const SDValue &Src = Op.getOperand(i);
	      if (Src.isUndef()) {
	        UndefSrcElts.setBit(i);
	        continue;
	      }
	      auto *Cst = cast<ConstantFPSDNode>(Src);
	      APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
	      SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
	    }
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Extract constant bits from constant pool vector.
	  if (auto *Cst = getTargetConstantFromNode(Op)) {
	    Type *CstTy = Cst->getType();
	    unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
	    if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
	      return false;

	    unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
	    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	    APInt UndefSrcElts(NumSrcElts, 0);
	    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
	    for (unsigned i = 0; i != NumSrcElts; ++i)
	      if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
	                               UndefSrcElts, i))
	        return false;

	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Extract constant bits from a broadcasted constant pool scalar.
	  if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
	      EltSizeInBits <= VT.getScalarSizeInBits()) {
	    auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
	    if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
	      return false;

	    SDValue Ptr = MemIntr->getBasePtr();
	    if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
	      unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
	      unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	      APInt UndefSrcElts(NumSrcElts, 0);
	      SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
	      if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
	        // Replicate the single scalar (or its undef-ness) to every element.
	        if (UndefSrcElts[0])
	          UndefSrcElts.setBits(0, NumSrcElts);
	        SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
	        return CastBitData(UndefSrcElts, SrcEltBits);
	      }
	    }
	  }

	  // Extract constant bits from a subvector broadcast.
	  if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
	    SmallVector<APInt, 16> SubEltBits;
	    if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
	                                      UndefElts, SubEltBits, AllowWholeUndefs,
	                                      AllowPartialUndefs)) {
	      UndefElts = APInt::getSplat(NumElts, UndefElts);
	      while (EltBits.size() < NumElts)
	        EltBits.append(SubEltBits.begin(), SubEltBits.end());
	      return true;
	    }
	  }

	  // Extract a rematerialized scalar constant insertion.
	  if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
	      Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
	      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
	    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
	    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	    // Element 0 is the scalar; all remaining elements are zero.
	    APInt UndefSrcElts(NumSrcElts, 0);
	    SmallVector<APInt, 64> SrcEltBits;
	    auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
	    SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
	    SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Insert constant bits from a base and sub vector sources.
	  if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
	    // TODO - support insert_subvector through bitcasts.
	    if (EltSizeInBits != VT.getScalarSizeInBits())
	      return false;

	    APInt UndefSubElts;
	    SmallVector<APInt, 32> EltSubBits;
	    if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
	                                      UndefSubElts, EltSubBits,
	                                      AllowWholeUndefs, AllowPartialUndefs) &&
	        getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
	                                      UndefElts, EltBits, AllowWholeUndefs,
	                                      AllowPartialUndefs)) {
	      // Overlay the subvector's data onto the base at the insertion index.
	      unsigned BaseIdx = Op.getConstantOperandVal(2);
	      UndefElts.insertBits(UndefSubElts, BaseIdx);
	      for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
	        EltBits[BaseIdx + i] = EltSubBits[i];
	      return true;
	    }
	  }

	  // Extract constant bits from a subvector's source.
	  if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
	    // TODO - support extract_subvector through bitcasts.
	    if (EltSizeInBits != VT.getScalarSizeInBits())
	      return false;

	    if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
	                                      UndefElts, EltBits, AllowWholeUndefs,
	                                      AllowPartialUndefs)) {
	      EVT SrcVT = Op.getOperand(0).getValueType();
	      unsigned NumSrcElts = SrcVT.getVectorNumElements();
	      unsigned NumSubElts = VT.getVectorNumElements();
	      unsigned BaseIdx = Op.getConstantOperandVal(1);
	      UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
	      // Trim the tail first so the head erase does not shift its position.
	      if ((BaseIdx + NumSubElts) != NumSrcElts)
	        EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
	      if (BaseIdx != 0)
	        EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
	      return true;
	    }
	  }

	  // Extract constant bits from shuffle node sources.
	  if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
	    // TODO - support shuffle through bitcasts.
	    if (EltSizeInBits != VT.getScalarSizeInBits())
	      return false;

	    ArrayRef<int> Mask = SVN->getMask();
	    if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
	        llvm::any_of(Mask, [](int M) { return M < 0; }))
	      return false;

	    // Only decode the operands that the mask actually references.
	    APInt UndefElts0, UndefElts1;
	    SmallVector<APInt, 32> EltBits0, EltBits1;
	    if (isAnyInRange(Mask, 0, NumElts) &&
	        !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
	                                       UndefElts0, EltBits0, AllowWholeUndefs,
	                                       AllowPartialUndefs))
	      return false;
	    if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
	        !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
	                                       UndefElts1, EltBits1, AllowWholeUndefs,
	                                       AllowPartialUndefs))
	      return false;

	    UndefElts = APInt::getNullValue(NumElts);
	    for (int i = 0; i != (int)NumElts; ++i) {
	      int M = Mask[i];
	      if (M < 0) {
	        UndefElts.setBit(i);
	        EltBits.push_back(APInt::getNullValue(EltSizeInBits));
	      } else if (M < (int)NumElts) {
	        if (UndefElts0[M])
	          UndefElts.setBit(i);
	        EltBits.push_back(EltBits0[M]);
	      } else {
	        if (UndefElts1[M - NumElts])
	          UndefElts.setBit(i);
	        EltBits.push_back(EltBits1[M - NumElts]);
	      }
	    }
	    return true;
	  }

	  return false;
	}

	namespace llvm {
	namespace X86 {
	// Returns true and sets \p SplatVal when every non-undef element of the
	// constant \p Op carries the same bits. Whole-undef elements are always
	// tolerated and skipped; \p AllowPartialUndefs is forwarded to the constant
	// bit extraction. Fails when \p Op is not a recognised constant, when two
	// defined elements differ, or when no element is defined at all.
	bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
	  APInt UndefElts;
	  SmallVector<APInt, 16> EltBits;
	  const bool IsConstant = getTargetConstantBitsFromNode(
	      Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
	      /* AllowWholeUndefs */ true, AllowPartialUndefs);
	  if (!IsConstant)
	    return false;

	  // Index of the most recent defined element; -1 until one is found.
	  int LastDefined = -1;
	  for (int Elt = 0, NumElts = EltBits.size(); Elt != NumElts; ++Elt) {
	    if (UndefElts[Elt])
	      continue;
	    // Any disagreement between defined elements rules out a splat.
	    if (0 <= LastDefined && EltBits[Elt] != EltBits[LastDefined])
	      return false;
	    LastDefined = Elt;
	  }

	  if (LastDefined < 0)
	    return false;

	  SplatVal = EltBits[LastDefined];
	  return true;
	}
	} // namespace X86
	} // namespace llvm

	// Decode the constant \p MaskNode into raw mask indices of
	// \p MaskEltSizeInBits bits each. Whole-undef elements are allowed and
	// reported through \p UndefElts; partially undef elements are rejected.
	// The zero-extended element values are appended to \p RawMask. Returns
	// false when \p MaskNode cannot be decoded as such a constant.
	static bool getTargetShuffleMaskIndices(SDValue MaskNode,
	                                        unsigned MaskEltSizeInBits,
	                                        SmallVectorImpl<uint64_t> &RawMask,
	                                        APInt &UndefElts) {
	  // Pull the raw per-element constant bits out of the node.
	  SmallVector<APInt, 64> MaskEltBits;
	  const bool Decoded = getTargetConstantBitsFromNode(
	      MaskNode, MaskEltSizeInBits, UndefElts, MaskEltBits,
	      /* AllowWholeUndefs */ true,
	      /* AllowPartialUndefs */ false);
	  if (!Decoded)
	    return false;

	  // Hand each element back as a plain 64-bit index.
	  for (const APInt &EltValue : MaskEltBits)
	    RawMask.push_back(EltValue.getZExtValue());

	  return true;
	}

	/// Build the shuffle mask that corresponds to a PACKSS/PACKUS truncation.
	/// Passing NumStages > 1 yields the mask of a multi-stage pack.
	/// \p Unary makes the second-operand indices refer to the first operand.
	/// Note: This ignores saturation, so inputs must be checked first.
	static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
	                                  bool Unary, unsigned NumStages = 1) {
	  assert(Mask.empty() && "Expected an empty shuffle mask vector");
	  const unsigned EltCount = VT.getVectorNumElements();
	  const unsigned LaneCount = VT.getSizeInBits() / 128;
	  const unsigned EltsPerLane = 128 / VT.getScalarSizeInBits();
	  // Index offset that selects the second shuffle operand.
	  const unsigned SecondOpOffset = Unary ? 0 : EltCount;
	  const unsigned RepeatCount = 1u << (NumStages - 1);
	  const unsigned Stride = 1u << NumStages;
	  assert((EltsPerLane >> NumStages) > 0 && "Illegal packing compaction");

	  for (unsigned Lane = 0; Lane != LaneCount; ++Lane) {
	    const unsigned LaneBase = Lane * EltsPerLane;
	    for (unsigned Rep = 0; Rep != RepeatCount; ++Rep) {
	      // Strided elements of this lane from the first operand ...
	      for (unsigned Elt = 0; Elt != EltsPerLane; Elt += Stride)
	        Mask.push_back(Elt + LaneBase);
	      // ... followed by the same positions from the second operand.
	      for (unsigned Elt = 0; Elt != EltsPerLane; Elt += Stride)
	        Mask.push_back(Elt + LaneBase + SecondOpOffset);
	    }
	  }
	}

	// Distribute the demanded result elements of a PACKSS/PACKUS node onto its
	// two operands. Each operand has half as many elements as the result; within
	// every 128-bit lane the low half of the result comes from the LHS and the
	// high half from the RHS.
	static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
	                                APInt &DemandedLHS, APInt &DemandedRHS) {
	  const int LaneCount = VT.getSizeInBits() / 128;
	  const int ResultElts = DemandedElts.getBitWidth();
	  const int OperandElts = ResultElts / 2;
	  const int ResultEltsPerLane = ResultElts / LaneCount;
	  const int OperandEltsPerLane = OperandElts / LaneCount;

	  DemandedLHS = APInt::getNullValue(OperandElts);
	  DemandedRHS = APInt::getNullValue(OperandElts);

	  // Walk each lane and translate result indices to operand indices.
	  for (int Lane = 0; Lane != LaneCount; ++Lane) {
	    const int ResultBase = Lane * ResultEltsPerLane;
	    const int OperandBase = Lane * OperandEltsPerLane;
	    for (int Elt = 0; Elt != OperandEltsPerLane; ++Elt) {
	      const int LoIdx = ResultBase + Elt;
	      const int OpIdx = OperandBase + Elt;
	      if (DemandedElts[LoIdx])
	        DemandedLHS.setBit(OpIdx);
	      if (DemandedElts[LoIdx + OperandEltsPerLane])
	        DemandedRHS.setBit(OpIdx);
	    }
	  }
	}

	// Distribute the demanded result elements of a HADD/HSUB node onto its two
	// operands. Within each 128-bit lane, result element k of the low half is
	// produced from LHS elements (2k, 2k+1) and result element k of the high half
	// from RHS elements (2k, 2k+1), so each demanded result element demands one
	// adjacent pair in a single operand.
	static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
	                                 APInt &DemandedLHS, APInt &DemandedRHS) {
	  const int LaneCount = VT.getSizeInBits() / 128;
	  const int TotalElts = DemandedElts.getBitWidth();
	  const int EltsPerLane = TotalElts / LaneCount;
	  const int HalfLane = EltsPerLane / 2;

	  DemandedLHS = APInt::getNullValue(TotalElts);
	  DemandedRHS = APInt::getNullValue(TotalElts);

	  // Translate every demanded result index into its source pair.
	  for (int ResIdx = 0; ResIdx != TotalElts; ++ResIdx) {
	    if (!DemandedElts[ResIdx])
	      continue;
	    const int LaneBase = (ResIdx / EltsPerLane) * EltsPerLane;
	    const int InLane = ResIdx % EltsPerLane;
	    const bool FromLHS = InLane < HalfLane;
	    APInt &Target = FromLHS ? DemandedLHS : DemandedRHS;
	    const int PairIdx = FromLHS ? InLane : InLane - HalfLane;
	    const int FirstSrc = LaneBase + 2 * PairIdx;
	    Target.setBit(FirstSrc + 0);
	    Target.setBit(FirstSrc + 1);
	  }
	}

	/// Calculates the shuffle mask corresponding to the target-specific opcode.
	/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
	/// operands in \p Ops, and returns true.
	/// Sets \p IsUnary to true if only one source is used. Note that this will set
	/// IsUnary for shuffles which use a single input multiple times, and in those
	/// cases it will adjust the mask to only have indices within that single input.
	/// It is an error to call this with non-empty Mask/Ops vectors.
	///
	/// \p VT supplies the element count and element width used for decoding.
	/// Returns false when a variable mask operand cannot be read as a constant,
	/// when the decoded mask is empty, or when the mask contains zero'd elements
	/// and \p AllowSentinelZero is false. Note that on some failure paths
	/// (VPERMV, VPERMV3) \p Ops has already been appended to before returning
	/// false. An opcode not handled by the switch hits llvm_unreachable.
	static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
	SmallVectorImpl<SDValue> &Ops,
	SmallVectorImpl<int> &Mask, bool &IsUnary) {
	unsigned NumElems = VT.getVectorNumElements();
	unsigned MaskEltSize = VT.getScalarSizeInBits();
	// Scratch storage for variable (non-immediate) shuffle masks.
	SmallVector<uint64_t, 32> RawMask;
	APInt RawUndefs;
	// Immediate control value, read from the node's last operand where used.
	uint64_t ImmN;

	assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
	assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

	IsUnary = false;
	// Set for two-input shuffles whose two inputs are the same node; the mask
	// is remapped into the first input after decoding (see below).
	bool IsFakeUnary = false;
	switch (N->getOpcode()) {
	case X86ISD::BLENDI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
	DecodeBLENDMask(NumElems, ImmN, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::SHUFP:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
	DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::INSERTPS:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
	DecodeINSERTPSMask(ImmN, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::EXTRQI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	// Only decodable when both the length and index operands are constants;
	// otherwise Mask stays empty and the empty-mask check below returns false.
	if (isa<ConstantSDNode>(N->getOperand(1)) &&
	isa<ConstantSDNode>(N->getOperand(2))) {
	int BitLen = N->getConstantOperandVal(1);
	int BitIdx = N->getConstantOperandVal(2);
	DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
	IsUnary = true;
	}
	break;
	case X86ISD::INSERTQI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	// As for EXTRQI: non-constant length/index leaves Mask empty.
	if (isa<ConstantSDNode>(N->getOperand(2)) &&
	isa<ConstantSDNode>(N->getOperand(3))) {
	int BitLen = N->getConstantOperandVal(2);
	int BitIdx = N->getConstantOperandVal(3);
	DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	}
	break;
	case X86ISD::UNPCKH:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::UNPCKL:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::MOVHLPS:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	DecodeMOVHLPSMask(NumElems, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::MOVLHPS:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	DecodeMOVLHPSMask(NumElems, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::VALIGN:
	assert((VT.getScalarType() == MVT::i32 \|\| VT.getScalarType() == MVT::i64) &&
	"Only 32-bit and 64-bit elements are supported!");
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
	DecodeVALIGNMask(NumElems, ImmN, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	// The shuffle inputs are recorded in swapped order (operand 1 first).
	Ops.push_back(N->getOperand(1));
	Ops.push_back(N->getOperand(0));
	break;
	case X86ISD::PALIGNR:
	assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
	DecodePALIGNRMask(NumElems, ImmN, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	// The shuffle inputs are recorded in swapped order (operand 1 first).
	Ops.push_back(N->getOperand(1));
	Ops.push_back(N->getOperand(0));
	break;
	case X86ISD::VSHLDQ:
	assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
	DecodePSLLDQMask(NumElems, ImmN, Mask);
	IsUnary = true;
	break;
	case X86ISD::VSRLDQ:
	assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
	DecodePSRLDQMask(NumElems, ImmN, Mask);
	IsUnary = true;
	break;
	case X86ISD::PSHUFD:
	case X86ISD::VPERMILPI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
	DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
	IsUnary = true;
	break;
	case X86ISD::PSHUFHW:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
	DecodePSHUFHWMask(NumElems, ImmN, Mask);
	IsUnary = true;
	break;
	case X86ISD::PSHUFLW:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
	DecodePSHUFLWMask(NumElems, ImmN, Mask);
	IsUnary = true;
	break;
	case X86ISD::VZEXT_MOVL:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	DecodeZeroMoveLowMask(NumElems, Mask);
	IsUnary = true;
	break;
	case X86ISD::VBROADCAST:
	// We only decode broadcasts of same-sized vectors, peeking through to
	// extracted subvectors is likely to cause hasOneUse issues with
	// SimplifyDemandedBits etc.
	if (N->getOperand(0).getValueType() == VT) {
	DecodeVectorBroadcast(NumElems, Mask);
	IsUnary = true;
	break;
	}
	return false;
	case X86ISD::VPERMILPV: {
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	IsUnary = true;
	// Variable mask in operand 1; fail if it is not a decodable constant.
	SDValue MaskNode = N->getOperand(1);
	if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
	RawUndefs)) {
	DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
	break;
	}
	return false;
	}
	case X86ISD::PSHUFB: {
	assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	IsUnary = true;
	// Variable byte mask in operand 1, always read as 8-bit elements.
	SDValue MaskNode = N->getOperand(1);
	if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
	DecodePSHUFBMask(RawMask, RawUndefs, Mask);
	break;
	}
	return false;
	}
	case X86ISD::VPERMI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
	DecodeVPERMMask(NumElems, ImmN, Mask);
	IsUnary = true;
	break;
	case X86ISD::MOVSS:
	case X86ISD::MOVSD:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	// IsUnary is left false here, so both operands are added to Ops below.
	DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
	break;
	case X86ISD::VPERM2X128:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
	DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::SHUF128:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
	decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::MOVSLDUP:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	DecodeMOVSLDUPMask(NumElems, Mask);
	IsUnary = true;
	break;
	case X86ISD::MOVSHDUP:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	DecodeMOVSHDUPMask(NumElems, Mask);
	IsUnary = true;
	break;
	case X86ISD::MOVDDUP:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	DecodeMOVDDUPMask(NumElems, Mask);
	IsUnary = true;
	break;
	case X86ISD::VPERMIL2: {
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	// Needs both a constant control immediate (operand 3) and a decodable
	// constant mask (operand 2); fail otherwise.
	SDValue MaskNode = N->getOperand(2);
	SDValue CtrlNode = N->getOperand(3);
	if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
	unsigned CtrlImm = CtrlOp->getZExtValue();
	if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
	RawUndefs)) {
	DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
	Mask);
	break;
	}
	}
	return false;
	}
	case X86ISD::VPPERM: {
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	// Variable byte mask in operand 2, always read as 8-bit elements.
	SDValue MaskNode = N->getOperand(2);
	if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
	DecodeVPPERMMask(RawMask, RawUndefs, Mask);
	break;
	}
	return false;
	}
	case X86ISD::VPERMV: {
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	IsUnary = true;
	// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
	Ops.push_back(N->getOperand(1));
	SDValue MaskNode = N->getOperand(0);
	if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
	RawUndefs)) {
	DecodeVPERMVMask(RawMask, RawUndefs, Mask);
	break;
	}
	return false;
	}
	case X86ISD::VPERMV3: {
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
	// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
	Ops.push_back(N->getOperand(0));
	Ops.push_back(N->getOperand(2));
	SDValue MaskNode = N->getOperand(1);
	if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
	RawUndefs)) {
	DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
	break;
	}
	return false;
	}
	default: llvm_unreachable("unknown target shuffle node");
	}

	// Empty mask indicates the decode failed.
	if (Mask.empty())
	return false;

	// Check if we're getting a shuffle mask with zero'd elements.
	if (!AllowSentinelZero && isAnyZero(Mask))
	return false;

	// If we have a fake unary shuffle, the shuffle mask is spread across two
	// inputs that are actually the same node. Re-map the mask to always point
	// into the first input.
	if (IsFakeUnary)
	for (int &M : Mask)
	if (M >= (int)Mask.size())
	M -= Mask.size();

	// If we didn't already add operands in the opcode-specific code, default to
	// adding 1 or 2 operands starting at 0.
	// A fake unary shuffle still reports both (identical) operands.
	if (Ops.empty()) {
	Ops.push_back(N->getOperand(0));
	if (!IsUnary \|\| IsFakeUnary)
	Ops.push_back(N->getOperand(1));
	}

	return true;
	}

	/// Determine, per shuffle mask element, whether it may be lowered as undef
	/// or as zero.
	///
	/// A "zeroable" element is one that can be materialised as zero: the mask
	/// entry itself is undef, the referenced input element is undef, or the
	/// referenced input element is known to be zero. Many x86 shuffles can zero
	/// lanes cheaply, so finding as many such elements as possible simplifies
	/// the remaining shuffle.
	///
	/// \p KnownUndef and \p KnownZero are reset to Mask.size() bits and then
	/// populated. Inputs are inspected through bitcasts; beyond all-zero build
	/// vectors only BUILD_VECTOR inputs are analysed element by element.
	static void computeZeroableShuffleElements(ArrayRef<int> Mask,
	                                           SDValue V1, SDValue V2,
	                                           APInt &KnownUndef, APInt &KnownZero) {
	  const int NumMaskElts = Mask.size();
	  KnownZero = APInt::getNullValue(NumMaskElts);
	  KnownUndef = KnownZero;

	  V1 = peekThroughBitcasts(V1);
	  V2 = peekThroughBitcasts(V2);

	  const bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
	  const bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());

	  const int VecBits = V1.getValueSizeInBits();
	  const int MaskEltBits = VecBits / NumMaskElts;
	  assert(!(VecBits % MaskEltBits) && "Illegal shuffle mask size");

	  for (int i = 0; i < NumMaskElts; ++i) {
	    const int RawIdx = Mask[i];

	    // Negative mask entries are treated as undef.
	    if (RawIdx < 0) {
	      KnownUndef.setBit(i);
	      continue;
	    }

	    // An all-zeros input makes every element taken from it zero.
	    const bool FromV1 = RawIdx < NumMaskElts;
	    if (FromV1 ? IsZeroV1 : IsZeroV2) {
	      KnownZero.setBit(i);
	      continue;
	    }

	    // Pick the referenced input and the element index within it.
	    SDValue Src = FromV1 ? V1 : V2;
	    const int SrcElt = RawIdx % NumMaskElts;

	    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
	    if (Src.getOpcode() != ISD::BUILD_VECTOR)
	      continue;

	    const unsigned NumSrcOps = Src.getNumOperands();

	    // If the BUILD_VECTOR has fewer elements than the mask, the bitcasted
	    // portion of the (larger) source element must be UNDEF/ZERO.
	    if ((NumMaskElts % NumSrcOps) == 0) {
	      const int Scale = NumMaskElts / NumSrcOps;
	      SDValue Op = Src.getOperand(SrcElt / Scale);
	      const unsigned BitOffset = (SrcElt % Scale) * MaskEltBits;
	      // Test whether the slice of a wider constant covered by this mask
	      // element is zero.
	      auto IsSliceZero = [&](const APInt &Bits) {
	        return Bits.extractBits(MaskEltBits, BitOffset) == 0;
	      };

	      if (Op.isUndef())
	        KnownUndef.setBit(i);
	      if (X86::isZeroNode(Op)) {
	        KnownZero.setBit(i);
	      } else if (auto *IntCst = dyn_cast<ConstantSDNode>(Op)) {
	        if (IsSliceZero(IntCst->getAPIntValue()))
	          KnownZero.setBit(i);
	      } else if (auto *FPCst = dyn_cast<ConstantFPSDNode>(Op)) {
	        if (IsSliceZero(FPCst->getValueAPF().bitcastToAPInt()))
	          KnownZero.setBit(i);
	      }
	      continue;
	    }

	    // If the BUILD_VECTOR has more elements than the mask, all the (smaller)
	    // source elements covered by this mask element must be UNDEF or ZERO.
	    if ((NumSrcOps % NumMaskElts) == 0) {
	      const int Scale = NumSrcOps / NumMaskElts;
	      const int FirstOp = SrcElt * Scale;
	      bool EveryUndef = true;
	      bool EveryZero = true;
	      for (int Sub = 0; Sub < Scale; ++Sub) {
	        SDValue Op = Src.getOperand(FirstOp + Sub);
	        EveryUndef &= Op.isUndef();
	        EveryZero &= X86::isZeroNode(Op);
	      }
	      if (EveryUndef)
	        KnownUndef.setBit(i);
	      if (EveryZero)
	        KnownZero.setBit(i);
	    }
	  }
	}

	/// Decode a target shuffle mask and inputs and see if any values are
	/// known to be undef or zero from their inputs.
	/// Returns true if the target shuffle mask was decoded.
	/// FIXME: Merge this with computeZeroableShuffleElements?
	///
	/// \p Mask and \p Ops are filled by getTargetShuffleMask (zero sentinels
	/// allowed). \p KnownUndef and \p KnownZero are reset to Mask.size() bits and
	/// get one bit set per mask element proven undef / zero. Returns false,
	/// without touching the known-bits outputs, if \p N is not a target shuffle
	/// or its mask cannot be decoded.
	static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
	SmallVectorImpl<SDValue> &Ops,
	APInt &KnownUndef, APInt &KnownZero) {
	bool IsUnary;
	if (!isTargetShuffle(N.getOpcode()))
	return false;

	MVT VT = N.getSimpleValueType();
	if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
	return false;

	int Size = Mask.size();
	SDValue V1 = Ops[0];
	// For a unary shuffle the second input is treated as the first one again.
	SDValue V2 = IsUnary ? V1 : Ops[1];
	KnownUndef = KnownZero = APInt::getNullValue(Size);

	V1 = peekThroughBitcasts(V1);
	V2 = peekThroughBitcasts(V2);

	assert((VT.getSizeInBits() % Size) == 0 &&
	"Illegal split of shuffle value type");
	unsigned EltSizeInBits = VT.getSizeInBits() / Size;

	// Extract known constant input data.
	// Whole-element undefs are accepted (tracked in UndefSrcElts), partially
	// undefined elements are not.
	APInt UndefSrcElts[2];
	SmallVector<APInt, 32> SrcEltBits[2];
	bool IsSrcConstant[2] = {
	getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
	SrcEltBits[0], true, false),
	getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
	SrcEltBits[1], true, false)};

	for (int i = 0; i < Size; ++i) {
	int M = Mask[i];

	// Already decoded as SM_SentinelZero / SM_SentinelUndef.
	if (M < 0) {
	assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
	if (SM_SentinelUndef == M)
	KnownUndef.setBit(i);
	if (SM_SentinelZero == M)
	KnownZero.setBit(i);
	continue;
	}

	// Determine shuffle input and normalize the mask.
	// SrcIdx selects the matching entry of the per-input constant arrays.
	unsigned SrcIdx = M / Size;
	SDValue V = M < Size ? V1 : V2;
	M %= Size;

	// We are referencing an UNDEF input.
	if (V.isUndef()) {
	KnownUndef.setBit(i);
	continue;
	}

	// SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
	// TODO: We currently only set UNDEF for integer types - floats use the same
	// registers as vectors and many of the scalar folded loads rely on the
	// SCALAR_TO_VECTOR pattern.
	if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	(Size % V.getValueType().getVectorNumElements()) == 0) {
	// Scale maps a mask element index onto the (possibly wider) source element.
	int Scale = Size / V.getValueType().getVectorNumElements();
	int Idx = M / Scale;
	if (Idx != 0 && !VT.isFloatingPoint())
	KnownUndef.setBit(i);
	else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
	KnownZero.setBit(i);
	continue;
	}

	// INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
	// base vectors.
	if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
	SDValue Vec = V.getOperand(0);
	int NumVecElts = Vec.getValueType().getVectorNumElements();
	if (Vec.isUndef() && Size == NumVecElts) {
	int Idx = V.getConstantOperandVal(2);
	int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
	// Elements outside the inserted range come from the undef base vector.
	if (M < Idx \|\| (Idx + NumSubElts) <= M)
	KnownUndef.setBit(i);
	}
	continue;
	}

	// Attempt to extract from the source's constant bits.
	if (IsSrcConstant[SrcIdx]) {
	if (UndefSrcElts[SrcIdx][M])
	KnownUndef.setBit(i);
	else if (SrcEltBits[SrcIdx][M] == 0)
	KnownZero.setBit(i);
	}
	}

	assert(VT.getVectorNumElements() == (unsigned)Size &&
	"Different mask size from vector size!");
	return true;
	}

	// Overwrite shuffle mask entries with sentinel values wherever the element
	// is known to be undef or zero. Undef takes priority over zero; zero
	// sentinels are only written when ResolveKnownZeros is set. Both bitmasks
	// must have exactly one bit per mask element.
	static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
	                                              const APInt &KnownUndef,
	                                              const APInt &KnownZero,
	                                              bool ResolveKnownZeros = true) {
	  const unsigned MaskLen = Mask.size();
	  assert(KnownUndef.getBitWidth() == MaskLen &&
	         KnownZero.getBitWidth() == MaskLen && "Shuffle mask size mismatch");

	  for (unsigned Idx = 0; Idx != MaskLen; ++Idx) {
	    if (KnownUndef[Idx]) {
	      Mask[Idx] = SM_SentinelUndef;
	      continue;
	    }
	    if (ResolveKnownZeros && KnownZero[Idx])
	      Mask[Idx] = SM_SentinelZero;
	  }
	}

	// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
	static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
	APInt &KnownUndef,
	APInt &KnownZero) {
	unsigned NumElts = Mask.size();
	KnownUndef = KnownZero = APInt::getNullValue(NumElts);

	for (unsigned i = 0; i != NumElts; ++i) {
	int M = Mask[i];
	if (SM_SentinelUndef == M)
	KnownUndef.setBit(i);
	if (SM_SentinelZero == M)
	KnownZero.setBit(i);
	}
	}

	// Forward declaration (for getFauxShuffleMask recursive check).
	// getFauxShuffleMask calls this before its definition appears, passing
	// Depth + 1, and treats a false return as "could not decode".
	// NOTE(review): the exact meaning of Inputs/Mask/ResolveKnownElts is set by
	// the definition, which is not visible here - confirm against it.
	// TODO: Use DemandedElts variant.
	static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
	SmallVectorImpl<int> &Mask,
	const SelectionDAG &DAG, unsigned Depth,
	bool ResolveKnownElts);

	// Attempt to decode ops that could be represented as a shuffle mask.
	// The decoded shuffle mask may contain a different number of elements to the
	// destination value type.
	static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
	SmallVectorImpl<int> &Mask,
	SmallVectorImpl<SDValue> &Ops,
	const SelectionDAG &DAG, unsigned Depth,
	bool ResolveKnownElts) {
	Mask.clear();
	Ops.clear();

	MVT VT = N.getSimpleValueType();
	unsigned NumElts = VT.getVectorNumElements();
	unsigned NumSizeInBits = VT.getSizeInBits();
	unsigned NumBitsPerElt = VT.getScalarSizeInBits();
	if ((NumBitsPerElt % 8) != 0 \|\| (NumSizeInBits % 8) != 0)
	return false;
	assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
	unsigned NumSizeInBytes = NumSizeInBits / 8;
	unsigned NumBytesPerElt = NumBitsPerElt / 8;

	unsigned Opcode = N.getOpcode();
	switch (Opcode) {
	case ISD::VECTOR_SHUFFLE: {
	// Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
	ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
	if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
	Mask.append(ShuffleMask.begin(), ShuffleMask.end());
	Ops.push_back(N.getOperand(0));
	Ops.push_back(N.getOperand(1));
	return true;
	}
	return false;
	}
	case ISD::AND:
	case X86ISD::ANDNP: {
	// Attempt to decode as a per-byte mask.
	APInt UndefElts;
	SmallVector<APInt, 32> EltBits;
	SDValue N0 = N.getOperand(0);
	SDValue N1 = N.getOperand(1);
	bool IsAndN = (X86ISD::ANDNP == Opcode);
	uint64_t ZeroMask = IsAndN ? 255 : 0;
	if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
	return false;
	for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
	if (UndefElts[i]) {
	Mask.push_back(SM_SentinelUndef);
	continue;
	}
	const APInt &ByteBits = EltBits[i];
	if (ByteBits != 0 && ByteBits != 255)
	return false;
	Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
	}
	Ops.push_back(IsAndN ? N1 : N0);
	return true;
	}
	case ISD::OR: {
	// Inspect each operand at the byte level. We can merge these into a
	// blend shuffle mask if for each byte at least one is masked out (zero).
	KnownBits Known0 =
	DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1);
	KnownBits Known1 =
	DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
	if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
	bool IsByteMask = true;
	APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
	APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
	for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
	unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
	unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
	if (LHS == 255 && RHS == 0)
	SelectMask.setBit(i);
	else if (LHS == 255 && RHS == 255)
	ZeroMask.setBit(i);
	else if (!(LHS == 0 && RHS == 255))
	IsByteMask = false;
	}
	if (IsByteMask) {
	for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
	for (unsigned j = 0; j != NumBytesPerElt; ++j) {
	unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
	int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
	Mask.push_back(Idx);
	}
	}
	Ops.push_back(N.getOperand(0));
	Ops.push_back(N.getOperand(1));
	return true;
	}
	}

	// Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
	// is a valid shuffle index.
	SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
	SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
	if (!N0.getValueType().isVector() \|\| !N1.getValueType().isVector())
	return false;
	SmallVector<int, 64> SrcMask0, SrcMask1;
	SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
	if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
	true) \|\|
	!getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
	true))
	return false;

	// Shuffle inputs must be the same size as the result.
	if (llvm::any_of(SrcInputs0, [VT](SDValue Op) {
	return VT.getSizeInBits() != Op.getValueSizeInBits();
	}))
	return false;
	if (llvm::any_of(SrcInputs1, [VT](SDValue Op) {
	return VT.getSizeInBits() != Op.getValueSizeInBits();
	}))
	return false;

	size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
	SmallVector<int, 64> Mask0, Mask1;
	narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
	narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
	for (size_t i = 0; i != MaskSize; ++i) {
	if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
	Mask.push_back(SM_SentinelUndef);
	else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
	Mask.push_back(SM_SentinelZero);
	else if (Mask1[i] == SM_SentinelZero)
	Mask.push_back(Mask0[i]);
	else if (Mask0[i] == SM_SentinelZero)
	Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
	else
	return false;
	}
	Ops.append(SrcInputs0.begin(), SrcInputs0.end());
	Ops.append(SrcInputs1.begin(), SrcInputs1.end());
	return true;
	}
	case ISD::INSERT_SUBVECTOR: {
	SDValue Src = N.getOperand(0);
	SDValue Sub = N.getOperand(1);
	EVT SubVT = Sub.getValueType();
	unsigned NumSubElts = SubVT.getVectorNumElements();
	if (!N->isOnlyUserOf(Sub.getNode()))
	return false;
	uint64_t InsertIdx = N.getConstantOperandVal(2);
	// Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
	if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	Sub.getOperand(0).getValueType() == VT) {
	uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
	for (int i = 0; i != (int)NumElts; ++i)
	Mask.push_back(i);
	for (int i = 0; i != (int)NumSubElts; ++i)
	Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
	Ops.push_back(Src);
	Ops.push_back(Sub.getOperand(0));
	return true;
	}
	// Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
	SmallVector<int, 64> SubMask;
	SmallVector<SDValue, 2> SubInputs;
	if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
	SubMask, DAG, Depth + 1, ResolveKnownElts))
	return false;

	// Subvector shuffle inputs must not be larger than the subvector.
	if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
	return SubVT.getSizeInBits() < SubInput.getValueSizeInBits();
	}))
	return false;

	if (SubMask.size() != NumSubElts) {
	assert(((SubMask.size() % NumSubElts) == 0 \|\|
	(NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
	if ((NumSubElts % SubMask.size()) == 0) {
	int Scale = NumSubElts / SubMask.size();
	SmallVector<int,64> ScaledSubMask;
	narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
	SubMask = ScaledSubMask;
	} else {
	int Scale = SubMask.size() / NumSubElts;
	NumSubElts = SubMask.size();
	NumElts *= Scale;
	InsertIdx *= Scale;
	}
	}
	Ops.push_back(Src);
	Ops.append(SubInputs.begin(), SubInputs.end());
	for (int i = 0; i != (int)NumElts; ++i)
	Mask.push_back(i);
	for (int i = 0; i != (int)NumSubElts; ++i) {
	int M = SubMask[i];
	if (0 <= M) {
	int InputIdx = M / NumSubElts;
	M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
	}
	Mask[i + InsertIdx] = M;
	}
	return true;
	}
	case X86ISD::PINSRB:
	case X86ISD::PINSRW:
	case ISD::SCALAR_TO_VECTOR:
	case ISD::INSERT_VECTOR_ELT: {
	// Match against a insert_vector_elt/scalar_to_vector of an extract from a
	// vector, for matching src/dst vector types.
	SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);

	unsigned DstIdx = 0;
	if (Opcode != ISD::SCALAR_TO_VECTOR) {
	// Check we have an in-range constant insertion index.
	if (!isa<ConstantSDNode>(N.getOperand(2)) \|\|
	N.getConstantOperandAPInt(2).uge(NumElts))
	return false;
	DstIdx = N.getConstantOperandVal(2);

	// Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
	if (X86::isZeroNode(Scl)) {
	Ops.push_back(N.getOperand(0));
	for (unsigned i = 0; i != NumElts; ++i)
	Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
	return true;
	}
	}

	// Peek through trunc/aext/zext.
	// TODO: aext shouldn't require SM_SentinelZero padding.
	// TODO: handle shift of scalars.
	unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
	while (Scl.getOpcode() == ISD::TRUNCATE \|\|
	Scl.getOpcode() == ISD::ANY_EXTEND \|\|
	Scl.getOpcode() == ISD::ZERO_EXTEND) {
	Scl = Scl.getOperand(0);
	MinBitsPerElt =
	std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
	}
	if ((MinBitsPerElt % 8) != 0)
	return false;

	// Attempt to find the source vector the scalar was extracted from.
	SDValue SrcExtract;
	if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT \|\|
	Scl.getOpcode() == X86ISD::PEXTRW \|\|
	Scl.getOpcode() == X86ISD::PEXTRB) &&
	Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
	SrcExtract = Scl;
	}
	if (!SrcExtract \|\| !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
	return false;

	SDValue SrcVec = SrcExtract.getOperand(0);
	EVT SrcVT = SrcVec.getValueType();
	if (!SrcVT.getScalarType().isByteSized())
	return false;
	unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
	unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
	unsigned DstByte = DstIdx * NumBytesPerElt;
	MinBitsPerElt =
	std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());

	// Create 'identity' byte level shuffle mask and then add inserted bytes.
	if (Opcode == ISD::SCALAR_TO_VECTOR) {
	Ops.push_back(SrcVec);
	Mask.append(NumSizeInBytes, SM_SentinelUndef);
	} else {
	Ops.push_back(SrcVec);
	Ops.push_back(N.getOperand(0));
	for (int i = 0; i != (int)NumSizeInBytes; ++i)
	Mask.push_back(NumSizeInBytes + i);
	}

	unsigned MinBytesPerElts = MinBitsPerElt / 8;
	MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
	for (unsigned i = 0; i != MinBytesPerElts; ++i)
	Mask[DstByte + i] = SrcByte + i;
	for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
	Mask[DstByte + i] = SM_SentinelZero;
	return true;
	}
	case X86ISD::PACKSS:
	case X86ISD::PACKUS: {
	SDValue N0 = N.getOperand(0);
	SDValue N1 = N.getOperand(1);
	assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
	N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
	"Unexpected input value type");

	APInt EltsLHS, EltsRHS;
	getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);

	// If we know input saturation won't happen we can treat this
	// as a truncation shuffle.
	if (Opcode == X86ISD::PACKSS) {
	if ((!N0.isUndef() &&
	DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) \|\|
	(!N1.isUndef() &&
	DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
	return false;
	} else {
	APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
	if ((!N0.isUndef() &&
	!DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) \|\|
	(!N1.isUndef() &&
	!DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
	return false;
	}

	bool IsUnary = (N0 == N1);

	Ops.push_back(N0);
	if (!IsUnary)
	Ops.push_back(N1);

	createPackShuffleMask(VT, Mask, IsUnary);
	return true;
	}
	case X86ISD::VTRUNC: {
	SDValue Src = N.getOperand(0);
	EVT SrcVT = Src.getValueType();
	// Truncated source must be a simple vector.
	if (!SrcVT.isSimple() \|\| (SrcVT.getSizeInBits() % 128) != 0 \|\|
	(SrcVT.getScalarSizeInBits() % 8) != 0)
	return false;
	unsigned NumSrcElts = SrcVT.getVectorNumElements();
	unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
	unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
	assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
	for (unsigned i = 0; i != NumSrcElts; ++i)
	Mask.push_back(i * Scale);
	Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
	Ops.push_back(Src);
	return true;
	}
	case X86ISD::VSHLI:
	case X86ISD::VSRLI: {
	uint64_t ShiftVal = N.getConstantOperandVal(1);
	// Out of range bit shifts are guaranteed to be zero.
	if (NumBitsPerElt <= ShiftVal) {
	Mask.append(NumElts, SM_SentinelZero);
	return true;
	}

	// We can only decode 'whole byte' bit shifts as shuffles.
	if ((ShiftVal % 8) != 0)
	break;

	uint64_t ByteShift = ShiftVal / 8;
	Ops.push_back(N.getOperand(0));

	// Clear mask to all zeros and insert the shifted byte indices.
	Mask.append(NumSizeInBytes, SM_SentinelZero);

	if (X86ISD::VSHLI == Opcode) {
	for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
	for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
	Mask[i + j] = i + j - ByteShift;
	} else {
	for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
	for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
	Mask[i + j - ByteShift] = i + j;
	}
	return true;
	}
	case X86ISD::VROTLI:
	case X86ISD::VROTRI: {
	// We can only decode 'whole byte' bit rotates as shuffles.
	uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
	if ((RotateVal % 8) != 0)
	return false;
	Ops.push_back(N.getOperand(0));
	int Offset = RotateVal / 8;
	Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
	for (int i = 0; i != (int)NumElts; ++i) {
	int BaseIdx = i * NumBytesPerElt;
	for (int j = 0; j != (int)NumBytesPerElt; ++j) {
	Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
	}
	}
	return true;
	}
	case X86ISD::VBROADCAST: {
	SDValue Src = N.getOperand(0);
	if (!Src.getSimpleValueType().isVector())
	return false;
	Ops.push_back(Src);
	Mask.append(NumElts, 0);
	return true;
	}
	case ISD::ZERO_EXTEND:
	case ISD::ANY_EXTEND:
	case ISD::ZERO_EXTEND_VECTOR_INREG:
	case ISD::ANY_EXTEND_VECTOR_INREG: {
	SDValue Src = N.getOperand(0);
	EVT SrcVT = Src.getValueType();

	// Extended source must be a simple vector.
	if (!SrcVT.isSimple() \|\| (SrcVT.getSizeInBits() % 128) != 0 \|\|
	(SrcVT.getScalarSizeInBits() % 8) != 0)
	return false;

	bool IsAnyExtend =
	(ISD::ANY_EXTEND == Opcode \|\| ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
	DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
	IsAnyExtend, Mask);
	Ops.push_back(Src);
	return true;
	}
	}

	return false;
	}

	/// Compacts the list of shuffle inputs: inputs that are UNDEF or never
	/// referenced by the mask are dropped, and inputs that occur more than once are
	/// merged into their first occurrence. Mask indices are renumbered in place so
	/// that they keep addressing the right element of the compacted input list.
	static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
	                                              SmallVectorImpl<int> &Mask) {
	  const int Width = Mask.size();
	  SmallVector<SDValue, 16> Kept;

	  for (const SDValue &Input : Inputs) {
	    // Index range the current input occupies, given the inputs kept so far.
	    const int RangeBegin = Kept.size() * Width;
	    const int RangeEnd = RangeBegin + Width;
	    auto InRange = [RangeBegin, RangeEnd](int Idx) {
	      return RangeBegin <= Idx && Idx < RangeEnd;
	    };

	    // References to an UNDEF input become undef mask elements.
	    if (Input.isUndef()) {
	      for (int &M : Mask) {
	        if (InRange(M))
	          M = SM_SentinelUndef;
	      }
	    }

	    // Nothing refers to this input: drop it and slide every later index down
	    // by one input width.
	    if (none_of(Mask, InRange)) {
	      for (int &M : Mask) {
	        if (M >= RangeBegin)
	          M -= Width;
	      }
	      continue;
	    }

	    // Look for an identical input that has already been kept.
	    int RepeatSlot = -1;
	    for (int Slot = 0, NumKept = Kept.size(); Slot != NumKept; ++Slot) {
	      if (Kept[Slot] == Input) {
	        RepeatSlot = Slot;
	        break;
	      }
	    }

	    if (RepeatSlot < 0) {
	      Kept.push_back(Input);
	      continue;
	    }

	    // Redirect references to the earlier copy; indices belonging to later
	    // inputs slide down by one input width.
	    for (int &M : Mask) {
	      if (M < RangeBegin)
	        continue;
	      if (InRange(M))
	        M = (M - RangeBegin) + (RepeatSlot * Width);
	      else
	        M = M - Width;
	    }
	  }

	  Inputs = Kept;
	}

	/// Decodes \p Op into shuffle \p Inputs and a shuffle \p Mask, together with
	/// the \p KnownUndef / \p KnownZero element sets.
	/// getTargetShuffleAndZeroables is tried first; when it succeeds and
	/// \p ResolveKnownElts is set, the known undef/zero elements are folded into
	/// the mask as sentinels. Otherwise getFauxShuffleMask is tried and the
	/// undef/zero sets are derived from the mask it produced.
	/// Returns true if a shuffle mask was decoded.
	static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
	                                   SmallVectorImpl<SDValue> &Inputs,
	                                   SmallVectorImpl<int> &Mask,
	                                   APInt &KnownUndef, APInt &KnownZero,
	                                   const SelectionDAG &DAG, unsigned Depth,
	                                   bool ResolveKnownElts) {
	  // Only simple vector types can be decoded.
	  EVT VT = Op.getValueType();
	  if (!(VT.isSimple() && VT.isVector()))
	    return false;

	  // Real target shuffle.
	  if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
	    if (ResolveKnownElts)
	      resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
	    return true;
	  }

	  // Node that merely behaves like a shuffle.
	  if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
	                          ResolveKnownElts))
	    return false;

	  resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
	  return true;
	}

	/// Convenience overload: decodes \p Op with every element demanded and
	/// discards the known undef/zero element sets.
	static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
	                                   SmallVectorImpl<int> &Mask,
	                                   const SelectionDAG &DAG, unsigned Depth = 0,
	                                   bool ResolveKnownElts = true) {
	  EVT VT = Op.getValueType();
	  if (!(VT.isSimple() && VT.isVector()))
	    return false;

	  // Demand all elements of the result vector.
	  APInt AllElts = APInt::getAllOnesValue(VT.getVectorNumElements());
	  APInt UndefElts, ZeroElts;
	  return getTargetShuffleInputs(Op, AllElts, Inputs, Mask, UndefElts, ZeroElts,
	                                DAG, Depth, ResolveKnownElts);
	}

	/// Walks through shuffles, subvector inserts/extracts, concatenations and
	/// same-element-count bitcasts to find the scalar that ends up as element
	/// \p Index of vector \p Op. Returns an empty SDValue when the scalar cannot be
	/// determined or the recursion limit is reached.
	static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
	                                   SelectionDAG &DAG, unsigned Depth) {
	  // Limit search depth.
	  if (Depth >= SelectionDAG::MaxRecursionDepth)
	    return SDValue();

	  EVT VT = Op.getValueType();
	  unsigned Opcode = Op.getOpcode();
	  unsigned NumElems = VT.getVectorNumElements();
	  const unsigned NextDepth = Depth + 1;

	  // Generic ISD::VECTOR_SHUFFLE: follow the mask into the selected operand.
	  if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
	    int MaskElt = SV->getMaskElt(Index);
	    if (MaskElt < 0)
	      return DAG.getUNDEF(VT.getVectorElementType());

	    unsigned OpNo = (MaskElt < (int)NumElems) ? 0 : 1;
	    return getShuffleScalarElt(SV->getOperand(OpNo), MaskElt % NumElems, DAG,
	                               NextDepth);
	  }

	  // Target specific shuffle: decode its mask, then follow it.
	  if (isTargetShuffle(Opcode)) {
	    MVT ShufVT = VT.getSimpleVT();
	    MVT ShufEltVT = ShufVT.getVectorElementType();
	    int NumShufElts = (int)ShufVT.getVectorNumElements();
	    SmallVector<int, 16> ShufMask;
	    SmallVector<SDValue, 16> ShufOps;
	    bool IsUnary;

	    if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShufOps, ShufMask,
	                              IsUnary))
	      return SDValue();

	    int MaskElt = ShufMask[Index];
	    if (MaskElt == SM_SentinelZero) {
	      if (ShufEltVT.isInteger())
	        return DAG.getConstant(0, SDLoc(Op), ShufEltVT);
	      return DAG.getConstantFP(+0.0, SDLoc(Op), ShufEltVT);
	    }
	    if (MaskElt == SM_SentinelUndef)
	      return DAG.getUNDEF(ShufEltVT);

	    assert(0 <= MaskElt && MaskElt < (2 * NumShufElts) &&
	           "Shuffle index out of range");
	    SDValue Src = ShufOps[(MaskElt < NumShufElts) ? 0 : 1];
	    return getShuffleScalarElt(Src, MaskElt % NumShufElts, DAG, NextDepth);
	  }

	  switch (Opcode) {
	  case ISD::INSERT_SUBVECTOR: {
	    // The element lives either in the inserted subvector or in the base.
	    SDValue Vec = Op.getOperand(0);
	    SDValue Sub = Op.getOperand(1);
	    uint64_t SubIdx = Op.getConstantOperandVal(2);
	    unsigned NumSubElts = Sub.getValueType().getVectorNumElements();

	    if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
	      return getShuffleScalarElt(Sub, Index - SubIdx, DAG, NextDepth);
	    return getShuffleScalarElt(Vec, Index, DAG, NextDepth);
	  }
	  case ISD::CONCAT_VECTORS: {
	    // Pick the operand that contains the element and rebase the index.
	    EVT SubVT = Op.getOperand(0).getValueType();
	    unsigned NumSubElts = SubVT.getVectorNumElements();
	    uint64_t SubIdx = Index / NumSubElts;
	    uint64_t SubElt = Index % NumSubElts;
	    return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, NextDepth);
	  }
	  case ISD::EXTRACT_SUBVECTOR: {
	    // Offset the index by the extraction start within the source vector.
	    SDValue Src = Op.getOperand(0);
	    uint64_t SrcIdx = Op.getConstantOperandVal(1);
	    return getShuffleScalarElt(Src, Index + SrcIdx, DAG, NextDepth);
	  }
	  case ISD::BITCAST: {
	    // Only look through bitcasts that keep the element count unchanged.
	    SDValue Src = Op.getOperand(0);
	    EVT SrcVT = Src.getValueType();
	    if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
	      return getShuffleScalarElt(Src, Index, DAG, NextDepth);
	    return SDValue();
	  }

	  // The remaining opcodes are nodes that may hold the scalar directly.
	  case ISD::INSERT_VECTOR_ELT:
	    // A non-constant insertion index cannot be analysed.
	    if (!isa<ConstantSDNode>(Op.getOperand(2)))
	      break;
	    // Either this node inserted the requested element, or it is somewhere in
	    // the base vector.
	    if (Op.getConstantOperandAPInt(2) == Index)
	      return Op.getOperand(1);
	    return getShuffleScalarElt(Op.getOperand(0), Index, DAG, NextDepth);
	  case ISD::SCALAR_TO_VECTOR:
	    // Only element 0 is defined.
	    if (Index == 0)
	      return Op.getOperand(0);
	    return DAG.getUNDEF(VT.getVectorElementType());
	  case ISD::BUILD_VECTOR:
	    return Op.getOperand(Index);
	  default:
	    break;
	  }

	  return SDValue();
	}

	// Use PINSRB/PINSRW/PINSRD to create a build vector.
	//
	// Builds the vector as a chain of INSERT_VECTOR_ELT nodes, one for each
	// operand of Op whose bit is set in NonZeros (bit i corresponds to operand i).
	// NumZero is only tested for being non-zero: it selects whether the chain
	// starts from a zero vector. NumNonZero is not read by this function.
	// Returns the final vector; the returned SDValue is empty if NonZeros selects
	// no element.
	static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
	unsigned NumNonZero, unsigned NumZero,
	SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	MVT VT = Op.getSimpleValueType();
	unsigned NumElts = VT.getVectorNumElements();
	// Only v8i16 (SSE2) and v16i8/v4i32 (SSE4.1) are accepted.
	assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) \|\|
	((VT == MVT::v16i8 \|\| VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
	"Illegal vector insertion");

	SDLoc dl(Op);
	// Vector built so far; stays empty until the first insertion.
	SDValue V;
	bool First = true;

	for (unsigned i = 0; i < NumElts; ++i) {
	// Skip operands that are not flagged in the NonZeros bitmask.
	bool IsNonZero = (NonZeros & (1 << i)) != 0;
	if (!IsNonZero)
	continue;

	// If the build vector contains zeros or our first insertion is not the
	// first index then insert into zero vector to break any register
	// dependency else use SCALAR_TO_VECTOR.
	if (First) {
	First = false;
	if (NumZero \|\| 0 != i)
	V = getZeroVector(VT, Subtarget, DAG, dl);
	else {
	assert(0 == i && "Expected insertion into zero-index");
	// Element 0 is placed through an i32 SCALAR_TO_VECTOR into v4i32 and
	// bitcast to VT; no INSERT_VECTOR_ELT is needed for it.
	V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
	V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
	V = DAG.getBitcast(VT, V);
	continue;
	}
	}
	V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
	DAG.getIntPtrConstant(i, dl));
	}

	return V;
	}

	/// Custom lower build_vector of v16i8.
	///
	/// NonZeros is a bitmask in which bit i flags operand i as needing insertion.
	/// Returns an empty SDValue when more than 8 elements are flagged and SSE4.1
	/// is not available. With SSE4.1 the work is delegated to
	/// LowerBuildVectorAsInsert; otherwise adjacent byte pairs are merged into
	/// 16-bit values and inserted into a v8i16 that is bitcast back to v16i8.
	static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
	unsigned NumNonZero, unsigned NumZero,
	SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	if (NumNonZero > 8 && !Subtarget.hasSSE41())
	return SDValue();

	// SSE4.1 - use PINSRB to insert each byte directly.
	if (Subtarget.hasSSE41())
	return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
	Subtarget);

	SDLoc dl(Op);
	// v8i16 vector built so far; empty until the first pair is placed.
	// NOTE(review): V stays empty if NonZeros flags none of elements 0..15 -
	// presumably callers guarantee at least one flagged element; confirm.
	SDValue V;

	// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
	for (unsigned i = 0; i < 16; i += 2) {
	bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
	bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
	// Nothing to insert for this pair.
	if (!ThisIsNonZero && !NextIsNonZero)
	continue;

	// FIXME: Investigate combining the first 4 bytes as a i32 instead.
	// Elt accumulates the pair as an i32: low byte from operand i, high byte
	// from operand i + 1.
	SDValue Elt;
	if (ThisIsNonZero) {
	// Zero-extend when the bits above the low byte must be clean (zeros are
	// required, or the high byte is about to be OR'ed in).
	if (NumZero \|\| NextIsNonZero)
	Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
	else
	Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
	}

	if (NextIsNonZero) {
	SDValue NextElt = Op.getOperand(i + 1);
	if (i == 0 && NumZero)
	NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
	else
	NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
	// Move the odd byte into bits 8..15 of the pair.
	NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
	DAG.getConstant(8, dl, MVT::i8));
	if (ThisIsNonZero)
	Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
	else
	Elt = NextElt;
	}

	// If our first insertion is not the first index or zeros are needed, then
	// insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
	// elements undefined).
	if (!V) {
	if (i != 0 \|\| NumZero)
	V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
	else {
	V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
	V = DAG.getBitcast(MVT::v8i16, V);
	continue;
	}
	}
	// Narrow the merged pair to i16 and insert it as word i / 2.
	Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
	V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
	DAG.getIntPtrConstant(i / 2, dl));
	}

	return DAG.getBitcast(MVT::v16i8, V);
	}

	/// Custom lowering of a v8i16 build_vector through element insertion.
	/// Returns an empty SDValue when more than four elements are flagged as
	/// non-zero and the subtarget lacks SSE4.1.
	static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
	                                     unsigned NumNonZero, unsigned NumZero,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  if (NumNonZero > 4) {
	    if (!Subtarget.hasSSE41())
	      return SDValue();
	  }

	  // Use PINSRW to insert each 16-bit element directly.
	  return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
	                                  Subtarget);
	}

	/// Custom lower build_vector of v4i32 or v4f32.
	///
	/// Three strategies are tried in order:
	///  1. A <a, b, a, b> pattern is emitted as MOVDDUP of <a, b, undef, undef>
	///     (SSE3 without XOP only).
	///  2. If every non-zeroable operand extracts element i of one common 128-bit
	///     vector into position i, a shuffle of that vector with a zero/undef
	///     vector is emitted.
	///  3. With SSE4.1, a single out-of-place element is handled with INSERTPS.
	/// Returns an empty SDValue when none of them applies.
	static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// If this is a splat of a pair of elements, use MOVDDUP (unless the target
	// has XOP; in that case defer lowering to potentially use VPERMIL2PS).
	// Because we're creating a less complicated build vector here, we may enable
	// further folding of the MOVDDUP via shuffle transforms.
	if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
	Op.getOperand(0) == Op.getOperand(2) &&
	Op.getOperand(1) == Op.getOperand(3) &&
	Op.getOperand(0) != Op.getOperand(1)) {
	SDLoc DL(Op);
	MVT VT = Op.getSimpleValueType();
	MVT EltVT = VT.getVectorElementType();
	// Create a new build vector with the first 2 elements followed by undef
	// padding, bitcast to v2f64, duplicate, and bitcast back.
	SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
	DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
	SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
	SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
	return DAG.getBitcast(VT, Dup);
	}

	// Find all zeroable elements.
	// Zeroable[i]: operand i is undef or zero. Undefs[i]: operand i is undef.
	std::bitset<4> Zeroable, Undefs;
	for (int i = 0; i < 4; ++i) {
	SDValue Elt = Op.getOperand(i);
	Undefs[i] = Elt.isUndef();
	Zeroable[i] = (Elt.isUndef() \|\| X86::isZeroNode(Elt));
	}
	assert(Zeroable.size() - Zeroable.count() > 1 &&
	"We expect at least two non-zero elements!");

	// We only know how to deal with build_vector nodes where elements are either
	// zeroable or extract_vector_elt with constant index.
	SDValue FirstNonZero;
	// Only meaningful once FirstNonZero has been set.
	unsigned FirstNonZeroIdx;
	for (unsigned i = 0; i < 4; ++i) {
	if (Zeroable[i])
	continue;
	SDValue Elt = Op.getOperand(i);
	if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	!isa<ConstantSDNode>(Elt.getOperand(1)))
	return SDValue();
	// Make sure that this node is extracting from a 128-bit vector.
	MVT VT = Elt.getOperand(0).getSimpleValueType();
	if (!VT.is128BitVector())
	return SDValue();
	if (!FirstNonZero.getNode()) {
	FirstNonZero = Elt;
	FirstNonZeroIdx = i;
	}
	}

	assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
	// V1 is the source vector of the first non-zeroable element; VT is its type.
	SDValue V1 = FirstNonZero.getOperand(0);
	MVT VT = V1.getSimpleValueType();

	// See if this build_vector can be lowered as a blend with zero.
	// The loop stops at the first element that is not "element EltIdx of V1";
	// Elt, EltIdx and EltMaskIdx then describe that element for the INSERTPS
	// path below.
	SDValue Elt;
	unsigned EltMaskIdx, EltIdx;
	int Mask[4];
	for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
	if (Zeroable[EltIdx]) {
	// The zero vector will be on the right hand side.
	Mask[EltIdx] = EltIdx+4;
	continue;
	}

	Elt = Op->getOperand(EltIdx);
	// By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
	EltMaskIdx = Elt.getConstantOperandVal(1);
	if (Elt.getOperand(0) != V1 \|\| EltMaskIdx != EltIdx)
	break;
	Mask[EltIdx] = EltIdx;
	}

	// All four elements were either zeroable or in-place elements of V1.
	if (EltIdx == 4) {
	// Let the shuffle legalizer deal with blend operations.
	// An undef second operand suffices when every zeroable element is undef.
	SDValue VZeroOrUndef = (Zeroable == Undefs)
	? DAG.getUNDEF(VT)
	: getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
	if (V1.getSimpleValueType() != VT)
	V1 = DAG.getBitcast(VT, V1);
	return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
	}

	// See if we can lower this build_vector to a INSERTPS.
	if (!Subtarget.hasSSE41())
	return SDValue();

	// V2 is the vector the out-of-place element is taken from.
	SDValue V2 = Elt.getOperand(0);
	// If the out-of-place element is the very first non-zero one, the base
	// vector is not known yet; it is taken from the next non-zeroable element.
	if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
	V1 = SDValue();

	// Every remaining non-zeroable element must be element i of the base vector.
	bool CanFold = true;
	for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
	if (Zeroable[i])
	continue;

	SDValue Current = Op->getOperand(i);
	SDValue SrcVector = Current->getOperand(0);
	if (!V1.getNode())
	V1 = SrcVector;
	CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
	}

	if (!CanFold)
	return SDValue();

	assert(V1.getNode() && "Expected at least two non-zero elements!");
	if (V1.getSimpleValueType() != MVT::v4f32)
	V1 = DAG.getBitcast(MVT::v4f32, V1);
	if (V2.getSimpleValueType() != MVT::v4f32)
	V2 = DAG.getBitcast(MVT::v4f32, V2);

	// Ok, we can emit an INSERTPS instruction.
	unsigned ZMask = Zeroable.to_ulong();

	// Immediate layout: bits 7:6 = source element index in V2, bits 5:4 =
	// destination element index, bits 3:0 = mask of elements to zero.
	unsigned InsertPSMask = EltMaskIdx << 6 \| EltIdx << 4 \| ZMask;
	assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
	SDLoc DL(Op);
	SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
	DAG.getIntPtrConstant(InsertPSMask, DL, true));
	return DAG.getBitcast(VT, Result);
	}

	/// Builds a whole-register byte shift (VSHLDQ when \p isLeft, VSRLDQ
	/// otherwise) of the 128-bit value \p SrcOp by \p NumBits bits. The shift is
	/// performed on a v16i8 view of the operand and the result is bitcast back to
	/// \p VT. \p NumBits must be a multiple of 8.
	static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
	                         SelectionDAG &DAG, const TargetLowering &TLI,
	                         const SDLoc &dl) {
	  assert(VT.is128BitVector() && "Unknown type for VShift");
	  assert(NumBits % 8 == 0 && "Only support byte sized shifts");

	  const MVT ByteVT = MVT::v16i8;
	  SDValue Bytes = DAG.getBitcast(ByteVT, SrcOp);
	  SDValue NumBytes = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);

	  unsigned ShiftOpc = X86ISD::VSRLDQ;
	  if (isLeft)
	    ShiftOpc = X86ISD::VSHLDQ;

	  SDValue Shifted = DAG.getNode(ShiftOpc, dl, ByteVT, Bytes, NumBytes);
	  return DAG.getBitcast(VT, Shifted);
	}

	/// Tries to replace a splat of the scalar load \p SrcOp with a vector load
	/// followed by a splat shuffle.
	///
	/// Only simple, normal i32/f32 loads whose address is a frame index, or a
	/// frame index plus a constant offset, are handled. The vector load covers
	/// the aligned region containing the scalar, and the shuffle mask selects
	/// the scalar's element within it. May raise the alignment of the stack
	/// object when it is not a fixed object.
	/// Returns an empty SDValue when the transformation does not apply.
	static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
	SelectionDAG &DAG) {

	// Check if the scalar load can be widened into a vector load. And if
	// the address is "base + cst" see if the cst can be "absorbed" into
	// the shuffle mask.
	if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
	SDValue Ptr = LD->getBasePtr();
	if (!ISD::isNormalLoad(LD) \|\| !LD->isSimple())
	return SDValue();
	EVT PVT = LD->getValueType(0);
	if (PVT != MVT::i32 && PVT != MVT::f32)
	return SDValue();

	// Split the address into a frame index and a constant byte offset.
	int FI = -1;
	int64_t Offset = 0;
	if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
	FI = FINode->getIndex();
	Offset = 0;
	} else if (DAG.isBaseWithConstantOffset(Ptr) &&
	isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
	FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
	Offset = Ptr.getConstantOperandVal(1);
	Ptr = Ptr.getOperand(0);
	} else {
	return SDValue();
	}

	// FIXME: 256-bit vector instructions don't require a strict alignment,
	// improve this code to support it better.
	// The required alignment equals the vector size in bytes.
	Align RequiredAlign(VT.getSizeInBits() / 8);
	SDValue Chain = LD->getChain();
	// Make sure the stack object alignment is at least 16 or 32.
	MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
	if (!InferredAlign \|\| *InferredAlign < RequiredAlign) {
	if (MFI.isFixedObjectIndex(FI)) {
	// Can't change the alignment. FIXME: It's possible to compute
	// the exact stack offset and reference FI + adjust offset instead.
	// If someone really cares about this. That's the way to implement it.
	return SDValue();
	} else {
	MFI.setObjectAlignment(FI, RequiredAlign);
	}
	}

	// (Offset % 16 or 32) must be multiple of 4. Then address is then
	// Ptr + (Offset & ~15).
	if (Offset < 0)
	return SDValue();
	if ((Offset % RequiredAlign.value()) & 3)
	return SDValue();
	// Round the offset down to the start of the aligned vector-sized region.
	int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
	if (StartOffset) {
	SDLoc DL(Ptr);
	Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
	DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
	}

	// Index of the 4-byte element within the loaded vector.
	int EltNo = (Offset - StartOffset) >> 2;
	unsigned NumElems = VT.getVectorNumElements();

	EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
	SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
	LD->getPointerInfo().getWithOffset(StartOffset));

	// Splat mask: every lane selects element EltNo.
	SmallVector<int, 8> Mask(NumElems, EltNo);

	return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
	}

	return SDValue();
	}

	// Recursively looks through bitcasts, truncates, scalar_to_vector, whole-byte
	// right shifts and same-width element extracts to find the LoadSDNode that
	// \p Elt originates from. On success \p Ld is set to that load, \p ByteOffset
	// to the accumulated byte offset into it, and true is returned.
	static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
	  // Base case: a non-extending load. Only simple loads are accepted.
	  if (ISD::isNON_EXTLoad(Elt.getNode())) {
	    auto *Load = cast<LoadSDNode>(Elt);
	    if (Load->isSimple()) {
	      Ld = Load;
	      ByteOffset = 0;
	      return true;
	    }
	    return false;
	  }

	  switch (Elt.getOpcode()) {
	  case ISD::BITCAST:
	  case ISD::TRUNCATE:
	  case ISD::SCALAR_TO_VECTOR:
	    // These keep the offset unchanged.
	    return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);

	  case ISD::SRL: {
	    // A constant shift by a whole number of bytes advances the offset.
	    auto *ShAmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1));
	    if (!ShAmtC)
	      return false;
	    uint64_t ShAmt = ShAmtC->getZExtValue();
	    if ((ShAmt % 8) != 0)
	      return false;
	    if (!findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset))
	      return false;
	    ByteOffset += ShAmt / 8;
	    return true;
	  }

	  case ISD::EXTRACT_VECTOR_ELT: {
	    // A constant-index extract advances the offset by index * element size,
	    // provided the result is as wide as the byte-sized source elements.
	    auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1));
	    if (!IdxC)
	      return false;
	    SDValue Vec = Elt.getOperand(0);
	    unsigned VecEltBits = Vec.getScalarValueSizeInBits();
	    unsigned ResultBits = Elt.getScalarValueSizeInBits();
	    if (ResultBits != VecEltBits)
	      return false;
	    if ((VecEltBits % 8) != 0)
	      return false;
	    if (!findEltLoadSrc(Vec, Ld, ByteOffset))
	      return false;
	    uint64_t EltIdx = IdxC->getZExtValue();
	    ByteOffset += EltIdx * (VecEltBits / 8);
	    return true;
	  }

	  default:
	    return false;
	  }
	}

	/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
	/// elements can be replaced by a single large load which has the same value as
	/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
	///
	/// Example: <load i32 a, load i32 a+4, zero, undef> -> zextload a
	///
	/// Each element is classified as undef, zero or loaded. Depending on the
	/// resulting pattern the function produces, in order of preference: a plain
	/// wide load (optionally shuffled with zero), a half-width load inserted into
	/// an undef vector, an X86ISD::VZEXT_LOAD, or a broadcast of a repeated
	/// sub-pattern. Returns an empty SDValue when no pattern matches.
	static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
	const SDLoc &DL, SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	bool isAfterLegalize) {
	if ((VT.getScalarSizeInBits() % 8) != 0)
	return SDValue();

	unsigned NumElems = Elts.size();

	// Per-element classification; exactly one of the three masks has bit i set
	// for every element i once the loop below completes.
	int LastLoadedElt = -1;
	APInt LoadMask = APInt::getNullValue(NumElems);
	APInt ZeroMask = APInt::getNullValue(NumElems);
	APInt UndefMask = APInt::getNullValue(NumElems);

	// For loaded elements: the source load and the byte offset into it.
	SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
	SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);

	// For each element in the initializer, see if we've found a load, zero or an
	// undef.
	for (unsigned i = 0; i < NumElems; ++i) {
	SDValue Elt = peekThroughBitcasts(Elts[i]);
	if (!Elt.getNode())
	return SDValue();
	if (Elt.isUndef()) {
	UndefMask.setBit(i);
	continue;
	}
	if (X86::isZeroNode(Elt) \|\| ISD::isBuildVectorAllZeros(Elt.getNode())) {
	ZeroMask.setBit(i);
	continue;
	}

	// Each loaded element must be the correct fractional portion of the
	// requested vector load.
	unsigned EltSizeInBits = Elt.getValueSizeInBits();
	if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
	return SDValue();

	if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) \|\| ByteOffsets[i] < 0)
	return SDValue();
	// The element must lie entirely inside the load it was derived from.
	unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
	if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
	return SDValue();

	LoadMask.setBit(i);
	LastLoadedElt = i;
	}
	assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
	LoadMask.countPopulation()) == NumElems &&
	"Incomplete element masks");

	// Handle Special Cases - all undef or undef/zero.
	if (UndefMask.countPopulation() == NumElems)
	return DAG.getUNDEF(VT);

	// FIXME: Should we return this as a BUILD_VECTOR instead?
	if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
	return VT.isInteger() ? DAG.getConstant(0, DL, VT)
	: DAG.getConstantFP(0.0, DL, VT);

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	// The lowest loaded element provides the base load and the element size.
	int FirstLoadedElt = LoadMask.countTrailingZeros();
	SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
	EVT EltBaseVT = EltBase.getValueType();
	assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
	"Register/Memory size mismatch");
	LoadSDNode *LDBase = Loads[FirstLoadedElt];
	assert(LDBase && "Did not find base load for merging consecutive loads");
	unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
	unsigned BaseSizeInBytes = BaseSizeInBits / 8;
	// Bits spanned from the first to the last loaded element, inclusive.
	int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
	assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");

	// TODO: Support offsetting the base load.
	if (ByteOffsets[FirstLoadedElt] != 0)
	return SDValue();

	// Check to see if the element's load is consecutive to the base load
	// or offset from a previous (already checked) load.
	auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
	LoadSDNode *Ld = Loads[EltIdx];
	int64_t ByteOffset = ByteOffsets[EltIdx];
	// An element at a non-zero, element-aligned offset into its load is
	// accepted if the element that many positions earlier is the start
	// (offset 0) of the same load.
	if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
	int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
	return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
	Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
	}
	// Otherwise the load itself must follow the base load in memory.
	return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
	EltIdx - FirstLoadedElt);
	};

	// Consecutive loads can contain UNDEFS but not ZERO elements.
	// Consecutive loads with UNDEFs and ZEROs elements require a
	// an additional shuffle stage to clear the ZERO elements.
	bool IsConsecutiveLoad = true;
	bool IsConsecutiveLoadWithZeros = true;
	for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
	if (LoadMask[i]) {
	if (!CheckConsecutiveLoad(LDBase, i)) {
	IsConsecutiveLoad = false;
	IsConsecutiveLoadWithZeros = false;
	break;
	}
	} else if (ZeroMask[i]) {
	IsConsecutiveLoad = false;
	}
	}

	// Creates a load of type VT from the base load's address and keeps the
	// memory ordering of every original load relative to the new one.
	auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
	auto MMOFlags = LDBase->getMemOperand()->getFlags();
	assert(LDBase->isSimple() &&
	"Cannot merge volatile or atomic loads.");
	SDValue NewLd =
	DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
	LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
	MMOFlags);
	for (auto *LD : Loads)
	if (LD)
	DAG.makeEquivalentMemoryOrdering(LD, NewLd);
	return NewLd;
	};

	// Check if the base load is entirely dereferenceable.
	bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
	VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());

	// LOAD - all consecutive load/undefs (must start/end with a load or be
	// entirely dereferenceable). If we have found an entire vector of loads and
	// undefs, then return a large load of the entire vector width starting at the
	// base pointer. If the vector contains zeros, then attempt to shuffle those
	// elements.
	if (FirstLoadedElt == 0 &&
	(LastLoadedElt == (int)(NumElems - 1) \|\| IsDereferenceable) &&
	(IsConsecutiveLoad \|\| IsConsecutiveLoadWithZeros)) {
	if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
	return SDValue();

	// Don't create 256-bit non-temporal aligned loads without AVX2 as these
	// will lower to regular temporal loads and use the cache.
	if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
	VT.is256BitVector() && !Subtarget.hasInt256())
	return SDValue();

	// A single element needs no new load; reuse it through a bitcast.
	if (NumElems == 1)
	return DAG.getBitcast(VT, Elts[FirstLoadedElt]);

	if (!ZeroMask)
	return CreateLoad(VT, LDBase);

	// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
	// vector and a zero vector to clear out the zero elements.
	if (!isAfterLegalize && VT.isVector()) {
	unsigned NumMaskElts = VT.getVectorNumElements();
	if ((NumMaskElts % NumElems) == 0) {
	// Each entry of Elts covers Scale consecutive elements of VT.
	unsigned Scale = NumMaskElts / NumElems;
	SmallVector<int, 4> ClearMask(NumMaskElts, -1);
	for (unsigned i = 0; i < NumElems; ++i) {
	if (UndefMask[i])
	continue;
	// Zero elements select from the second (zero) shuffle operand.
	int Offset = ZeroMask[i] ? NumMaskElts : 0;
	for (unsigned j = 0; j != Scale; ++j)
	ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
	}
	SDValue V = CreateLoad(VT, LDBase);
	SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
	: DAG.getConstantFP(0.0, DL, VT);
	return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
	}
	}
	}

	// If the upper half of a ymm/zmm load is undef then just load the lower half.
	if (VT.is256BitVector() \|\| VT.is512BitVector()) {
	unsigned HalfNumElems = NumElems / 2;
	if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
	EVT HalfVT =
	EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
	// Recurse on the lower half of the elements only.
	SDValue HalfLD =
	EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
	DAG, Subtarget, isAfterLegalize);
	if (HalfLD)
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
	HalfLD, DAG.getIntPtrConstant(0, DL));
	}
	}

	// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
	if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
	(LoadSizeInBits == 32 \|\| LoadSizeInBits == 64) &&
	((VT.is128BitVector() \|\| VT.is256BitVector() \|\| VT.is512BitVector()))) {
	MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
	: MVT::getIntegerVT(LoadSizeInBits);
	MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
	// Allow v4f32 on SSE1 only targets.
	// FIXME: Add more isel patterns so we can just use VT directly.
	if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
	VecVT = MVT::v4f32;
	if (TLI.isTypeLegal(VecVT)) {
	SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
	SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
	SDValue ResNode = DAG.getMemIntrinsicNode(
	X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
	LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
	for (auto *LD : Loads)
	if (LD)
	DAG.makeEquivalentMemoryOrdering(LD, ResNode);
	return DAG.getBitcast(VT, ResNode);
	}
	}

	// BROADCAST - match the smallest possible repetition pattern, load that
	// scalar/subvector element and then broadcast to the entire vector.
	if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
	(VT.is128BitVector() \|\| VT.is256BitVector() \|\| VT.is512BitVector())) {
	// Try repetition lengths of 1, 2, 4, ... elements.
	for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
	unsigned RepeatSize = SubElems * BaseSizeInBits;
	unsigned ScalarSize = std::min(RepeatSize, 64u);
	if (!Subtarget.hasAVX2() && ScalarSize < 32)
	continue;

	// Collect one representative element per position of the repetition and
	// check that every loaded element agrees with its representative.
	bool Match = true;
	SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
	for (unsigned i = 0; i != NumElems && Match; ++i) {
	if (!LoadMask[i])
	continue;
	SDValue Elt = peekThroughBitcasts(Elts[i]);
	if (RepeatedLoads[i % SubElems].isUndef())
	RepeatedLoads[i % SubElems] = Elt;
	else
	Match &= (RepeatedLoads[i % SubElems] == Elt);
	}

	// We must have loads at both ends of the repetition.
	Match &= !RepeatedLoads.front().isUndef();
	Match &= !RepeatedLoads.back().isUndef();
	if (!Match)
	continue;

	// Type of the repeated unit: a scalar of ScalarSize bits, or a vector of
	// such scalars when the repetition is wider than 64 bits.
	EVT RepeatVT =
	VT.isInteger() && (RepeatSize != 64 \|\| TLI.isTypeLegal(MVT::i64))
	? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
	: EVT::getFloatingPointVT(ScalarSize);
	if (RepeatSize > ScalarSize)
	RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
	RepeatSize / ScalarSize);
	EVT BroadcastVT =
	EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
	VT.getSizeInBits() / ScalarSize);
	if (TLI.isTypeLegal(BroadcastVT)) {
	// Recurse to build the load of the repeated unit, then broadcast it.
	if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
	RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
	unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
	: X86ISD::VBROADCAST;
	SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
	return DAG.getBitcast(VT, Broadcast);
	}
	}
	}
	}

	return SDValue();
	}

	// Tries to fold vector ops (shuffles etc.) that are equivalent to
	// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a single vector
	// load when the load addresses are consecutive, non-overlapping, and in the
	// right order. Every scalar element of Op must be resolvable; otherwise an
	// empty SDValue is returned.
	static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
	                                         SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget,
	                                         bool isAfterLegalize) {
	  SmallVector<SDValue, 64> Elts;
	  const unsigned NumElts = VT.getVectorNumElements();

	  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
	    SDValue Scalar = getShuffleScalarElt(Op, Idx, DAG, 0);
	    // Give up as soon as one element cannot be traced to a scalar.
	    if (!Scalar)
	      return SDValue();
	    Elts.push_back(Scalar);
	  }

	  assert(Elts.size() == VT.getVectorNumElements());
	  return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
	                                  isAfterLegalize);
	}

	/// Build an IR constant vector from the bits of \p SplatValue.
	///
	/// The splat is cut into SplatBitSize / ScalarSize pieces, where ScalarSize is
	/// the scalar width of \p VT; piece i is taken from bit offset ScalarSize * i.
	/// Floating-point \p VT produces ConstantFP elements (only 32- and 64-bit
	/// scalars are supported); otherwise integer constants are produced.
	static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
	                                   unsigned SplatBitSize, LLVMContext &C) {
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  const unsigned EltCount = SplatBitSize / EltBits;

	  SmallVector<Constant *, 32> Pieces;
	  for (unsigned Idx = 0; Idx != EltCount; ++Idx) {
	    APInt Bits = SplatValue.extractBits(EltBits, EltBits * Idx);

	    // Integer element: wrap the raw bits directly.
	    if (!VT.isFloatingPoint()) {
	      Pieces.push_back(
	          Constant::getIntegerValue(Type::getIntNTy(C, EltBits), Bits));
	      continue;
	    }

	    // Floating-point element: reinterpret the bits in the matching format.
	    if (EltBits == 32) {
	      Pieces.push_back(ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Bits)));
	      continue;
	    }
	    assert(EltBits == 64 && "Unsupported floating point scalar size");
	    Pieces.push_back(ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Bits)));
	  }
	  return ConstantVector::get(ArrayRef<Constant *>(Pieces));
	}

	/// Inspect the users of \p N and report whether \p N is consumed in a way
	/// that counts as a foldable use by a shuffle.
	///
	/// Users are visited in order and the first decisive one wins:
	///  - N used as the index operand of VPERMV/VPERMV3 -> false;
	///  - any target shuffle user                        -> true;
	///  - a bitcast user                                 -> recurse on the bitcast;
	///  - otherwise, true if N has exactly one use.
	/// If no user is decisive the result is false.
	static bool isFoldableUseOfShuffle(SDNode *N) {
	  for (SDNode *User : N->uses()) {
	    const unsigned UserOpc = User->getOpcode();

	    // VPERMV/VPERMV3 shuffles can never fold their index operands.
	    switch (UserOpc) {
	    case X86ISD::VPERMV:
	      if (User->getOperand(0).getNode() == N)
	        return false;
	      break;
	    case X86ISD::VPERMV3:
	      if (User->getOperand(1).getNode() == N)
	        return false;
	      break;
	    default:
	      break;
	    }

	    if (isTargetShuffle(UserOpc))
	      return true;

	    // Bitcasts are transparent: defer to the users of the bitcast.
	    if (UserOpc == ISD::BITCAST)
	      return isFoldableUseOfShuffle(User);

	    if (N->hasOneUse())
	      return true;
	  }
	  return false;
	}

	// Check whether a build vector is a splat of a zero-extended value, i.e. the
	// value in operand 0 repeats at a fixed power-of-two stride (greater than 1)
	// and every operand in between is a zero constant or undef.
	// For example, the operand list (a,0,0,0,a,0,0,0,a,0,0,0,a,0,0,0) returns a.
	// On success:
	//   NumElt  - receives the number of strided groups (operands / stride).
	//   EltType - receives the integer type as wide as one whole group.
	// On failure an empty SDValue is returned and the outputs are left untouched.
	static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
	                                   unsigned &NumElt, MVT &EltType) {
	  const unsigned OperandCount = Op->getNumOperands();
	  const SDValue Splatted = Op->getOperand(0);

	  auto IsZeroOrUndef = [](SDValue V) {
	    return V.isUndef() ? true : isNullConstant(V);
	  };

	  // Locate the first repetition of the splatted value; everything before it
	  // must be padding. With no repetition the stride is the whole vector.
	  unsigned Stride = OperandCount;
	  for (unsigned Idx = 1; Idx < OperandCount; ++Idx) {
	    SDValue Cur = Op->getOperand(Idx);
	    if (Cur == Splatted) {
	      Stride = Idx;
	      break;
	    }
	    if (!IsZeroOrUndef(Cur))
	      return SDValue();
	  }

	  // A stride of 1 is a plain splat, and non power-of-two strides cannot be
	  // expressed as a wider integer element.
	  if (Stride == 1)
	    return SDValue();
	  if (!isPowerOf2_32(Stride))
	    return SDValue();

	  // Verify the remaining operands follow the same value/padding layout.
	  for (unsigned Idx = Stride; Idx < OperandCount; ++Idx) {
	    SDValue Cur = Op->getOperand(Idx);
	    if (Idx % Stride == 0) {
	      if (Cur != Splatted)
	        return SDValue();
	      continue;
	    }
	    if (!IsZeroOrUndef(Cur))
	      return SDValue();
	  }

	  const unsigned EltBits = Op->getSimpleValueType(0).getScalarSizeInBits();
	  EltType = MVT::getIntegerVT(EltBits * Stride);
	  NumElt = OperandCount / Stride;
	  return Splatted;
	}

	/// Attempt to use the vbroadcast instruction to generate a splat value
	/// from a splat BUILD_VECTOR which uses:
	/// a. A single scalar load, or a constant.
	/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
	///
	/// With CDI (and a 512-bit type or VLX) a splat of a zero-extended, bitcast
	/// mask value is turned into VBROADCASTM instead.
	///
	/// \param BVOp      The BUILD_VECTOR node; its type must be a 128-, 256- or
	///                  512-bit vector.
	/// \param Subtarget Used for the AVX/AVX2/CDI/VLX feature checks.
	/// \param DAG       The DAG in which new nodes are created.
	/// \returns The broadcast node (bitcast to the build vector's type where
	/// needed) when a pattern is found, or SDValue() otherwise.
	static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  // VBROADCAST requires AVX.
	  // TODO: Splats could be generated for non-AVX CPUs using SSE
	  // instructions, but there's less potential gain for only 128-bit vectors.
	  if (!Subtarget.hasAVX())
	    return SDValue();

	  MVT VT = BVOp->getSimpleValueType(0);
	  SDLoc dl(BVOp);

	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
	         "Unsupported vector type for broadcast.");

	  BitVector UndefElements;
	  SDValue Ld = BVOp->getSplatValue(&UndefElements);

	  // Attempt to use VBROADCASTM
	  // From this pattern:
	  // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
	  // b. t1 = (build_vector t0 t0)
	  //
	  // Create (VBROADCASTM v2i1 X)
	  if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
	    MVT EltType = VT.getScalarType();
	    unsigned NumElts = VT.getVectorNumElements();
	    SDValue BOperand;
	    // On success isSplatZeroExtended rewrites NumElts/EltType to describe the
	    // widened element.
	    SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
	    if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
	        (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
	         Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
	      // NOTE(review): whenever ZeroExtended is set it takes precedence here,
	      // even if only the Ld form matched above - confirm this is intended.
	      if (ZeroExtended)
	        BOperand = ZeroExtended.getOperand(0);
	      else
	        BOperand = Ld.getOperand(0).getOperand(0);
	      MVT MaskVT = BOperand.getSimpleValueType();
	      if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
	          (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
	        SDValue Brdcst =
	            DAG.getNode(X86ISD::VBROADCASTM, dl,
	                        MVT::getVectorVT(EltType, NumElts), BOperand);
	        return DAG.getBitcast(VT, Brdcst);
	      }
	    }
	  }

	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned NumUndefElts = UndefElements.count();
	  if (!Ld || (NumElts - NumUndefElts) <= 1) {
	    APInt SplatValue, Undef;
	    unsigned SplatBitSize;
	    bool HasUndef;
	    // Check if this is a repeated constant pattern suitable for broadcasting.
	    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
	        SplatBitSize > VT.getScalarSizeInBits() &&
	        SplatBitSize < VT.getSizeInBits()) {
	      // Avoid replacing with broadcast when it's a use of a shuffle
	      // instruction to preserve the present custom lowering of shuffles.
	      if (isFoldableUseOfShuffle(BVOp))
	        return SDValue();
	      // replace BUILD_VECTOR with broadcast of the repeated constants.
	      // (AVX is already guaranteed by the early return at the top.)
	      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	      LLVMContext *Ctx = DAG.getContext();
	      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
	      if (SplatBitSize == 32 || SplatBitSize == 64 ||
	          (SplatBitSize < 32 && Subtarget.hasAVX2())) {
	        // Splatted value can fit in one INTEGER constant in constant pool.
	        // Load the constant and broadcast it.
	        MVT CVT = MVT::getIntegerVT(SplatBitSize);
	        Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
	        Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
	        SDValue CP = DAG.getConstantPool(C, PVT);
	        unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

	        Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
	        SDVTList Tys =
	            DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
	        SDValue Ops[] = {DAG.getEntryNode(), CP};
	        MachinePointerInfo MPI =
	            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
	        SDValue Brdcst = DAG.getMemIntrinsicNode(
	            X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
	            MachineMemOperand::MOLoad);
	        return DAG.getBitcast(VT, Brdcst);
	      }
	      if (SplatBitSize > 64) {
	        // Load the vector of constants and broadcast it.
	        MVT CVT = VT.getScalarType();
	        Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
	                                           *Ctx);
	        SDValue VCP = DAG.getConstantPool(VecC, PVT);
	        unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
	        Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
	        Ld = DAG.getLoad(
	            MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
	            MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	            Alignment);
	        SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
	        return DAG.getBitcast(VT, Brdcst);
	      }
	    }

	    // If we are moving a scalar into a vector (Ld must be set and all elements
	    // but 1 are undef) and that operation is not obviously supported by
	    // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
	    // That's better than general shuffling and may eliminate a load to GPR and
	    // move from scalar to vector register.
	    if (!Ld || NumElts - NumUndefElts != 1)
	      return SDValue();
	    unsigned ScalarSize = Ld.getValueSizeInBits();
	    if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
	      return SDValue();
	  }

	  bool ConstSplatVal =
	      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
	  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

	  // Make sure that all of the users of a non-constant load are from the
	  // BUILD_VECTOR node.
	  // FIXME: Is the use count needed for non-constant, non-load case?
	  if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
	    return SDValue();

	  unsigned ScalarSize = Ld.getValueSizeInBits();
	  bool IsGE256 = (VT.getSizeInBits() >= 256);

	  // When optimizing for size, generate up to 5 extra bytes for a broadcast
	  // instruction to save 8 or more bytes of constant pool data.
	  // TODO: If multiple splats are generated to load the same constant,
	  // it may be detrimental to overall size. There needs to be a way to detect
	  // that condition to know if this is truly a size win.
	  bool OptForSize = DAG.shouldOptForSize();

	  // Handle broadcasting a single constant scalar from the constant pool
	  // into a vector.
	  // On Sandybridge (no AVX2), it is still better to load a constant vector
	  // from the constant pool and not to broadcast it from a scalar.
	  // But override that restriction when optimizing for size.
	  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
	  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
	    EVT CVT = Ld.getValueType();
	    assert(!CVT.isVector() && "Must not broadcast a vector type");

	    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
	    // For size optimization, also splat v2f64 and v2i64, and for size opt
	    // with AVX2, also splat i8 and i16.
	    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
	    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
	        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
	      const Constant *C = nullptr;
	      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
	        C = CI->getConstantIntValue();
	      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
	        C = CF->getConstantFPValue();

	      assert(C && "Invalid constant type");

	      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	      SDValue CP =
	          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
	      Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();

	      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
	      SDValue Ops[] = {DAG.getEntryNode(), CP};
	      MachinePointerInfo MPI =
	          MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
	      return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
	                                     MPI, Alignment, MachineMemOperand::MOLoad);
	    }
	  }

	  // Handle AVX2 in-register broadcasts.
	  if (!IsLoad && Subtarget.hasInt256() &&
	      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
	    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

	  // The scalar source must be a normal load.
	  if (!IsLoad)
	    return SDValue();

	  // Make sure the non-chain result is only used by this build vector.
	  if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
	    return SDValue();

	  // Replace the scalar load with a broadcast-from-memory node that reuses the
	  // load's chain, address and memory operand, and move the users of the
	  // load's chain result over to the new node.
	  auto BroadcastFromLoad = [&]() {
	    auto *LN = cast<LoadSDNode>(Ld);
	    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
	    SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
	    SDValue BCast =
	        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
	                                LN->getMemoryVT(), LN->getMemOperand());
	    DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
	    return BCast;
	  };

	  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
	      (Subtarget.hasVLX() && ScalarSize == 64))
	    return BroadcastFromLoad();

	  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
	  // double since there is no vbroadcastsd xmm
	  if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
	      (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64))
	    return BroadcastFromLoad();

	  // Unsupported broadcast.
	  return SDValue();
	}

	/// Resolve the real source vector and index of an EXTRACT_VECTOR_ELT whose
	/// index \p ExtIdx is a constant.
	///
	/// If \p ExtractedFromVec is a vector shuffle and the mask element selected by
	/// the index is accepted by isUndefOrInRange for the shuffle's first operand,
	/// \p ExtractedFromVec is rewritten to that operand and the mask element is
	/// returned. In every other case \p ExtractedFromVec is left alone and the
	/// original constant index is returned.
	static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
	                                         SDValue ExtIdx) {
	  const int ExtractIdx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();

	  auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(ExtractedFromVec);
	  if (!Shuffle)
	    return ExtractIdx;

	  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
	  // lowered this:
	  //   (extract_vector_elt (v8f32 %1), Constant<6>)
	  // to:
	  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
	  //                           (extract_subvector (v8f32 %0), Constant<4>),
	  //                           undef)
	  //                       Constant<0>)
	  // In this case the vector is the extract_subvector expression and the index
	  // is 2, as specified by the shuffle.
	  SDValue ShuffleSrc = Shuffle->getOperand(0);
	  MVT ShuffleSrcVT = ShuffleSrc.getSimpleValueType();
	  assert(ShuffleSrcVT.getVectorElementType() ==
	         ExtractedFromVec.getSimpleValueType().getVectorElementType());

	  const int MaskIdx = Shuffle->getMaskElt(ExtractIdx);
	  if (!isUndefOrInRange(MaskIdx, 0, ShuffleSrcVT.getVectorNumElements()))
	    return ExtractIdx;

	  ExtractedFromVec = ShuffleSrc;
	  return MaskIdx;
	}

	/// Lower a BUILD_VECTOR whose operands are mostly extract_vector_elt nodes
	/// (with constant indices) from at most two source vectors of the same type
	/// as the result: the extracted lanes become one vector shuffle and the few
	/// remaining scalar operands are inserted afterwards.
	///
	/// Returns an empty SDValue when INSERT_VECTOR_ELT is neither legal nor custom
	/// for the type, when an extract index is not constant, when a source vector
	/// has a different type, when more than two source vectors are involved, when
	/// a third non-extract operand is found, or when nothing is extracted at all.
	static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  // Skip if insert_vec_elt is not supported.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
	    return SDValue();

	  SDLoc DL(Op);
	  unsigned NumElems = Op.getNumOperands();

	  SDValue VecIn1;
	  SDValue VecIn2;
	  SmallVector<unsigned, 4> InsertIndices;
	  SmallVector<int, 8> Mask(NumElems, -1);

	  for (unsigned Lane = 0; Lane != NumElems; ++Lane) {
	    SDValue Elt = Op.getOperand(Lane);

	    switch (Elt.getOpcode()) {
	    case ISD::UNDEF:
	      // Undef lanes keep their -1 mask entry.
	      continue;
	    case ISD::EXTRACT_VECTOR_ELT:
	      break;
	    default:
	      // Not an extract: remember the lane for a later insert, but give up
	      // once two such lanes have already been recorded.
	      if (InsertIndices.size() > 1)
	        return SDValue();
	      InsertIndices.push_back(Lane);
	      continue;
	    }

	    SDValue SrcVec = Elt.getOperand(0);
	    SDValue SrcIdx = Elt.getOperand(1);

	    // Quit if non-constant index.
	    if (!isa<ConstantSDNode>(SrcIdx))
	      return SDValue();
	    int Idx = getUnderlyingExtractedFromVec(SrcVec, SrcIdx);

	    // Quit if extracted from vector of different type.
	    if (SrcVec.getValueType() != VT)
	      return SDValue();

	    // The first source seen becomes shuffle input 1.
	    if (!VecIn1.getNode())
	      VecIn1 = SrcVec;
	    if (SrcVec == VecIn1) {
	      Mask[Lane] = Idx;
	      continue;
	    }

	    // The second distinct source becomes shuffle input 2; a third one means
	    // there are more than 2 vectors to shuffle, so quit.
	    if (!VecIn2.getNode())
	      VecIn2 = SrcVec;
	    if (SrcVec != VecIn2)
	      return SDValue();
	    Mask[Lane] = Idx + NumElems;
	  }

	  if (!VecIn1.getNode())
	    return SDValue();

	  if (!VecIn2.getNode())
	    VecIn2 = DAG.getUNDEF(VT);
	  SDValue Result = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);

	  // Patch in the scalar operands that did not come from an extract.
	  for (unsigned Lane : InsertIndices)
	    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result,
	                         Op.getOperand(Lane), DAG.getIntPtrConstant(Lane, DL));

	  return Result;
	}

	// Lower a BUILD_VECTOR whose element type is i1 (a mask vector).
	//
	// Strategy, in order:
	//  1. All-zeros / all-ones build vectors are returned unchanged.
	//  2. A splat (all non-undef operands are the same value) becomes a scalar
	//     select between all-ones and zero that is bitcast to the mask type.
	//  3. Otherwise the constant operands are packed into an integer immediate
	//     that is bitcast to the mask type (or undef if there are none), and the
	//     non-constant operands are inserted one at a time.
	// v64i1 on a non-64-bit subtarget is built from two 32-bit halves, and
	// vectors narrower than 8 bits go through v8i1 plus an EXTRACT_SUBVECTOR.
	static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {

	MVT VT = Op.getSimpleValueType();
	assert((VT.getVectorElementType() == MVT::i1) &&
	"Unexpected type in LowerBUILD_VECTORvXi1!");

	SDLoc dl(Op);
	if (ISD::isBuildVectorAllZeros(Op.getNode()) \|\|
	ISD::isBuildVectorAllOnes(Op.getNode()))
	return Op;

	// Scan the operands once:
	//  - Immediate gets bit `idx` set from the low bit of each constant operand;
	//  - NonConstIdx records the positions of non-constant operands;
	//  - IsSplat stays true while every non-undef operand equals the first one.
	uint64_t Immediate = 0;
	SmallVector<unsigned, 16> NonConstIdx;
	bool IsSplat = true;
	bool HasConstElts = false;
	int SplatIdx = -1;
	for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
	SDValue In = Op.getOperand(idx);
	if (In.isUndef())
	continue;
	if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
	Immediate \|= (InC->getZExtValue() & 0x1) << idx;
	HasConstElts = true;
	} else {
	NonConstIdx.push_back(idx);
	}
	if (SplatIdx < 0)
	SplatIdx = idx;
	else if (In != Op.getOperand(SplatIdx))
	IsSplat = false;
	}

	// for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
	// NOTE(review): if every operand were undef, SplatIdx would still be -1 here;
	// presumably such nodes never reach this function - confirm against callers.
	if (IsSplat) {
	// The build_vector allows the scalar element to be larger than the vector
	// element type. We need to mask it to use as a condition unless we know
	// the upper bits are zero.
	// FIXME: Use computeKnownBits instead of checking specific opcode?
	SDValue Cond = Op.getOperand(SplatIdx);
	assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
	if (Cond.getOpcode() != ISD::SETCC)
	Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
	DAG.getConstant(1, dl, MVT::i8));

	// Perform the select in the scalar domain so we can use cmov.
	if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
	// Select a 32-bit value and concatenate it with itself to form the
	// 64-element mask.
	SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
	DAG.getAllOnesConstant(dl, MVT::i32),
	DAG.getConstant(0, dl, MVT::i32));
	Select = DAG.getBitcast(MVT::v32i1, Select);
	return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
	} else {
	// Use an integer at least 8 bits wide; for masks narrower than 8 bits the
	// value is bitcast to v8i1 and the low VT-sized subvector is extracted.
	MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
	SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
	DAG.getAllOnesConstant(dl, ImmVT),
	DAG.getConstant(0, dl, ImmVT));
	MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
	Select = DAG.getBitcast(VecVT, Select);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
	DAG.getIntPtrConstant(0, dl));
	}
	}

	// insert elements one by one
	// Start from the packed constant bits when there are any, otherwise from
	// undef.
	SDValue DstVec;
	if (HasConstElts) {
	if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
	// Split the 64-bit immediate into two 32-bit constants and concatenate
	// their v32i1 bitcasts.
	SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
	SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
	ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
	ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
	DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
	} else {
	MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
	SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
	MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
	DstVec = DAG.getBitcast(VecVT, Imm);
	DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
	DAG.getIntPtrConstant(0, dl));
	}
	} else
	DstVec = DAG.getUNDEF(VT);

	// Insert each non-constant operand at its original position.
	for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
	unsigned InsertIdx = NonConstIdx[i];
	DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
	Op.getOperand(InsertIdx),
	DAG.getIntPtrConstant(InsertIdx, dl));
	}
	return DstVec;
	}

	/// This is a helper function of LowerToHorizontalOp().
	/// This function checks that the build_vector \p N in input implements a
	/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
	/// may not match the layout of an x86 256-bit horizontal instruction.
	/// In other words, if this returns true, then some extraction/insertion will
	/// be required to produce a valid horizontal instruction.
	///
	/// Parameter \p Opcode defines the kind of horizontal operation to match.
	/// For example, if \p Opcode is equal to ISD::ADD, then this function
	/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
	/// is equal to ISD::SUB, then this function checks if this is a horizontal
	/// arithmetic sub.
	///
	/// This function only analyzes elements of \p N whose indices are
	/// in range [BaseIdx, LastIdx).
	///
	/// \p V0 and \p V1 are outputs. Both are first reset to undef; \p V0 is then
	/// bound to the source vector of the first non-undef element found in the
	/// first half of the analyzed range and \p V1 to that of the second half.
	/// They may hold partial results when false is returned.
	///
	/// TODO: This function was originally used to match both real and fake partial
	/// horizontal operations, but the index-matching logic is incorrect for that.
	/// See the corrected implementation in isHopBuildVector(). Can we reduce this
	/// code because it is only used for partial h-op matching now?
	static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
	SelectionDAG &DAG,
	unsigned BaseIdx, unsigned LastIdx,
	SDValue &V0, SDValue &V1) {
	EVT VT = N->getValueType(0);
	assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
	assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
	assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
	"Invalid Vector in input!");

	// Only ADD/FADD may have their two extract operands in swapped order.
	bool IsCommutable = (Opcode == ISD::ADD \|\| Opcode == ISD::FADD);
	bool CanFold = true;
	// Extract index expected for the current element: it starts at BaseIdx,
	// advances by 2 per element, and restarts at BaseIdx at the midpoint of the
	// analyzed range (where the source switches from V0 to V1).
	unsigned ExpectedVExtractIdx = BaseIdx;
	unsigned NumElts = LastIdx - BaseIdx;
	V0 = DAG.getUNDEF(VT);
	V1 = DAG.getUNDEF(VT);

	// Check if N implements a horizontal binop.
	for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
	SDValue Op = N->getOperand(i + BaseIdx);

	// Skip UNDEFs.
	if (Op->isUndef()) {
	// Update the expected vector extract index.
	if (i * 2 == NumElts)
	ExpectedVExtractIdx = BaseIdx;
	ExpectedVExtractIdx += 2;
	continue;
	}

	// Each element must be a single-use node with the requested opcode.
	CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

	if (!CanFold)
	break;

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Try to match the following pattern:
	// (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
	CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	Op0.getOperand(0) == Op1.getOperand(0) &&
	isa<ConstantSDNode>(Op0.getOperand(1)) &&
	isa<ConstantSDNode>(Op1.getOperand(1)));
	if (!CanFold)
	break;

	unsigned I0 = Op0.getConstantOperandVal(1);
	unsigned I1 = Op1.getConstantOperandVal(1);

	// Bind the source vector for this half on first sight; it must have the
	// same type as the build_vector itself.
	if (i * 2 < NumElts) {
	if (V0.isUndef()) {
	V0 = Op0.getOperand(0);
	if (V0.getValueType() != VT)
	return false;
	}
	} else {
	if (V1.isUndef()) {
	V1 = Op0.getOperand(0);
	if (V1.getValueType() != VT)
	return false;
	}
	if (i * 2 == NumElts)
	ExpectedVExtractIdx = BaseIdx;
	}

	// The extracts must read the adjacent pair (Expected, Expected + 1) from
	// the vector bound to this half.
	SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
	if (I0 == ExpectedVExtractIdx)
	CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
	else if (IsCommutable && I1 == ExpectedVExtractIdx) {
	// Try to match the following dag sequence:
	// (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
	CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
	} else
	CanFold = false;

	ExpectedVExtractIdx += 2;
	}

	return CanFold;
	}

	/// Emit a sequence of two 128-bit horizontal add/sub followed by
	/// a concat_vector.
	///
	/// This is a helper function of LowerToHorizontalOp().
	/// This function expects two 256-bit vectors called V0 and V1.
	/// At first, each vector is split into two separate 128-bit vectors.
	/// Then, the resulting 128-bit vectors are used to implement two
	/// horizontal binary operations.
	///
	/// The kind of horizontal binary operation is defined by \p X86Opcode.
	///
	/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
	/// the two new horizontal binop.
	/// When Mode is set, the first horizontal binop dag node would take as input
	/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
	/// horizontal binop dag node would take as input the lower 128-bit of V1
	/// and the upper 128-bit of V1.
	/// Example:
	/// HADD V0_LO, V0_HI
	/// HADD V1_LO, V1_HI
	///
	/// Otherwise, the first horizontal binop dag node takes as input the lower
	/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
	/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
	/// Example:
	/// HADD V0_LO, V1_LO
	/// HADD V0_HI, V1_HI
	///
	/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
	/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
	/// the upper 128-bits of the result.
	static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
	                                     const SDLoc &DL, SelectionDAG &DAG,
	                                     unsigned X86Opcode, bool Mode,
	                                     bool isUndefLO, bool isUndefHI) {
	  MVT VT = V0.getSimpleValueType();
	  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
	         "Invalid nodes in input!");

	  // Split both 256-bit inputs into their 128-bit halves.
	  const unsigned HalfIdx = VT.getVectorNumElements() / 2;
	  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
	  SDValue V0_HI = extract128BitVector(V0, HalfIdx, DAG, DL);
	  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
	  SDValue V1_HI = extract128BitVector(V1, HalfIdx, DAG, DL);
	  MVT HalfVT = V0_LO.getSimpleValueType();

	  // Both result halves default to undef and are only replaced by a real
	  // horizontal op when the result is not expected to be UNDEF.
	  SDValue LO = DAG.getUNDEF(HalfVT);
	  SDValue HI = DAG.getUNDEF(HalfVT);

	  // Pick the operand pairing and the "worth emitting" test for each half:
	  //   Mode set:   LO = op(V0_LO, V0_HI), HI = op(V1_LO, V1_HI)
	  //   Mode clear: LO = op(V0_LO, V1_LO), HI = op(V0_HI, V1_HI)
	  bool EmitLO = !isUndefLO;
	  bool EmitHI = !isUndefHI;
	  SDValue LoLHS = V0_LO;
	  SDValue LoRHS;
	  SDValue HiLHS;
	  SDValue HiRHS = V1_HI;
	  if (Mode) {
	    EmitLO = EmitLO && !V0->isUndef();
	    EmitHI = EmitHI && !V1->isUndef();
	    LoRHS = V0_HI;
	    HiLHS = V1_LO;
	  } else {
	    EmitLO = EmitLO && !(V0_LO->isUndef() && V1_LO->isUndef());
	    EmitHI = EmitHI && !(V0_HI->isUndef() && V1_HI->isUndef());
	    LoRHS = V1_LO;
	    HiLHS = V0_HI;
	  }

	  if (EmitLO)
	    LO = DAG.getNode(X86Opcode, DL, HalfVT, LoLHS, LoRHS);
	  if (EmitHI)
	    HI = DAG.getNode(X86Opcode, DL, HalfVT, HiLHS, HiRHS);

	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
	}

	/// Returns true iff \p BV builds a vector with the result equivalent to
	/// the result of ADDSUB/SUBADD operation.
	/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
	/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
	/// \p Opnd0 and \p Opnd1.
	///
	/// Requires SSE3 and a floating-point vector type; otherwise false is
	/// returned before any output is touched.
	/// \p NumExtracts receives the number of non-undef elements that were matched
	/// (it may hold a partial count when false is returned).
	/// \p IsSubAdd is written only on success and is true when the even-indexed
	/// elements are FADDs (and the odd-indexed ones FSUBs), i.e. the SUBADD form.
	static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
	const X86Subtarget &Subtarget, SelectionDAG &DAG,
	SDValue &Opnd0, SDValue &Opnd1,
	unsigned &NumExtracts,
	bool &IsSubAdd) {

	MVT VT = BV->getSimpleValueType(0);
	if (!Subtarget.hasSSE3() \|\| !VT.isFloatingPoint())
	return false;

	unsigned NumElts = VT.getVectorNumElements();
	SDValue InVec0 = DAG.getUNDEF(VT);
	SDValue InVec1 = DAG.getUNDEF(VT);

	NumExtracts = 0;

	// Odd-numbered elements in the input build vector are obtained from
	// adding/subtracting two float elements.
	// Even-numbered elements in the input build vector are obtained from
	// subtracting/adding two float elements.
	// Opc[0] / Opc[1] record the opcode seen for even / odd element indices
	// (0 means "none seen yet").
	unsigned Opc[2] = {0, 0};
	for (unsigned i = 0, e = NumElts; i != e; ++i) {
	SDValue Op = BV->getOperand(i);

	// Skip 'undef' values.
	unsigned Opcode = Op.getOpcode();
	if (Opcode == ISD::UNDEF)
	continue;

	// Early exit if we found an unexpected opcode.
	if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
	return false;

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Try to match the following pattern:
	// (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
	// Early exit if we cannot match that sequence.
	if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	!isa<ConstantSDNode>(Op0.getOperand(1)) \|\|
	Op0.getOperand(1) != Op1.getOperand(1))
	return false;

	// Both extracts must read lane i, the lane being built.
	unsigned I0 = Op0.getConstantOperandVal(1);
	if (I0 != i)
	return false;

	// We found a valid add/sub node, make sure its the same opcode as previous
	// elements for this parity.
	if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
	return false;
	Opc[i % 2] = Opcode;

	// Update InVec0 and InVec1.
	// They are bound on first sight and must have the build vector's type.
	if (InVec0.isUndef()) {
	InVec0 = Op0.getOperand(0);
	if (InVec0.getSimpleValueType() != VT)
	return false;
	}
	if (InVec1.isUndef()) {
	InVec1 = Op1.getOperand(0);
	if (InVec1.getSimpleValueType() != VT)
	return false;
	}

	// Make sure that operands in input to each add/sub node always
	// come from a same pair of vectors.
	if (InVec0 != Op0.getOperand(0)) {
	if (Opcode == ISD::FSUB)
	return false;

	// FADD is commutable. Try to commute the operands
	// and then test again.
	std::swap(Op0, Op1);
	if (InVec0 != Op0.getOperand(0))
	return false;
	}

	if (InVec1 != Op1.getOperand(0))
	return false;

	// Increment the number of extractions done.
	++NumExtracts;
	}

	// Ensure we have found an opcode for both parities and that they are
	// different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
	// inputs are undef.
	if (!Opc[0] \|\| !Opc[1] \|\| Opc[0] == Opc[1] \|\|
	InVec0.isUndef() \|\| InVec1.isUndef())
	return false;

	IsSubAdd = Opc[0] == ISD::FADD;

	Opnd0 = InVec0;
	Opnd1 = InVec1;
	return true;
	}

	/// Returns true if is possible to fold MUL and an idiom that has already been
	/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
	/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
	/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
	///
	/// Prior to calling this function it should be known that there is some
	/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
	/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
	/// before replacement of such SDNode with ADDSUB operation. Thus the number
	/// of \p Opnd0 uses is expected to be equal to 2.
	/// For example, this function may be called for the following IR:
	/// %AB = fmul fast <2 x double> %A, %B
	/// %Sub = fsub fast <2 x double> %AB, %C
	/// %Add = fadd fast <2 x double> %AB, %C
	/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
	/// <2 x i32> <i32 0, i32 3>
	/// There is a def for %Addsub here, which potentially can be replaced by
	/// X86ISD::ADDSUB operation:
	/// %Addsub = X86ISD::ADDSUB %AB, %C
	/// and such ADDSUB can further be replaced with FMADDSUB:
	/// %Addsub = FMADDSUB %A, %B, %C.
	///
	/// The main reason why this method is called before the replacement of the
	/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
	/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
	/// FMADDSUB is.
	static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
	SelectionDAG &DAG,
	SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
	unsigned ExpectedUses) {
	if (Opnd0.getOpcode() != ISD::FMUL \|\|
	!Opnd0->hasNUsesOfValue(ExpectedUses, 0) \|\| !Subtarget.hasAnyFMA())
	return false;

	// FIXME: These checks must match the similar ones in
	// DAGCombiner::visitFADDForFMACombine. It would be good to have one
	// function that would answer if it is Ok to fuse MUL + ADD to FMADD
	// or MUL + ADDSUB to FMADDSUB.
	const TargetOptions &Options = DAG.getTarget().Options;
	bool AllowFusion =
	(Options.AllowFPOpFusion == FPOpFusion::Fast \|\| Options.UnsafeFPMath);
	if (!AllowFusion)
	return false;

	Opnd2 = Opnd1;
	Opnd1 = Opnd0.getOperand(1);
	Opnd0 = Opnd0.getOperand(0);

	return true;
	}

	/// Try to lower a build_vector that implements an 'addsub', 'fmaddsub' or
	/// 'fmsubadd' pattern to an X86ISD::ADDSUB, X86ISD::FMADDSUB or
	/// X86ISD::FMSUBADD node respectively. Returns an empty SDValue when the
	/// pattern is not recognized or no suitable node can be produced.
	static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  SDValue Src0, Src1;
	  unsigned MatchedElts = 0;
	  bool SubAddForm = false;
	  if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Src0, Src1, MatchedElts,
	                        SubAddForm))
	    return SDValue();

	  MVT VT = BV->getSimpleValueType(0);
	  SDLoc DL(BV);

	  // Prefer the fused form: try to generate an X86ISD::FMADDSUB/FMSUBADD node.
	  SDValue Src2;
	  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Src0, Src1, Src2, MatchedElts))
	    return DAG.getNode(SubAddForm ? X86ISD::FMSUBADD : X86ISD::FMADDSUB, DL,
	                       VT, Src0, Src1, Src2);

	  // Without fusion only ADDSUB is supported (there is no SUBADD node), and
	  // not for 512-bit types: there are no known X86 targets with 512-bit ADDSUB
	  // instructions! 512-bit ADDSUB idiom recognition was needed only as part of
	  // FMADDSUB idiom recognition.
	  const bool CanEmitAddSub = !SubAddForm && !VT.is512BitVector();
	  return CanEmitAddSub ? DAG.getNode(X86ISD::ADDSUB, DL, VT, Src0, Src1)
	                       : SDValue();
	}

	static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
	unsigned &HOpcode, SDValue &V0, SDValue &V1) {
	// Initialize outputs to known values.
	MVT VT = BV->getSimpleValueType(0);
	HOpcode = ISD::DELETED_NODE;
	V0 = DAG.getUNDEF(VT);
	V1 = DAG.getUNDEF(VT);

	// x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
	// half of the result is calculated independently from the 128-bit halves of
	// the inputs, so that makes the index-checking logic below more complicated.
	unsigned NumElts = VT.getVectorNumElements();
	unsigned GenericOpcode = ISD::DELETED_NODE;
	unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
	unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
	unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
	for (unsigned i = 0; i != Num128BitChunks; ++i) {
	for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
	// Ignore undef elements.
	SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
	if (Op.isUndef())
	continue;

	// If there's an opcode mismatch, we're done.
	if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
	return false;

	// Initialize horizontal opcode.
	if (HOpcode == ISD::DELETED_NODE) {
	GenericOpcode = Op.getOpcode();
	switch (GenericOpcode) {
	case ISD::ADD: HOpcode = X86ISD::HADD; break;
	case ISD::SUB: HOpcode = X86ISD::HSUB; break;
	case ISD::FADD: HOpcode = X86ISD::FHADD; break;
	case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
	default: return false;
	}
	}

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Op0.getOperand(0) != Op1.getOperand(0) \|\|
	!isa<ConstantSDNode>(Op0.getOperand(1)) \|\|
	!isa<ConstantSDNode>(Op1.getOperand(1)) \|\| !Op.hasOneUse())
	return false;

	// The source vector is chosen based on which 64-bit half of the
	// destination vector is being calculated.
	if (j < NumEltsIn64Bits) {
	if (V0.isUndef())
	V0 = Op0.getOperand(0);
	} else {
	if (V1.isUndef())
	V1 = Op0.getOperand(0);
	}

	SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
	if (SourceVec != Op0.getOperand(0))
	return false;

	// op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
	unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
	unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
	unsigned ExpectedIndex = i * NumEltsIn128Bits +
	(j % NumEltsIn64Bits) * 2;
	if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
	continue;

	// If this is not a commutative op, this does not match.
	if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
	return false;

	// Addition is commutative, so try swapping the extract indexes.
	// op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
	if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
	continue;

	// Extract indexes do not match horizontal requirement.
	return false;
	}
	}
	// We matched. Opcode and operands are returned by reference as arguments.
	return true;
	}

	/// Emit the horizontal-op node \p HOpcode that computes build vector \p BV
	/// from the source vectors \p V0 and \p V1.
	///
	/// The sources are first resized to the width of \p BV's type. If \p BV is a
	/// 256-bit vector whose upper-half operands are all undef, the operation is
	/// performed on the low 128 bits only and the result is inserted into an
	/// UNDEF vector of the full type.
	static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
	                                    SelectionDAG &DAG, unsigned HOpcode,
	                                    SDValue V0, SDValue V1) {
	  const SDLoc Loc(BV);
	  MVT ResVT = BV->getSimpleValueType(0);
	  unsigned ResBits = ResVT.getSizeInBits();

	  // Bring a source vector to the build vector's width by taking its low bits
	  // (too wide) or placing it in the low bits of an UNDEF vector (too narrow).
	  // This is free (examples: zmm --> xmm, xmm --> ymm).
	  auto ResizeToResult = [&](SDValue Vec) -> SDValue {
	    if (Vec.getValueSizeInBits() > ResBits)
	      return extractSubVector(Vec, 0, DAG, Loc, ResBits);
	    if (Vec.getValueSizeInBits() < ResBits)
	      return insertSubVector(DAG.getUNDEF(ResVT), Vec, 0, DAG, Loc, ResBits);
	    return Vec;
	  };
	  V0 = ResizeToResult(V0);
	  V1 = ResizeToResult(V1);

	  // True when no operand in the upper half of the build vector is demanded.
	  unsigned EltCount = ResVT.getVectorNumElements();
	  unsigned HalfCount = EltCount / 2;
	  auto UpperHalfIsUndef = [&]() -> bool {
	    for (unsigned Lane = HalfCount; Lane != EltCount; ++Lane)
	      if (!BV->getOperand(Lane).isUndef())
	        return false;
	    return true;
	  };

	  // If we don't need the upper xmm, then perform as a xmm hop.
	  if (ResVT.is256BitVector() && UpperHalfIsUndef()) {
	    MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
	    V0 = extractSubVector(V0, 0, DAG, Loc, 128);
	    V1 = extractSubVector(V1, 0, DAG, Loc, 128);
	    SDValue HalfHop = DAG.getNode(HOpcode, Loc, HalfVT, V0, V1);
	    return insertSubVector(DAG.getUNDEF(ResVT), HalfHop, 0, DAG, Loc, 256);
	  }

	  return DAG.getNode(HOpcode, Loc, ResVT, V0, V1);
	}

	/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
	///
	/// The lowering is attempted in three stages:
	///  1. For the types that have a native horizontal op on this subtarget,
	///     match the whole build vector with isHopBuildVector().
	///  2. For 256-bit integer types (v8i32/v16i16) on AVX, match each half
	///     separately as ADD, then as SUB, and expand via
	///     ExpandHorizontalBinOp().
	///  3. For any 256-bit type handled here, match the full vector as
	///     ADD/SUB/FADD/FSUB and expand via ExpandHorizontalBinOp().
	///
	/// \returns the lowered node, or a null SDValue if the build vector has fewer
	///          than two non-undef operands, no pattern matches, or a half of the
	///          vector has exactly one non-undef element (a scalar op is used
	///          for that case instead).
	static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  // We need at least 2 non-undef elements to make this worthwhile by default.
	  unsigned NumNonUndefs =
	      count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
	  if (NumNonUndefs < 2)
	    return SDValue();

	  // There are 4 sets of horizontal math operations distinguished by type:
	  // int/FP at 128-bit/256-bit. Each type was introduced with a different
	  // subtarget feature. Try to match those "native" patterns first.
	  MVT VT = BV->getSimpleValueType(0);
	  if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
	      ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
	      ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
	      ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
	    unsigned HOpcode;
	    SDValue V0, V1;
	    if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
	      return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
	  }

	  // Try harder to match 256-bit ops by using extract/concat.
	  if (!Subtarget.hasAVX() || !VT.is256BitVector())
	    return SDValue();

	  // Count the number of UNDEF operands in the build_vector in input.
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned Half = NumElts / 2;
	  unsigned NumUndefsLO = 0;
	  unsigned NumUndefsHI = 0;
	  for (unsigned i = 0, e = Half; i != e; ++i)
	    if (BV->getOperand(i)->isUndef())
	      NumUndefsLO++;

	  for (unsigned i = Half, e = NumElts; i != e; ++i)
	    if (BV->getOperand(i)->isUndef())
	      NumUndefsHI++;

	  SDLoc DL(BV);
	  SDValue InVec0, InVec1;
	  if (VT == MVT::v8i32 || VT == MVT::v16i16) {
	    SDValue InVec2, InVec3;
	    unsigned X86Opcode;
	    bool CanFold = true;

	    // Match the low and high halves independently. The halves are compatible
	    // when, for each input position, either side is undef or both sides name
	    // the same vector. The InVec* variables are reused between the ADD and
	    // SUB attempts, so the order of these calls matters.
	    if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
	        isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
	                              InVec3) &&
	        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
	        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
	      X86Opcode = X86ISD::HADD;
	    else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
	                                   InVec1) &&
	             isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
	                                   InVec3) &&
	             ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
	             ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
	      X86Opcode = X86ISD::HSUB;
	    else
	      CanFold = false;

	    if (CanFold) {
	      // Do not try to expand this build_vector into a pair of horizontal
	      // add/sub if we can emit a pair of scalar add/sub.
	      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
	        return SDValue();

	      // Convert this build_vector into a pair of horizontal binops followed by
	      // a concat vector. We must adjust the outputs from the partial horizontal
	      // matching calls above to account for undefined vector halves.
	      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
	      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
	      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
	      bool isUndefLO = NumUndefsLO == Half;
	      bool isUndefHI = NumUndefsHI == Half;
	      return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
	                                   isUndefHI);
	    }
	  }

	  if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
	      VT == MVT::v16i16) {
	    // Match across the full vector, trying each opcode in turn.
	    unsigned X86Opcode;
	    if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
	      X86Opcode = X86ISD::HADD;
	    else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
	                                   InVec1))
	      X86Opcode = X86ISD::HSUB;
	    else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
	                                   InVec1))
	      X86Opcode = X86ISD::FHADD;
	    else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
	                                   InVec1))
	      X86Opcode = X86ISD::FHSUB;
	    else
	      return SDValue();

	    // Don't try to expand this build_vector into a pair of horizontal add/sub
	    // if we can simply emit a pair of scalar add/sub.
	    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
	      return SDValue();

	    // Convert this build_vector into two horizontal add/sub followed by
	    // a concat vector.
	    bool isUndefLO = NumUndefsLO == Half;
	    bool isUndefHI = NumUndefsHI == Half;
	    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
	                                 isUndefLO, isUndefHI);
	  }

	  return SDValue();
	}

	// Forward declaration: lowerBuildVectorToBitOp() below passes the vector
	// shift it builds to LowerShift(), whose definition is not in this part of
	// the file.
	static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG);

	/// If every source element of a BUILD_VECTOR applies the same bit operation
	/// (shift or AND/XOR/OR) and has a constant right-hand operand, rewrite it as
	/// that operation applied to two BUILD_VECTORs: one of the left-hand operands
	/// and one of the constants.
	/// NOTE: It's not in our interest to turn this into a general purpose
	/// vectorizer, but enough scalar bit operations are created by the later
	/// legalization + scalarization stages to need basic support.
	///
	/// \returns the vector operation (already passed through LowerShift() for
	///          shifts), or a null SDValue if the pattern does not apply.
	static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  MVT VT = Op->getSimpleValueType(0);
	  SDLoc DL(Op);

	  // Every element must be produced by one and the same opcode.
	  // TODO: Should we allow UNDEFS and if so how many?
	  const unsigned Opcode = Op->getOperand(0).getOpcode();
	  const unsigned NumElems = VT.getVectorNumElements();
	  for (unsigned EltIdx = 1; EltIdx < NumElems; ++EltIdx)
	    if (Op->getOperand(EltIdx).getOpcode() != Opcode)
	      return SDValue();

	  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
	  const bool IsShift =
	      Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA;
	  const bool IsLogic =
	      Opcode == ISD::AND || Opcode == ISD::XOR || Opcode == ISD::OR;
	  if (!IsShift && !IsLogic)
	    return SDValue();

	  if (IsLogic) {
	    // Don't do this if the buildvector is a splat - we'd replace one
	    // constant with an entire vector.
	    if (Op->getSplatValue())
	      return SDValue();
	    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
	      return SDValue();
	  }

	  // Split each scalar operation into its left-hand value and its constant.
	  SmallVector<SDValue, 4> LHSElts, RHSElts;
	  for (unsigned EltIdx = 0, E = Op->getNumOperands(); EltIdx != E; ++EltIdx) {
	    SDValue Elt = Op->getOperand(EltIdx);
	    SDValue Value = Elt.getOperand(0);
	    SDValue Cst = Elt.getOperand(1);

	    // We expect the canonicalized RHS operand to be the constant.
	    if (!isa<ConstantSDNode>(Cst))
	      return SDValue();

	    // A constant of a different width is only acceptable as a shift amount,
	    // in which case it is extended/truncated to the element type.
	    if (Cst.getValueSizeInBits() != VT.getScalarSizeInBits()) {
	      if (!IsShift)
	        return SDValue();
	      Cst = DAG.getZExtOrTrunc(Cst, DL, VT.getScalarType());
	    }

	    LHSElts.push_back(Value);
	    RHSElts.push_back(Cst);
	  }

	  // Limit to shifts by uniform immediates.
	  // TODO: Only accept vXi8/vXi64 special cases?
	  // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
	  if (IsShift)
	    for (SDValue Amt : RHSElts)
	      if (RHSElts[0] != Amt)
	        return SDValue();

	  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
	  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
	  SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);

	  // Immediately lower a shift to ensure the constant build vector doesn't
	  // get converted to a constant pool before the shift is lowered.
	  return IsShift ? LowerShift(Res, Subtarget, DAG) : Res;
	}

	/// Create a vector constant without a load. SSE/AVX provide the bare minimum
	/// functionality to do this, so it's all zeros, all ones, or some derivation
	/// that is cheap to calculate.
	///
	/// \param Op  The BUILD_VECTOR node to inspect.
	/// \returns \p Op itself when it is all-zeros, or all-ones of type
	///          v4i32/v8i32/v16i32 with SSE2; the result of getOnesVector() for
	///          any other all-ones type with SSE2; otherwise a null SDValue.
	static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();

	  // Vectors containing all zeros can be matched by pxor and xorps.
	  if (ISD::isBuildVectorAllZeros(Op.getNode()))
	    return Op;

	  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
	  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
	  // vpcmpeqd on 256-bit vectors.
	  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
	    // The i32-element forms are kept as they are.
	    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
	      return Op;

	    return getOnesVector(VT, DAG, DL);
	  }

	  return SDValue();
	}

	/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
	/// from a vector of source values and a vector of extraction indices.
	/// The vectors might be manipulated to match the type of the permute op.
	///
	/// \param VT          Type of the permute to create.
	/// \param SrcVec      Vector supplying the elements. It may be narrower than
	///                    \p VT (it is widened) or a whole multiple wider (the
	///                    permute is done at the wider size and the low part is
	///                    extracted).
	/// \param IndicesVec  Vector of element indices. Must have at least as many
	///                    elements as \p VT (asserted); extra elements are
	///                    dropped and the element type is extended/truncated to
	///                    the integer equivalent of \p VT.
	/// \param DL          Debug location used for the created nodes.
	/// \returns the permute, or a null SDValue when no lowering exists for this
	///          type on the current subtarget.
	static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
	                                     SDLoc &DL, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  MVT ShuffleVT = VT;
	  EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned SizeInBits = VT.getSizeInBits();

	  // Adjust IndicesVec to match VT size.
	  assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
	         "Illegal variable permute mask size");
	  if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
	    IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
	                                  NumElts * VT.getScalarSizeInBits());
	  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);

	  // Handle SrcVec that don't match VT type.
	  if (SrcVec.getValueSizeInBits() != SizeInBits) {
	    if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
	      // Handle larger SrcVec by treating it as a larger permute.
	      unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
	      VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
	      IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
	      IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
	                                  Subtarget, DAG, SDLoc(IndicesVec));
	      SDValue NewSrcVec =
	          createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
	      if (NewSrcVec)
	        return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
	      return SDValue();
	    } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
	      // Widen smaller SrcVec to match VT.
	      SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
	    } else
	      return SDValue();
	  }

	  // Rewrites an index vector for use by a permute with Scale-times narrower
	  // elements: each index becomes Scale consecutive sub-element indices,
	  // computed as Idx * IndexScale + IndexOffset.
	  auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
	    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
	    EVT SrcVT = Idx.getValueType();
	    unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
	    uint64_t IndexScale = 0;
	    uint64_t IndexOffset = 0;

	    // If we're scaling a smaller permute op, then we need to repeat the
	    // indices, scaling and offsetting them as well.
	    // e.g. v4i32 -> v16i8 (Scale = 4)
	    // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
	    // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
	    for (uint64_t i = 0; i != Scale; ++i) {
	      IndexScale |= Scale << (i * NumDstBits);
	      IndexOffset |= i << (i * NumDstBits);
	    }

	    Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
	                      DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
	    Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
	                      DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
	    return Idx;
	  };

	  // Pick the permute opcode (and, where it differs from VT, the type the
	  // permute is performed in). Several cases return directly with a custom
	  // sequence. Opcode stays 0 when nothing suitable is available.
	  unsigned Opcode = 0;
	  switch (VT.SimpleTy) {
	  default:
	    break;
	  case MVT::v16i8:
	    if (Subtarget.hasSSSE3())
	      Opcode = X86ISD::PSHUFB;
	    break;
	  case MVT::v8i16:
	    if (Subtarget.hasVLX() && Subtarget.hasBWI())
	      Opcode = X86ISD::VPERMV;
	    else if (Subtarget.hasSSSE3()) {
	      Opcode = X86ISD::PSHUFB;
	      ShuffleVT = MVT::v16i8;
	    }
	    break;
	  case MVT::v4f32:
	  case MVT::v4i32:
	    if (Subtarget.hasAVX()) {
	      Opcode = X86ISD::VPERMILPV;
	      ShuffleVT = MVT::v4f32;
	    } else if (Subtarget.hasSSSE3()) {
	      Opcode = X86ISD::PSHUFB;
	      ShuffleVT = MVT::v16i8;
	    }
	    break;
	  case MVT::v2f64:
	  case MVT::v2i64:
	    if (Subtarget.hasAVX()) {
	      // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
	      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
	      Opcode = X86ISD::VPERMILPV;
	      ShuffleVT = MVT::v2f64;
	    } else if (Subtarget.hasSSE41()) {
	      // SSE41 can compare v2i64 - select between indices 0 and 1.
	      return DAG.getSelectCC(
	          DL, IndicesVec,
	          getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
	          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
	          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
	          ISD::CondCode::SETEQ);
	    }
	    break;
	  case MVT::v32i8:
	    if (Subtarget.hasVLX() && Subtarget.hasVBMI())
	      Opcode = X86ISD::VPERMV;
	    else if (Subtarget.hasXOP()) {
	      // Apply VPPERM to both source halves once per 128-bit half of the
	      // index vector, then concatenate the two results.
	      SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
	      SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
	      SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
	      SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
	      return DAG.getNode(
	          ISD::CONCAT_VECTORS, DL, VT,
	          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
	          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
	    } else if (Subtarget.hasAVX()) {
	      SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
	      SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
	      SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
	      SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
	      auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	                              ArrayRef<SDValue> Ops) {
	        // Permute Lo and Hi and then select based on index range.
	        // This works as SHUFB uses bits[3:0] to permute elements and we don't
	        // care about the bit[7] as its just an index vector.
	        SDValue Idx = Ops[2];
	        EVT VT = Idx.getValueType();
	        return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
	                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
	                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
	                               ISD::CondCode::SETGT);
	      };
	      SDValue Ops[] = {LoLo, HiHi, IndicesVec};
	      return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
	                              PSHUFBBuilder);
	    }
	    break;
	  case MVT::v16i16:
	    if (Subtarget.hasVLX() && Subtarget.hasBWI())
	      Opcode = X86ISD::VPERMV;
	    else if (Subtarget.hasAVX()) {
	      // Scale to v32i8 and perform as v32i8.
	      IndicesVec = ScaleIndices(IndicesVec, 2);
	      return DAG.getBitcast(
	          VT, createVariablePermute(
	                  MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
	                  DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
	    }
	    break;
	  case MVT::v8f32:
	  case MVT::v8i32:
	    if (Subtarget.hasAVX2())
	      Opcode = X86ISD::VPERMV;
	    else if (Subtarget.hasAVX()) {
	      SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
	      SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
	                                          {0, 1, 2, 3, 0, 1, 2, 3});
	      SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
	                                          {4, 5, 6, 7, 4, 5, 6, 7});
	      if (Subtarget.hasXOP())
	        return DAG.getBitcast(
	            VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
	                            IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
	      // Permute Lo and Hi and then select based on index range.
	      // This works as VPERMILPS only uses index bits[0:1] to permute elements.
	      SDValue Res = DAG.getSelectCC(
	          DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
	          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
	          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
	          ISD::CondCode::SETGT);
	      return DAG.getBitcast(VT, Res);
	    }
	    break;
	  case MVT::v4i64:
	  case MVT::v4f64:
	    if (Subtarget.hasAVX512()) {
	      if (!Subtarget.hasVLX()) {
	        // Without VLX, widen both vectors to 8 elements, permute at that
	        // width and take the low 256 bits of the result.
	        MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
	        SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
	                                SDLoc(SrcVec));
	        IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
	                                    DAG, SDLoc(IndicesVec));
	        SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
	                                            DAG, Subtarget);
	        return extract256BitVector(Res, 0, DAG, DL);
	      }
	      Opcode = X86ISD::VPERMV;
	    } else if (Subtarget.hasAVX()) {
	      SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
	      SDValue LoLo =
	          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
	      SDValue HiHi =
	          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
	      // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
	      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
	      if (Subtarget.hasXOP())
	        return DAG.getBitcast(
	            VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
	                            IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
	      // Permute Lo and Hi and then select based on index range.
	      // This works as VPERMILPD only uses index bit[1] to permute elements.
	      SDValue Res = DAG.getSelectCC(
	          DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
	          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
	          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
	          ISD::CondCode::SETGT);
	      return DAG.getBitcast(VT, Res);
	    }
	    break;
	  case MVT::v64i8:
	    if (Subtarget.hasVBMI())
	      Opcode = X86ISD::VPERMV;
	    break;
	  case MVT::v32i16:
	    if (Subtarget.hasBWI())
	      Opcode = X86ISD::VPERMV;
	    break;
	  case MVT::v16f32:
	  case MVT::v16i32:
	  case MVT::v8f64:
	  case MVT::v8i64:
	    if (Subtarget.hasAVX512())
	      Opcode = X86ISD::VPERMV;
	    break;
	  }
	  if (!Opcode)
	    return SDValue();

	  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
	         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
	         "Illegal variable permute shuffle type");

	  // When the permute runs on narrower elements than VT, expand each index
	  // into the matching group of narrow-element indices.
	  uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
	  if (Scale > 1)
	    IndicesVec = ScaleIndices(IndicesVec, Scale);

	  EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
	  IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);

	  // VPERMV takes the index vector as its first operand; the other opcodes
	  // take the source vector first.
	  SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
	  SDValue Res = Opcode == X86ISD::VPERMV
	                    ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
	                    : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
	  return DAG.getBitcast(VT, Res);
	}

	// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
	// reasoned to be a permutation of a vector by indices in a non-constant vector.
	// (build_vector (extract_elt V, (extract_elt I, 0)),
	//               (extract_elt V, (extract_elt I, 1)),
	//               ...
	// ->
	// (vpermv I, V)
	//
	// Returns the result of createVariablePermute() on a match, or a null SDValue
	// if any operand of V does not have the required form.
	//
	// TODO: Handle undefs
	// TODO: Utilize pshufb and zero mask blending to support more efficient
	// construction of vectors with constant-0 elements.
	static SDValue
	LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  SDValue SrcVec, IndicesVec;
	  // Check for a match of the permute source vector and permute index elements.
	  // This is done by checking that the i-th build_vector operand is of the form:
	  // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
	  for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
	    SDValue Op = V.getOperand(Idx);
	    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    // If this is the first extract encountered in V, set the source vector,
	    // otherwise verify the extract is from the previously defined source
	    // vector.
	    if (!SrcVec)
	      SrcVec = Op.getOperand(0);
	    else if (SrcVec != Op.getOperand(0))
	      return SDValue();
	    SDValue ExtractedIndex = Op->getOperand(1);
	    // Peek through extends.
	    if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
	        ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
	      ExtractedIndex = ExtractedIndex.getOperand(0);
	    if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    // If this is the first extract from the index vector candidate, set the
	    // indices vector, otherwise verify the extract is from the previously
	    // defined indices vector.
	    if (!IndicesVec)
	      IndicesVec = ExtractedIndex.getOperand(0);
	    else if (IndicesVec != ExtractedIndex.getOperand(0))
	      return SDValue();

	    // The index must be read from lane Idx of the indices vector, i.e. the
	    // same position as the build_vector operand being checked.
	    auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
	    if (!PermIdx || PermIdx->getAPIntValue() != Idx)
	      return SDValue();
	  }

	  SDLoc DL(V);
	  MVT VT = V.getSimpleValueType();
	  return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
	}

	SDValue
	X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
	SDLoc dl(Op);

	MVT VT = Op.getSimpleValueType();
	MVT EltVT = VT.getVectorElementType();
	unsigned NumElems = Op.getNumOperands();

	// Generate vectors for predicate vectors.
	if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
	return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);

	if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
	return VectorConstant;

	BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
	if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
	return AddSub;
	if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
	return HorizontalOp;
	if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
	return Broadcast;
	if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
	return BitOp;

	unsigned EVTBits = EltVT.getSizeInBits();

	unsigned NumZero = 0;
	unsigned NumNonZero = 0;
	uint64_t NonZeros = 0;
	bool IsAllConstants = true;
	SmallSet<SDValue, 8> Values;
	unsigned NumConstants = NumElems;
	for (unsigned i = 0; i < NumElems; ++i) {
	SDValue Elt = Op.getOperand(i);
	if (Elt.isUndef())
	continue;
	Values.insert(Elt);
	if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
	IsAllConstants = false;
	NumConstants--;
	}
	if (X86::isZeroNode(Elt))
	NumZero++;
	else {
	assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
	NonZeros \|= ((uint64_t)1 << i);
	NumNonZero++;
	}
	}

	// All undef vector. Return an UNDEF. All zero vectors were handled above.
	if (NumNonZero == 0)
	return DAG.getUNDEF(VT);

	// If we are inserting one variable into a vector of non-zero constants, try
	// to avoid loading each constant element as a scalar. Load the constants as a
	// vector and then insert the variable scalar element. If insertion is not
	// supported, fall back to a shuffle to get the scalar blended with the
	// constants. Insertion into a zero vector is handled as a special-case
	// somewhere below here.
	if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
	(isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) \|\|
	isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
	// Create an all-constant vector. The variable element in the old
	// build vector is replaced by undef in the constant vector. Save the
	// variable scalar element and its index for use in the insertelement.
	LLVMContext &Context = *DAG.getContext();
	Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
	SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
	SDValue VarElt;
	SDValue InsIndex;
	for (unsigned i = 0; i != NumElems; ++i) {
	SDValue Elt = Op.getOperand(i);
	if (auto *C = dyn_cast<ConstantSDNode>(Elt))
	ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
	else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
	ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
	else if (!Elt.isUndef()) {
	assert(!VarElt.getNode() && !InsIndex.getNode() &&
	"Expected one variable element in this vector");
	VarElt = Elt;
	InsIndex = DAG.getVectorIdxConstant(i, dl);
	}
	}
	Constant *CV = ConstantVector::get(ConstVecOps);
	SDValue DAGConstVec = DAG.getConstantPool(CV, VT);

	// The constants we just created may not be legal (eg, floating point). We
	// must lower the vector right here because we can not guarantee that we'll
	// legalize it before loading it. This is also why we could not just create
	// a new build vector here. If the build vector contains illegal constants,
	// it could get split back up into a series of insert elements.
	// TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
	SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
	MachineFunction &MF = DAG.getMachineFunction();
	MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
	SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
	unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
	unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
	if (InsertC < NumEltsInLow128Bits)
	return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);

	// There's no good way to insert into the high elements of a >128-bit
	// vector, so use shuffles to avoid an extract/insert sequence.
	assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
	assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
	SmallVector<int, 8> ShuffleMask;
	unsigned NumElts = VT.getVectorNumElements();
	for (unsigned i = 0; i != NumElts; ++i)
	ShuffleMask.push_back(i == InsertC ? NumElts : i);
	SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
	return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
	}

	// Special case for single non-zero, non-undef, element.
	if (NumNonZero == 1) {
	unsigned Idx = countTrailingZeros(NonZeros);
	SDValue Item = Op.getOperand(Idx);

	// If we have a constant or non-constant insertion into the low element of
	// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
	// the rest of the elements. This will be matched as movd/movq/movss/movsd
	// depending on what the source datatype is.
	if (Idx == 0) {
	if (NumZero == 0)
	return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

	if (EltVT == MVT::i32 \|\| EltVT == MVT::f32 \|\| EltVT == MVT::f64 \|\|
	(EltVT == MVT::i64 && Subtarget.is64Bit())) {
	assert((VT.is128BitVector() \|\| VT.is256BitVector() \|\|
	VT.is512BitVector()) &&
	"Expected an SSE value type!");
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
	// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
	return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	}

	// We can't directly insert an i8 or i16 into a vector, so zero extend
	// it to i32 first.
	if (EltVT == MVT::i16 \|\| EltVT == MVT::i8) {
	Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
	MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
	Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	return DAG.getBitcast(VT, Item);
	}
	}

	// Is it a vector logical left shift?
	if (NumElems == 2 && Idx == 1 &&
	X86::isZeroNode(Op.getOperand(0)) &&
	!X86::isZeroNode(Op.getOperand(1))) {
	unsigned NumBits = VT.getSizeInBits();
	return getVShift(true, VT,
	DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
	VT, Op.getOperand(1)),
	NumBits/2, DAG, *this, dl);
	}

	if (IsAllConstants) // Otherwise, it's better to do a constpool load.
	return SDValue();

	// Otherwise, if this is a vector with i32 or f32 elements, and the element
	// is a non-constant being inserted into an element other than the low one,
	// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
	// movd/movss) to move this into the low element, then shuffle it into
	// place.
	if (EVTBits == 32) {
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
	return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
	}
	}

	// Splat is obviously ok. Let legalizer expand it to a shuffle.
	if (Values.size() == 1) {
	if (EVTBits == 32) {
	// Instead of a shuffle like this:
	// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
	// Check if it's possible to issue this instead.
	// shuffle (vload ptr)), undef, <1, 1, 1, 1>
	unsigned Idx = countTrailingZeros(NonZeros);
	SDValue Item = Op.getOperand(Idx);
	if (Op.getNode()->isOnlyUserOf(Item.getNode()))
	return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
	}
	return SDValue();
	}

	// A vector full of immediates; various special cases are already
	// handled, so this is best done with a single constant-pool load.
	if (IsAllConstants)
	return SDValue();

	if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
	return V;

	// See if we can use a vector load to get all of the elements.
	{
	SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
	if (SDValue LD =
	EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
	return LD;
	}

	// If this is a splat of pairs of 32-bit elements, we can use a narrower
	// build_vector and broadcast it.
	// TODO: We could probably generalize this more.
	if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
	SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
	DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
	auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
	// Make sure all the even/odd operands match.
	for (unsigned i = 2; i != NumElems; ++i)
	if (Ops[i % 2] != Op.getOperand(i))
	return false;
	return true;
	};
	if (CanSplat(Op, NumElems, Ops)) {
	MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
	MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
	// Create a new build vector and cast to v2i64/v2f64.
	SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
	DAG.getBuildVector(NarrowVT, dl, Ops));
	// Broadcast from v2i64/v2f64 and cast to final VT.
	MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
	return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
	NewBV));
	}
	}

	// For AVX-length vectors, build the individual 128-bit pieces and use
	// shuffles to put them in place.
	if (VT.getSizeInBits() > 128) {
	MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);

	// Build both the lower and upper subvector.
	SDValue Lower =
	DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
	SDValue Upper = DAG.getBuildVector(
	HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));

	// Recreate the wider vector with the lower and upper part.
	return concatSubVectors(Lower, Upper, DAG, dl);
	}

	// Let legalizer expand 2-wide build_vectors.
	if (EVTBits == 64) {
	if (NumNonZero == 1) {
	// One half is zero or undef.
	unsigned Idx = countTrailingZeros(NonZeros);
	SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
	Op.getOperand(Idx));
	return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
	}
	return SDValue();
	}

	// If element VT is < 32 bits, convert it to inserts into a zero vector.
	if (EVTBits == 8 && NumElems == 16)
	if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
	DAG, Subtarget))
	return V;

	if (EVTBits == 16 && NumElems == 8)
	if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
	DAG, Subtarget))
	return V;

	// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
	if (EVTBits == 32 && NumElems == 4)
	if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
	return V;

	// If element VT is == 32 bits, turn it into a number of shuffles.
	if (NumElems == 4 && NumZero > 0) {
	SmallVector<SDValue, 8> Ops(NumElems);
	for (unsigned i = 0; i < 4; ++i) {
	bool isZero = !(NonZeros & (1ULL << i));
	if (isZero)
	Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
	else
	Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
	}

	for (unsigned i = 0; i < 2; ++i) {
	switch ((NonZeros >> (i*2)) & 0x3) {
	default: llvm_unreachable("Unexpected NonZero count");
	case 0:
	Ops[i] = Ops[i*2]; // Must be a zero vector.
	break;
	case 1:
	Ops[i] = getMOVL(DAG, dl, VT, Ops[i2+1], Ops[i2]);
	break;
	case 2:
	Ops[i] = getMOVL(DAG, dl, VT, Ops[i2], Ops[i2+1]);
	break;
	case 3:
	Ops[i] = getUnpackl(DAG, dl, VT, Ops[i2], Ops[i2+1]);
	break;
	}
	}

	bool Reverse1 = (NonZeros & 0x3) == 2;
	bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
	int MaskVec[] = {
	Reverse1 ? 1 : 0,
	Reverse1 ? 0 : 1,
	static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
	static_cast<int>(Reverse2 ? NumElems : NumElems+1)
	};
	return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
	}

	assert(Values.size() > 1 && "Expected non-undef and non-splat vector");

	// Check for a build vector from mostly shuffle plus few inserting.
	if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
	return Sh;

	// For SSE 4.1, use insertps to put the high elements into the low element.
	if (Subtarget.hasSSE41()) {
	SDValue Result;
	if (!Op.getOperand(0).isUndef())
	Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
	else
	Result = DAG.getUNDEF(VT);

	for (unsigned i = 1; i < NumElems; ++i) {
	if (Op.getOperand(i).isUndef()) continue;
	Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
	Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
	}
	return Result;
	}

	// Otherwise, expand into a number of unpckl*, start by extending each of
	// our (non-undef) elements to the full vector width with the element in the
	// bottom slot of the vector (which generates no code for SSE).
	SmallVector<SDValue, 8> Ops(NumElems);
	for (unsigned i = 0; i < NumElems; ++i) {
	if (!Op.getOperand(i).isUndef())
	Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
	else
	Ops[i] = DAG.getUNDEF(VT);
	}

	// Next, we iteratively mix elements, e.g. for v4f32:
	// Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
	// : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
	// Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
	for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
	// Generate scaled UNPCKL shuffle mask.
	SmallVector<int, 16> Mask;
	for(unsigned i = 0; i != Scale; ++i)
	Mask.push_back(i);
	for (unsigned i = 0; i != Scale; ++i)
	Mask.push_back(NumElems+i);
	Mask.append(NumElems - Mask.size(), SM_SentinelUndef);

	for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
	Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2i], Ops[(2i)+1], Mask);
	}
	return Ops[0];
	}

	/// Lower a 256-bit or 512-bit CONCAT_VECTORS by inserting its subvectors.
	///
	/// 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
	/// from two other 128-bit ones. Undef operands are ignored, the presence of
	/// an all-zero build-vector operand selects a zero base vector (otherwise the
	/// base is undef), and every other operand is placed with INSERT_SUBVECTOR.
	/// When more than two operands are non-zero the node is first split into two
	/// half-width CONCAT_VECTORS nodes.
	// TODO: Detect subvector broadcast here instead of DAG combine?
	static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);
	  MVT ResVT = Op.getSimpleValueType();

	  assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
	         "Value type must be 256-/512-bit wide");

	  // Classify the operands: count the all-zero ones and record every other
	  // defined operand as a set bit in NonZeros.
	  unsigned NumOperands = Op.getNumOperands();
	  unsigned NumZero = 0;
	  unsigned NumNonZero = 0;
	  unsigned NonZeros = 0;
	  for (unsigned i = 0; i != NumOperands; ++i) {
	    SDValue SubVec = Op.getOperand(i);
	    if (SubVec.isUndef())
	      continue;
	    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
	      ++NumZero;
	    else {
	      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
	      NonZeros |= 1 << i;
	      ++NumNonZero;
	    }
	  }

	  // If we have more than 2 non-zeros, build each half separately.
	  if (NumNonZero > 2) {
	    MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
	    ArrayRef<SDUse> Ops = Op->ops();
	    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
	                             Ops.slice(0, NumOperands/2));
	    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
	                             Ops.slice(NumOperands/2));
	    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
	  }

	  // Otherwise, build it up through insert_subvectors. Start from zero when
	  // any operand was all-zero so that those positions end up zeroed.
	  SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
	                        : DAG.getUNDEF(ResVT);

	  MVT SubVT = Op.getOperand(0).getSimpleValueType();
	  unsigned NumSubElems = SubVT.getVectorNumElements();
	  for (unsigned i = 0; i != NumOperands; ++i) {
	    if ((NonZeros & (1 << i)) == 0)
	      continue;

	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
	                      Op.getOperand(i),
	                      DAG.getIntPtrConstant(i * NumSubElems, dl));
	  }

	  return Vec;
	}

	/// Lower a CONCAT_VECTORS node whose result is a vector of i1 elements.
	///
	/// Operands are classified as undef, all-zero, or non-zero. The cases handled
	/// in order are:
	///  - a single non-zero operand with zeros below it and undef above it, which
	///    is emitted as one KSHIFTL;
	///  - zero or one non-zero operand, which becomes a single INSERT_SUBVECTOR
	///    into a zero or undef vector;
	///  - more than two operands, which are split into two half-width
	///    CONCAT_VECTORS nodes;
	///  - exactly two non-zero operands, which are either returned unchanged (16
	///    or more result elements) or built from two INSERT_SUBVECTOR nodes.
	// TODO: Merge this with LowerAVXCONCAT_VECTORS?
	static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG & DAG) {
	  SDLoc dl(Op);
	  MVT ResVT = Op.getSimpleValueType();
	  unsigned NumOperands = Op.getNumOperands();

	  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
	         "Unexpected number of operands in CONCAT_VECTORS");

	  // Bit i of Zeros / NonZeros is set when operand i is an all-zero build
	  // vector / any other defined value. Undef operands set neither bit.
	  uint64_t Zeros = 0;
	  uint64_t NonZeros = 0;
	  for (unsigned i = 0; i != NumOperands; ++i) {
	    SDValue SubVec = Op.getOperand(i);
	    if (SubVec.isUndef())
	      continue;
	    assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
	    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
	      Zeros |= (uint64_t)1 << i;
	    else
	      NonZeros |= (uint64_t)1 << i;
	  }

	  unsigned NumElems = ResVT.getVectorNumElements();

	  // If we are inserting non-zero vector and there are zeros in LSBs and undef
	  // in the MSBs we need to emit a KSHIFTL. The generic lowering to
	  // insert_subvector will give us two kshifts.
	  if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
	      Log2_64(NonZeros) != NumOperands - 1) {
	    // Widen the shift type when the result type is too narrow for the
	    // available k-register shifts.
	    MVT ShiftVT = ResVT;
	    if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
	      ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
	    unsigned Idx = Log2_64(NonZeros);
	    SDValue SubVec = Op.getOperand(Idx);
	    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
	    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
	                         DAG.getUNDEF(ShiftVT), SubVec,
	                         DAG.getIntPtrConstant(0, dl));
	    Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
	                     DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  // If there are zero or one non-zeros we can handle this very simply.
	  if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
	    SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
	    if (!NonZeros)
	      return Vec;
	    unsigned Idx = Log2_64(NonZeros);
	    SDValue SubVec = Op.getOperand(Idx);
	    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
	                       DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
	  }

	  // With more than two operands, concatenate each half and recurse on the
	  // resulting two-operand node.
	  if (NumOperands > 2) {
	    MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
	    ArrayRef<SDUse> Ops = Op->ops();
	    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
	                             Ops.slice(0, NumOperands/2));
	    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
	                             Ops.slice(NumOperands/2));
	    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
	  }

	  assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");

	  if (ResVT.getVectorNumElements() >= 16)
	    return Op; // The operation is legal with KUNPCK

	  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
	                            DAG.getUNDEF(ResVT), Op.getOperand(0),
	                            DAG.getIntPtrConstant(0, dl));
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
	                     DAG.getIntPtrConstant(NumElems/2, dl));
	}

	/// Lower a CONCAT_VECTORS node.
	///
	/// Vectors of i1 are dispatched to LowerCONCAT_VECTORSvXi1; everything else
	/// must be a 256-bit vector with two operands or a 512-bit vector with two or
	/// four operands and is handled by LowerAVXCONCAT_VECTORS.
	static SDValue LowerCONCAT_VECTORS(SDValue Op,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  if (VT.getVectorElementType() == MVT::i1)
	    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

	  assert(((VT.is256BitVector() && Op.getNumOperands() == 2) ||
	          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
	                                   Op.getNumOperands() == 4))) &&
	         "Unexpected CONCAT_VECTORS type or operand count");

	  // AVX can use the vinsertf128 instruction to create 256-bit vectors
	  // from two other 128-bit ones.

	  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
	  return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
	}

	//===----------------------------------------------------------------------===//
	// Vector shuffle lowering
	//
	// This is an experimental code path for lowering vector shuffles on x86. It is
	// designed to handle arbitrary vector shuffles and blends, gracefully
	// degrading performance as necessary. It works hard to recognize idiomatic
	// shuffles and lower them to optimal instruction patterns without leaving
	// a framework that allows reasonably efficient handling of all vector shuffle
	// patterns.
	//===----------------------------------------------------------------------===//

	/// Report whether a single-input shuffle mask leaves every element in place.
	///
	/// The mask is assumed to be of the kind used by the X86 shuffle instructions
	/// (not a fully general ShuffleVectorSDNode mask). An entry counts as a no-op
	/// when it is undef (negative) or equal to its own position.
	static bool isNoopShuffleMask(ArrayRef<int> Mask) {
	  const int NumElts = Mask.size();
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    assert(M >= -1 && "Out of bound mask element!");
	    if (M < 0)
	      continue; // Undef never forces a shuffle.
	    if (M != Idx)
	      return false;
	  }
	  return true;
	}

	/// Report whether any mask element moves data between lanes of
	/// LaneSizeInBits bits.
	///
	/// X86 splits its shuffles into in-lane and cross-lane operations, so this
	/// predicate is queried routinely. Negative (undef) entries never cross a
	/// lane; indices are reduced modulo the mask size before the lane of the
	/// source element is computed.
	static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
	                                      unsigned ScalarSizeInBits,
	                                      ArrayRef<int> Mask) {
	  assert(LaneSizeInBits && ScalarSizeInBits &&
	         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
	         "Illegal shuffle lane size");
	  const int EltsPerLane = LaneSizeInBits / ScalarSizeInBits;
	  const int NumElts = Mask.size();
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M < 0)
	      continue;
	    const int SrcLane = (M % NumElts) / EltsPerLane;
	    const int DstLane = Idx / EltsPerLane;
	    if (SrcLane != DstLane)
	      return true;
	  }
	  return false;
	}

	/// Convenience wrapper: report whether any element of \p Mask crosses a
	/// 128-bit lane, using the scalar size of \p VT as the element width.
	static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
	  const unsigned EltSizeInBits = VT.getScalarSizeInBits();
	  return isLaneCrossingShuffleMask(/*LaneSizeInBits=*/128, EltSizeInBits,
	                                   Mask);
	}

	/// Test whether a shuffle mask is equivalent within each sub-lane.
	///
	/// This checks a shuffle mask to see if it is performing the same
	/// lane-relative shuffle in each sub-lane of \p LaneSizeInBits bits. This
	/// trivially implies that it is also not lane-crossing. It may however
	/// involve a blend from the same lane of a second vector.
	///
	/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
	/// non-trivial to compute in the face of undef lanes. The representation is
	/// suitable for use with existing single-lane shuffles as entries from the
	/// second vector have been remapped to [LaneSize, 2*LaneSize).
	///
	/// \returns true if every defined element stays in its lane and agrees with
	/// the other lanes; \p RepeatedMask is only meaningful in that case.
	static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
	                                  ArrayRef<int> Mask,
	                                  SmallVectorImpl<int> &RepeatedMask) {
	  // Signed, like in isRepeatedTargetShuffleMask, so that the index arithmetic
	  // below does not silently mix signed and unsigned operands.
	  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
	  RepeatedMask.assign(LaneSize, -1);
	  int Size = Mask.size();
	  for (int i = 0; i < Size; ++i) {
	    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
	    if (Mask[i] < 0)
	      continue;
	    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
	      // This entry crosses lanes, so there is no way to model this shuffle.
	      return false;

	    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
	    // Adjust second vector indices to start at LaneSize instead of Size.
	    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
	                                : Mask[i] % LaneSize + LaneSize;
	    if (RepeatedMask[i % LaneSize] < 0)
	      // This is the first non-undef entry in this slot of a lane.
	      RepeatedMask[i % LaneSize] = LocalM;
	    else if (RepeatedMask[i % LaneSize] != LocalM)
	      // Found a mismatch with the repeated mask.
	      return false;
	  }
	  return true;
	}

	/// Check that \p Mask performs the same lane-relative shuffle in every
	/// 128-bit lane, storing the per-lane mask in \p RepeatedMask.
	static bool
	is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
	                                SmallVectorImpl<int> &RepeatedMask) {
	  const unsigned LaneSizeInBits = 128;
	  return isRepeatedShuffleMask(LaneSizeInBits, VT, Mask, RepeatedMask);
	}

	/// Overload for callers that only need the yes/no answer: the per-lane mask
	/// is computed into a local scratch vector and discarded.
	static bool
	is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
	  SmallVector<int, 32> Scratch;
	  const unsigned LaneSizeInBits = 128;
	  return isRepeatedShuffleMask(LaneSizeInBits, VT, Mask, Scratch);
	}

	/// Check that \p Mask performs the same lane-relative shuffle in every
	/// 256-bit lane, storing the per-lane mask in \p RepeatedMask.
	static bool
	is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
	                                SmallVectorImpl<int> &RepeatedMask) {
	  const unsigned LaneSizeInBits = 256;
	  return isRepeatedShuffleMask(LaneSizeInBits, VT, Mask, RepeatedMask);
	}

	/// Test whether a target shuffle mask is equivalent within each sub-lane.
	/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
	///
	/// A slot of \p RepeatedMask becomes SM_SentinelZero only if every defined
	/// entry mapping to that slot is SM_SentinelZero; mixing a zero entry with a
	/// real index in the same slot makes the match fail. Entries referring to the
	/// second vector are remapped to [LaneSize, 2*LaneSize).
	///
	/// \returns true if the mask repeats across all lanes of \p LaneSizeInBits
	/// bits; \p RepeatedMask is only meaningful in that case.
	static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
	                                        ArrayRef<int> Mask,
	                                        SmallVectorImpl<int> &RepeatedMask) {
	  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
	  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
	  int Size = Mask.size();
	  for (int i = 0; i < Size; ++i) {
	    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
	    if (Mask[i] == SM_SentinelUndef)
	      continue;
	    if (Mask[i] == SM_SentinelZero) {
	      // A zero entry is only compatible with an undef or zero slot.
	      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
	        return false;
	      RepeatedMask[i % LaneSize] = SM_SentinelZero;
	      continue;
	    }
	    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
	      // This entry crosses lanes, so there is no way to model this shuffle.
	      return false;

	    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
	    // Adjust second vector indices to start at LaneSize instead of Size.
	    int LocalM =
	        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
	    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
	      // This is the first non-undef entry in this slot of a lane.
	      RepeatedMask[i % LaneSize] = LocalM;
	    else if (RepeatedMask[i % LaneSize] != LocalM)
	      // Found a mismatch with the repeated mask.
	      return false;
	  }
	  return true;
	}

	/// Checks whether a shuffle mask is equivalent to an explicit list of
	/// arguments.
	///
	/// This is a fast way to test a shuffle mask against a fixed pattern:
	///
	///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
	///
	/// It returns true if the mask is exactly as wide as the expected mask, and
	/// each element of the mask is either -1 (signifying undef), the value given
	/// in the expected mask, or an index that selects the same build-vector
	/// operand as the expected index does.
	static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
	                                ArrayRef<int> ExpectedMask) {
	  if (Mask.size() != ExpectedMask.size())
	    return false;

	  int Size = Mask.size();

	  // If the values are build vectors, we can look through them to find
	  // equivalent inputs that make the shuffles equivalent.
	  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
	  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
	  // Only look through a build vector that has exactly one operand per mask
	  // element (same guard as isTargetShuffleEquivalent). Otherwise the
	  // `% Size` operand lookups below would compare the wrong operands.
	  BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1);
	  BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2);

	  for (int i = 0; i < Size; ++i) {
	    assert(Mask[i] >= -1 && "Out of bound mask element!");
	    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
	      // Indices below Size select from V1, the rest from V2.
	      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
	      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
	      if (!MaskBV || !ExpectedBV ||
	          MaskBV->getOperand(Mask[i] % Size) !=
	              ExpectedBV->getOperand(ExpectedMask[i] % Size))
	        return false;
	    }
	  }

	  return true;
	}

	/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
	///
	/// The masks must be exactly the same width.
	///
	/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
	/// value in ExpectedMask is always accepted. Otherwise the indices must match,
	/// or both must be non-negative and select the same operand of a build vector
	/// passed as \p V1 / \p V2.
	///
	/// SM_SentinelZero is accepted as a valid negative index but must match in
	/// both.
	static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
	                                      ArrayRef<int> ExpectedMask,
	                                      SDValue V1 = SDValue(),
	                                      SDValue V2 = SDValue()) {
	  int Size = Mask.size();
	  if (Size != (int)ExpectedMask.size())
	    return false;
	  assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
	         "Illegal target shuffle mask");

	  // Check for out-of-range target shuffle mask indices.
	  if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
	    return false;

	  // If the values are build vectors, we can look through them to find
	  // equivalent inputs that make the shuffles equivalent. Build vectors whose
	  // operand count differs from the mask width are ignored.
	  auto *BV1 = dyn_cast_or_null<BuildVectorSDNode>(V1);
	  auto *BV2 = dyn_cast_or_null<BuildVectorSDNode>(V2);
	  BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1);
	  BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2);

	  for (int i = 0; i < Size; ++i) {
	    if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i])
	      continue;
	    if (0 <= Mask[i] && 0 <= ExpectedMask[i]) {
	      // Indices below Size select from V1, the rest from V2.
	      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
	      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
	      if (MaskBV && ExpectedBV &&
	          MaskBV->getOperand(Mask[i] % Size) ==
	              ExpectedBV->getOperand(ExpectedMask[i] % Size))
	        continue;
	    }
	    // TODO - handle SM_Sentinel equivalences.
	    return false;
	  }
	  return true;
	}

	/// Attempt to create a shuffle mask from a VSELECT condition mask.
	///
	/// \p Cond must be a build vector of constants, otherwise false is returned
	/// and \p Mask is left untouched. On success \p Mask holds one entry per
	/// condition element: index i where the condition element is a non-null
	/// constant, and index i + Size where it is null or undef.
	static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
	                                         SDValue Cond) {
	  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
	    return false;

	  unsigned Size = Cond.getValueType().getVectorNumElements();
	  Mask.resize(Size, SM_SentinelUndef);

	  for (int i = 0; i != (int)Size; ++i) {
	    SDValue CondElt = Cond.getOperand(i);
	    Mask[i] = i;
	    // Arbitrarily choose from the 2nd operand if the select condition element
	    // is undef.
	    // TODO: Can we do better by matching patterns such as even/odd?
	    if (CondElt.isUndef() || isNullConstant(CondElt))
	      Mask[i] += Size;
	  }

	  return true;
	}

	/// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
	/// instructions.
	///
	/// Only v8i32 and v8f32 are considered. The mask is compared against the
	/// binary unpack-low and unpack-high patterns generated for v8i16.
	static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
	  if (VT != MVT::v8i32 && VT != MVT::v8f32)
	    return false;

	  SmallVector<int, 8> Unpcklwd;
	  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
	                          /* Unary = */ false);
	  SmallVector<int, 8> Unpckhwd;
	  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
	                          /* Unary = */ false);
	  bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
	                         isTargetShuffleEquivalent(Mask, Unpckhwd));
	  return IsUnpackwdMask;
	}

	/// Check whether \p Mask (or its commuted form) matches any 128-bit unpack
	/// pattern: low or high, unary or binary.
	///
	/// The element type is derived from the mask width so that the vector is
	/// always 128 bits wide.
	static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
	  // Create 128-bit vector type based on mask size.
	  MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
	  MVT VT = MVT::getVectorVT(EltVT, Mask.size());

	  // We can't assume a canonical shuffle mask, so try the commuted version too.
	  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
	  ShuffleVectorSDNode::commuteMask(CommutedMask);

	  // Match any of unary/binary or low/high. Bit 1 of i selects Lo, bit 0
	  // selects Unary.
	  for (unsigned i = 0; i != 4; ++i) {
	    SmallVector<int, 16> UnpackMask;
	    createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
	    if (isTargetShuffleEquivalent(Mask, UnpackMask) ||
	        isTargetShuffleEquivalent(CommutedMask, UnpackMask))
	      return true;
	  }
	  return false;
	}

	/// Report whether the lower and upper halves of a shuffle mask are
	/// element-for-element identical.
	///
	/// Any splat mask, for instance, satisfies this. The comparison is exact: an
	/// entry that is undef in only one of the two halves makes the halves differ.
	static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
	  assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
	  const unsigned Half = Mask.size() / 2;
	  // Walk both halves in lock-step.
	  for (unsigned Lo = 0, Hi = Half; Lo != Half; ++Lo, ++Hi) {
	    const bool Same = Mask[Lo] == Mask[Hi];
	    if (!Same)
	      return false;
	  }
	  return true;
	}

	/// Get a 4-lane 8-bit shuffle immediate for a mask.
	///
	/// This helper function produces an 8-bit shuffle immediate corresponding to
	/// the ubiquitous shuffle encoding scheme used in x86 instructions for
	/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
	/// example. Lane i occupies bits [2*i, 2*i+1] of the result.
	///
	/// NB: We rely heavily on "undef" masks preserving the input lane.
	static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
	  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
	  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
	  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
	  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
	  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");

	  // An undef lane is encoded as its own index, i.e. "keep the input lane".
	  unsigned Imm = 0;
	  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
	  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
	  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
	  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
	  return Imm;
	}

	/// Wrap the 4-lane shuffle immediate for \p Mask in an i8 target constant.
	static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
	                                          SelectionDAG &DAG) {
	  const unsigned Imm = getV4X86ShuffleImm(Mask);
	  return DAG.getTargetConstant(Imm, DL, MVT::i8);
	}

	// The shuffle result is expected to look like
	//   0* a[0] 0* a[1] ... 0* a[n], n >= 0
	// i.e. runs of zero elements interleaved with elements a[] whose source
	// indices are in ascending, consecutive order.
	// Each bit of Zeroable corresponds to the mask element at the same position,
	// as described in the computeZeroableShuffleElements function.
	//
	// Returns true if no mask element is undef and the non-zeroable elements read
	// consecutive source indices. The first non-zeroable element decides the
	// start of the sequence: index 0, or otherwise the element count of
	// VectorType, in which case IsZeroSideLeft is set to true (it is set to
	// false when the sequence starts at 0). IsZeroSideLeft is left untouched when
	// every element is zeroable.
	static bool isNonZeroElementsInOrder(const APInt &Zeroable,
	                                     ArrayRef<int> Mask, const EVT &VectorType,
	                                     bool &IsZeroSideLeft) {
	  // Source index the next non-zeroable element must have; negative until the
	  // first such element has been seen.
	  int Expected = -1;
	  const int NumMaskElts = Mask.size();
	  for (int Idx = 0; Idx < NumMaskElts; ++Idx) {
	    const int M = Mask[Idx];
	    assert(M >= -1 && "Out of bound mask element!");
	    // Undef elements are rejected outright.
	    if (M < 0)
	      return false;
	    if (Zeroable[Idx])
	      continue;
	    // Pin the start of the sequence on the first non-zeroable element.
	    if (Expected < 0) {
	      Expected = M != 0 ? VectorType.getVectorNumElements() : 0;
	      IsZeroSideLeft = Expected != 0;
	    }
	    // Bail out as soon as the sequence is broken.
	    if (M != Expected)
	      return false;
	    ++Expected;
	  }
	  return true;
	}

	/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
	///
	/// A byte-level control vector is built from \p Mask: undef elements become
	/// undef bytes, zeroable elements become 0x80 (zero the byte), and every other
	/// element is expanded to its in-lane byte indices. Returns an empty SDValue
	/// if the defined, non-zeroable elements read from both inputs or cross a
	/// 128-bit lane.
	static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
	                                      ArrayRef<int> Mask, SDValue V1,
	                                      SDValue V2, const APInt &Zeroable,
	                                      const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  int Size = Mask.size();
	  int LaneSize = 128 / VT.getScalarSizeInBits();
	  const int NumBytes = VT.getSizeInBits() / 8;
	  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

	  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
	         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
	         (Subtarget.hasBWI() && VT.is512BitVector()));

	  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
	  // Sign bit set in i8 mask means zero element.
	  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

	  // The single source vector, chosen by the first element that needs one.
	  SDValue V;
	  for (int i = 0; i < NumBytes; ++i) {
	    int M = Mask[i / NumEltBytes];
	    if (M < 0) {
	      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
	      continue;
	    }
	    if (Zeroable[i / NumEltBytes]) {
	      PSHUFBMask[i] = ZeroMask;
	      continue;
	    }

	    // We can only use a single input of V1 or V2.
	    SDValue SrcV = (M >= Size ? V2 : V1);
	    if (V && V != SrcV)
	      return SDValue();
	    V = SrcV;
	    M %= Size;

	    // PSHUFB can't cross lanes, ensure this doesn't happen.
	    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
	      return SDValue();

	    // Convert the element index into the byte index within its lane.
	    M = M % LaneSize;
	    M = M * NumEltBytes + (i % NumEltBytes);
	    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
	  }
	  assert(V && "Failed to find a source input");

	  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
	  return DAG.getBitcast(
	      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
	                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
	}

	// Forward declaration so that lowerShuffleToEXPAND below can call
	// getMaskNode; the definition is not in this part of the file (presumably it
	// appears later in the same translation unit, since the function is static).
	static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
	const X86Subtarget &Subtarget, SelectionDAG &DAG,
	const SDLoc &dl);

	/// X86 has dedicated shuffle that can be lowered to VEXPAND.
	///
	/// Requires the non-zeroable mask elements to read consecutive source
	/// elements (see isNonZeroElementsInOrder); otherwise an empty SDValue is
	/// returned. The expand mask has a set bit for every element that is not
	/// zeroable, and the source is V2 when the in-order sequence starts in the
	/// second operand, V1 otherwise.
	static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
	                                    const APInt &Zeroable,
	                                    ArrayRef<int> Mask, SDValue &V1,
	                                    SDValue &V2, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  bool IsLeftZeroSide = true;
	  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
	                                IsLeftZeroSide))
	    return SDValue();

	  // Validate the element count before any node is created.
	  unsigned NumElts = VT.getVectorNumElements();
	  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
	         "Unexpected number of vector elements");

	  // The mask constant is at least 8 bits wide even for 4-element vectors.
	  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
	  MVT IntegerType = MVT::getIntegerVT(std::max((int)NumElts, 8));
	  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
	  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
	                              Subtarget, DAG, DL);
	  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
	  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
	  return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
	}

	/// Try to match a target shuffle mask against the UNPCKL/UNPCKH patterns.
	///
	/// On success \p UnpackOpcode is set to X86ISD::UNPCKL or X86ISD::UNPCKH and
	/// \p V1 / \p V2 are rewritten to the operands the unpack node should use
	/// (undef or zero vectors where the mask allows it, swapped operands for the
	/// commuted binary forms). On failure the outputs are left unmodified.
	///
	/// \param IsUnary  true if the shuffle has a single input (V1).
	/// \returns true if the mask was matched.
	static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
	                                  unsigned &UnpackOpcode, bool IsUnary,
	                                  ArrayRef<int> TargetMask, const SDLoc &DL,
	                                  SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  int NumElts = VT.getVectorNumElements();

	  // Classify the even (operand 1) and odd (operand 2) result positions: are
	  // they all undef, or all undef-or-zero?
	  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
	  for (int i = 0; i != NumElts; i += 2) {
	    int M1 = TargetMask[i + 0];
	    int M2 = TargetMask[i + 1];
	    Undef1 &= (SM_SentinelUndef == M1);
	    Undef2 &= (SM_SentinelUndef == M2);
	    Zero1 &= isUndefOrZero(M1);
	    Zero2 &= isUndefOrZero(M2);
	  }
	  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
	         "Zeroable shuffle detected");

	  // Attempt to match the target mask against the unpack lo/hi mask patterns.
	  SmallVector<int, 64> Unpckl, Unpckh;
	  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
	  if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
	    UnpackOpcode = X86ISD::UNPCKL;
	    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
	    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
	    return true;
	  }

	  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
	  if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
	    UnpackOpcode = X86ISD::UNPCKH;
	    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
	    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
	    return true;
	  }

	  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
	  if (IsUnary && (Zero1 || Zero2)) {
	    // Don't bother if we can blend instead.
	    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
	        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
	      return false;

	    bool MatchLo = true, MatchHi = true;
	    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
	      int M = TargetMask[i];

	      // Ignore if the input is known to be zero or the index is undef.
	      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
	          (M == SM_SentinelUndef))
	        continue;

	      MatchLo &= (M == Unpckl[i]);
	      MatchHi &= (M == Unpckh[i]);
	    }

	    if (MatchLo || MatchHi) {
	      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
	      // V2 must be assigned first: it may still need the original V1.
	      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
	      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
	      return true;
	    }
	  }

	  // If a binary shuffle, commute and try again.
	  if (!IsUnary) {
	    ShuffleVectorSDNode::commuteMask(Unpckl);
	    if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
	      UnpackOpcode = X86ISD::UNPCKL;
	      std::swap(V1, V2);
	      return true;
	    }

	    ShuffleVectorSDNode::commuteMask(Unpckh);
	    if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
	      UnpackOpcode = X86ISD::UNPCKH;
	      std::swap(V1, V2);
	      return true;
	    }
	  }

	  return false;
	}

	/// X86 has dedicated unpack instructions that can handle specific blend
	/// operations: UNPCKH and UNPCKL.
	///
	/// The mask is compared against the binary unpack-low and unpack-high
	/// patterns, first as given and then with the patterns commuted (in which
	/// case the operands of the emitted node are swapped). Returns an empty
	/// SDValue if none of the four patterns match.
	static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
	                                     ArrayRef<int> Mask, SDValue V1, SDValue V2,
	                                     SelectionDAG &DAG) {
	  SmallVector<int, 8> Unpckl;
	  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
	    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

	  SmallVector<int, 8> Unpckh;
	  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
	    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

	  // Commute and try again.
	  ShuffleVectorSDNode::commuteMask(Unpckl);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
	    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

	  ShuffleVectorSDNode::commuteMask(Unpckh);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
	    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

	  return SDValue();
	}

	/// Try to lower the shuffle as a 64-bit-chunk permute (vperm) of V1 followed
	/// by a 256-bit unpack of the permuted value with itself.
	///
	/// The mask must match one of the two patterns produced by
	/// createSplat2ShuffleMask (low or high); otherwise an empty SDValue is
	/// returned.
	static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
	                                        ArrayRef<int> Mask, SDValue V1,
	                                        SDValue V2, SelectionDAG &DAG) {
	  SmallVector<int, 32> LoPattern, HiPattern;
	  createSplat2ShuffleMask(VT, LoPattern, /* Lo */ true);
	  createSplat2ShuffleMask(VT, HiPattern, /* Lo */ false);

	  // The low pattern takes priority; the high one is only tested if it fails.
	  const bool MatchesLo = isShuffleEquivalent(V1, V2, Mask, LoPattern);
	  if (!MatchesLo && !isShuffleEquivalent(V1, V2, Mask, HiPattern))
	    return SDValue();
	  const unsigned UnpackOpcode = MatchesLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;

	  // This is a "natural" unpack operation (rather than the 128-bit sectored
	  // operation implemented by AVX). We need to rearrange 64-bit chunks of the
	  // input in order to use the x86 instruction.
	  SDValue Permuted =
	      DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
	                           DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
	  SDValue Input = DAG.getBitcast(VT, Permuted);
	  return DAG.getNode(UnpackOpcode, DL, VT, Input, Input);
	}

	// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
	// source into the lower elements and zeroing the upper elements.
	//
	// Truncation ratios 2, 4, ... are tried while the source element stays within
	// 64 bits. On the first ratio that matches, SrcVT receives the wide source
	// vector type and DstVT the truncated type, and true is returned. When no
	// ratio matches, SrcVT and DstVT are left untouched and false is returned.
	// TODO: Merge with matchShuffleAsVPMOV.
	static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
	                                 ArrayRef<int> Mask, const APInt &Zeroable,
	                                 const X86Subtarget &Subtarget) {
	  // Anything narrower than 512 bits needs VLX.
	  if (!(VT.is512BitVector() || Subtarget.hasVLX()))
	    return false;

	  const unsigned NumMaskElts = Mask.size();
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  const unsigned ScaleLimit = 64 / EltBits;

	  for (unsigned Scale = 2; Scale <= ScaleLimit; Scale *= 2) {
	    const unsigned WideEltBits = EltBits * Scale;
	    // Source elements narrower than 32 bits require BWI.
	    if (WideEltBits < 32 && !Subtarget.hasBWI())
	      continue;

	    // The low part of the mask must pick every Scale'th element in order.
	    const unsigned NumSrcElts = NumMaskElts / Scale;
	    if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
	      continue;

	    // Every remaining (upper) element must be zeroable.
	    const unsigned NumUpperElts = NumMaskElts - NumSrcElts;
	    if (!Zeroable.extractBits(NumUpperElts, NumSrcElts).isAllOnesValue())
	      continue;

	    // ISD::TRUNCATE when the truncated elements fill at least 128 bits,
	    // X86ISD::VTRUNC (with a 128-bit result) otherwise.
	    const bool FillsXMM = (NumSrcElts * EltBits) >= 128;
	    const unsigned NumDstElts = FillsXMM ? NumSrcElts : 128 / EltBits;
	    SrcVT = MVT::getVectorVT(MVT::getIntegerVT(WideEltBits), NumSrcElts);
	    DstVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumDstElts);
	    return true;
	  }

	  return false;
	}

	/// Check whether \p Mask has the shape of a truncation by a factor of
	/// \p Delta.
	///
	/// The first Size / Delta entries must step through the truncated vector with
	/// stride \p Delta, e.g. <0, 2, 4, 6,... or, when \p SwappedOps is set and the
	/// truncated vector is the second operand, <8, 10, 12, 14,.... None of the
	/// remaining entries may refer to the truncated vector.
	static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
	                                int Delta) {
	  const int NumElts = (int)Mask.size();
	  const int NumTruncElts = NumElts / Delta;
	  // First mask index that belongs to the truncated vector.
	  const int SrcBase = SwappedOps ? NumElts : 0;

	  const bool LowPartMatches =
	      isSequentialOrUndefInRange(Mask, 0, NumTruncElts, SrcBase, Delta);
	  if (!LowPartMatches)
	    return false;

	  // The rest of the mask should not refer to the truncated vector's elements.
	  ArrayRef<int> UpperPart = Mask.slice(NumTruncElts, NumElts - NumTruncElts);
	  return !isAnyInRange(UpperPart, SrcBase, SrcBase + NumElts);
	}

	// Try to fold a truncate followed by a vector_shuffle with zeros into a
	// single X86ISD::VTRUNC node (e.g. a vpmovdb or vpmovdw instruction).
	//
	// For example:
	//
	// t0: ch = EntryToken
	// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
	// t25: v4i32 = truncate t2
	// t41: v8i16 = bitcast t25
	// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
	// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
	// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
	// t18: v2i64 = bitcast t51
	//
	// Without avx512vl, this is lowered to:
	//
	// vpmovqd %zmm0, %ymm0
	// vpshufb {{.*#+}} xmm0 =
	// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
	//
	// With avx512vl a single vpmovdw instruction is enough.
	//
	// Returns an empty SDValue when the pattern does not apply.
	static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
	                                     MVT VT, SDValue V1, SDValue V2,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  const bool IsHandledVT = VT == MVT::v16i8 || VT == MVT::v8i16;
	  if (!IsHandledVT)
	    return SDValue();

	  if (Mask.size() != VT.getVectorNumElements())
	    return SDValue();

	  // One operand has to be an all-zeros build vector; canonicalize it to V2
	  // and remember whether the operands were exchanged.
	  const bool SwappedOps = !ISD::isBuildVectorAllZeros(V2.getNode());
	  if (SwappedOps) {
	    if (!ISD::isBuildVectorAllZeros(V1.getNode()))
	      return SDValue();
	    std::swap(V1, V2);
	  }

	  // Look for:
	  //
	  // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
	  // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
	  //
	  // and similar ones.
	  if (V1.getOpcode() != ISD::BITCAST)
	    return SDValue();
	  SDValue Trunc = V1.getOperand(0);
	  if (Trunc.getOpcode() != ISD::TRUNCATE)
	    return SDValue();

	  SDValue Src = Trunc.getOperand(0);
	  MVT SrcVT = Src.getSimpleValueType();

	  // Truncating 128-bit and 256-bit source vectors is only available with
	  // avx512vl.
	  if (!(SrcVT.is512BitVector() || Subtarget.hasVLX()))
	    return SDValue();

	  // Down Convert Word to Byte is only available with avx512bw. The case with
	  // 256-bit output doesn't contain a shuffle and is therefore not handled
	  // here.
	  const bool IsWordToByte =
	      SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8;
	  if (IsWordToByte && !Subtarget.hasBWI())
	    return SDValue();

	  // The first half/quarter of the mask should refer to every second/fourth
	  // element of the vector truncated and bitcasted.
	  const bool MatchesStride = matchShuffleAsVPMOV(Mask, SwappedOps, 2) ||
	                             matchShuffleAsVPMOV(Mask, SwappedOps, 4);
	  if (!MatchesStride)
	    return SDValue();

	  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
	}

	/// Decide whether a shuffle can be lowered as a compaction that repeatedly
	/// drops even elements, and if so how many drop rounds are required.
	///
	/// This handles shuffles which take every Nth element where N is a power of
	/// two. Example shuffle masks:
	///
	/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
	/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
	/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
	/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
	/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
	/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
	///
	/// Any of these lanes can of course be undef.
	///
	/// This routine only supports N <= 3.
	/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
	/// for larger N.
	///
	/// \returns the smallest viable N (1..3), i.e. the number of times even
	/// elements must be dropped, or zero if no such N exists.
	static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
	                                          bool IsSingleInput) {
	  // Mask entries wrap modulo the number of addressable input elements, which
	  // doubles when there are two inputs.
	  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
	  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
	         "We should only be called with masks with a power-of-2 size!");

	  const uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

	  // Strides 2^1, 2^2 and 2^3 are tracked together, because partially undef
	  // masks can stay ambiguous between them for a while.
	  bool ViableForN[3] = {true, true, true};

	  const int NumMaskElts = Mask.size();
	  for (int Idx = 0; Idx != NumMaskElts; ++Idx) {
	    const int M = Mask[Idx];
	    // Undef lanes are optimistically assumed to fit whichever pattern wins.
	    if (M < 0)
	      continue;

	    bool AnyStillViable = false;
	    for (unsigned Stage = 0; Stage != array_lengthof(ViableForN); ++Stage) {
	      if (!ViableForN[Stage])
	        continue;

	      // The shuffle mask must be equal to (Idx * 2^N) % M with N = Stage + 1.
	      const uint64_t Expected = ((uint64_t)Idx << (Stage + 1)) & ModMask;
	      ViableForN[Stage] = ((uint64_t)M == Expected);
	      if (ViableForN[Stage])
	        AnyStillViable = true;
	    }

	    // Stop scanning once every candidate stride has been ruled out.
	    if (!AnyStillViable)
	      break;
	  }

	  for (unsigned Stage = 0; Stage != array_lengthof(ViableForN); ++Stage)
	    if (ViableForN[Stage])
	      return Stage + 1;

	  // No power-of-two stride fits the mask.
	  return 0;
	}

	// X86 has dedicated pack instructions that can handle specific truncation
	// operations: PACKSS and PACKUS.
	// Checks for compaction shuffle masks if MaxStages > 1.
	//
	// On a successful match V1/V2 are replaced by the matched inputs bitcast to
	// the pack source type, SrcVT is set to that type and PackOpcode to
	// X86ISD::PACKUS or X86ISD::PACKSS. Returns false if nothing matched.
	// TODO: Add support for matching multiple PACKSS/PACKUS stages.
	static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
	                                 unsigned &PackOpcode, ArrayRef<int> TargetMask,
	                                 SelectionDAG &DAG,
	                                 const X86Subtarget &Subtarget,
	                                 unsigned MaxStages = 1) {
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned BitSize = VT.getScalarSizeInBits();
	  assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
	         "Illegal maximum compaction");

	  // Check whether N1/N2, viewed as PackVT, can be packed down to BitSize-wide
	  // elements. N1/N2 are taken by value so that writing the V1/V2 outputs does
	  // not disturb the inputs still being inspected.
	  auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
	    unsigned NumSrcBits = PackVT.getScalarSizeInBits();
	    unsigned NumPackedBits = NumSrcBits - BitSize;
	    SDValue VV1 = DAG.getBitcast(PackVT, N1);
	    SDValue VV2 = DAG.getBitcast(PackVT, N2);
	    // PACKUS is tried when SSE4.1 is available or the result elements are
	    // bytes; it needs the bits being dropped to be known zero.
	    if (Subtarget.hasSSE41() || BitSize == 8) {
	      APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
	      if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
	          (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
	        V1 = VV1;
	        V2 = VV2;
	        SrcVT = PackVT;
	        PackOpcode = X86ISD::PACKUS;
	        return true;
	      }
	    }
	    // PACKSS needs the bits being dropped to all be copies of the sign bit.
	    if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) &&
	        (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) {
	      V1 = VV1;
	      V2 = VV2;
	      SrcVT = PackVT;
	      PackOpcode = X86ISD::PACKSS;
	      return true;
	    }
	    return false;
	  };

	  // Attempt to match against wider and wider compaction patterns.
	  for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
	    MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
	    MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);

	    // Try binary shuffle.
	    SmallVector<int, 32> BinaryMask;
	    createPackShuffleMask(VT, BinaryMask, false, NumStages);
	    if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
	      if (MatchPACK(V1, V2, PackVT))
	        return true;

	    // Try unary shuffle.
	    SmallVector<int, 32> UnaryMask;
	    createPackShuffleMask(VT, UnaryMask, true, NumStages);
	    if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
	      if (MatchPACK(V1, V1, PackVT))
	        return true;
	  }

	  return false;
	}

	// Lower a compaction shuffle as one or more PACKSS/PACKUS nodes.
	//
	// matchShuffleWithPACK picks the opcode and the widest source element type;
	// the inputs are then packed repeatedly, halving the element width each
	// stage, until the element width of VT is reached. Returns an empty SDValue
	// if the mask is not a pack pattern or a multi-stage pack is not wanted.
	static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
	                                    SDValue V1, SDValue V2, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  MVT PackVT;
	  unsigned PackOpcode;
	  unsigned SizeBits = VT.getSizeInBits();
	  unsigned EltBits = VT.getScalarSizeInBits();
	  unsigned MaxStages = Log2_32(64 / EltBits);
	  if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
	                            Subtarget, MaxStages))
	    return SDValue();

	  unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
	  unsigned NumStages = Log2_32(CurrentEltBits / EltBits);

	  // Don't lower multi-stage packs on AVX512, truncation is better.
	  if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
	    return SDValue();

	  // Pack to the largest type possible:
	  // vXi64/vXi32 -> PACKSDW and vXi16 -> PACKSWB.
	  unsigned MaxPackBits = 16;
	  if (CurrentEltBits > 16 &&
	      (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
	    MaxPackBits = 32;

	  // Repeatedly pack down to the target size.
	  SDValue Res;
	  for (unsigned i = 0; i != NumStages; ++i) {
	    unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
	    unsigned NumSrcElts = SizeBits / SrcEltBits;
	    MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
	    MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
	    MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
	    MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
	    Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
	                      DAG.getBitcast(SrcVT, V2));
	    // Later stages pack the previous result with itself.
	    V1 = V2 = Res;
	    CurrentEltBits /= 2;
	  }
	  assert(Res && Res.getValueType() == VT &&
	         "Failed to lower compaction shuffle");
	  return Res;
	}

	/// Try to emit a bitmask instruction for a shuffle.
	///
	/// This handles cases where we can model a blend exactly as a bitmask due to
	/// one of the inputs being zeroable: every non-zeroable lane must pass through
	/// in place from the same single input, so the shuffle becomes an AND of that
	/// input with a constant all-ones/zero mask.
	///
	/// \returns the masked value bitcast back to VT, or an empty SDValue if the
	/// shuffle is not such a blend.
	static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
	                                     SDValue V2, ArrayRef<int> Mask,
	                                     const APInt &Zeroable,
	                                     const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  MVT MaskVT = VT;
	  MVT EltVT = VT.getVectorElementType();
	  SDValue Zero, AllOnes;
	  // Use f64 if i64 isn't legal.
	  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
	    EltVT = MVT::f64;
	    MaskVT = MVT::getVectorVT(EltVT, Mask.size());
	  }

	  // Floating point masks are built from FP constants, but the AND itself is
	  // performed on the same-width integer vector type.
	  MVT LogicVT = VT;
	  if (EltVT == MVT::f32 || EltVT == MVT::f64) {
	    Zero = DAG.getConstantFP(0.0, DL, EltVT);
	    APFloat AllOnesValue = APFloat::getAllOnesValue(
	        SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
	    AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
	    LogicVT =
	        MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
	  } else {
	    Zero = DAG.getConstant(0, DL, EltVT);
	    AllOnes = DAG.getAllOnesConstant(DL, EltVT);
	  }

	  // Start from an all-zero mask and open up the lanes that pass through.
	  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
	  SDValue V;
	  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
	    if (Zeroable[i])
	      continue;
	    if (Mask[i] % Size != i)
	      return SDValue(); // Not a blend.
	    if (!V)
	      V = Mask[i] < Size ? V1 : V2;
	    else if (V != (Mask[i] < Size ? V1 : V2))
	      return SDValue(); // Can only let one input through the mask.

	    VMaskOps[i] = AllOnes;
	  }
	  if (!V)
	    return SDValue(); // No non-zeroable elements!

	  SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
	  VMask = DAG.getBitcast(LogicVT, VMask);
	  V = DAG.getBitcast(LogicVT, V);
	  SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
	  return DAG.getBitcast(VT, And);
	}

	/// Lower a blend shuffle with plain bit math: (V1 & M) | (~M & V2).
	///
	/// This serves as a fallback when first class blend instructions are
	/// unavailable. It currently only handles integer vectors, but could be
	/// generalized for floating point vectors if desirable.
	///
	/// \returns the OR node, or an empty SDValue if some lane is not taken in
	/// place from either input.
	static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
	                                      SDValue V2, ArrayRef<int> Mask,
	                                      SelectionDAG &DAG) {
	  assert(VT.isInteger() && "Only supports integer vector types!");
	  const MVT EltVT = VT.getVectorElementType();
	  const SDValue Zero = DAG.getConstant(0, DL, EltVT);
	  const SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);

	  const int NumElts = Mask.size();
	  SmallVector<SDValue, 16> MaskOps;
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    const bool IsInPlace = M < 0 || M == Idx || M == Idx + NumElts;
	    if (!IsInPlace)
	      return SDValue(); // Shuffled input!
	    // Undef lanes and lanes from V1 select V1; lanes from V2 select V2.
	    MaskOps.push_back(M < NumElts ? AllOnes : Zero);
	  }

	  const SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
	  const SDValue KeptV1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
	  const SDValue KeptV2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
	  return DAG.getNode(ISD::OR, DL, VT, KeptV1, KeptV2);
	}

	// Forward declaration: lowerShuffleAsBlend below calls this helper, passing a
	// constant blend mask as Mask, before its definition has been seen.
	static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
	SDValue PreservedSrc,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG);

	// Check whether the shuffle mask is a blend of V1 and V2, i.e. every lane is
	// undef, taken in place from V1 (M == i) or taken in place from V2
	// (M == i + Size).
	//
	// A zeroable lane that is not already in place is accepted if one of the
	// inputs is zero/undef: Mask is rewritten to select that input in place and
	// the matching ForceV1Zero/ForceV2Zero flag is set. BlendMask gets bit i set
	// for every lane taken from V2. Returns false if the mask is not a blend;
	// Mask and the outputs may have been partially updated by then.
	static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
	                                MutableArrayRef<int> Mask,
	                                const APInt &Zeroable, bool &ForceV1Zero,
	                                bool &ForceV2Zero, uint64_t &BlendMask) {
	  bool V1IsZeroOrUndef =
	      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
	  bool V2IsZeroOrUndef =
	      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

	  BlendMask = 0;
	  ForceV1Zero = false;
	  ForceV2Zero = false;
	  assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");

	  // Attempt to generate the binary blend mask. If an input is zero then
	  // we can use any lane.
	  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
	    int M = Mask[i];
	    if (M == SM_SentinelUndef)
	      continue;
	    if (M == i)
	      continue;
	    if (M == i + Size) {
	      BlendMask |= 1ull << i;
	      continue;
	    }
	    if (Zeroable[i]) {
	      if (V1IsZeroOrUndef) {
	        ForceV1Zero = true;
	        Mask[i] = i;
	        continue;
	      }
	      if (V2IsZeroOrUndef) {
	        ForceV2Zero = true;
	        BlendMask |= 1ull << i;
	        Mask[i] = i + Size;
	        continue;
	      }
	    }
	    return false;
	  }
	  return true;
	}

	// Widen a per-element blend mask: each of the low Size bits of BlendMask is
	// replicated into a run of Scale consecutive bits, so bit i of the input
	// controls bits [i * Scale, (i + 1) * Scale) of the result. Bits at or above
	// Size are ignored. Callers must keep Scale below 64 and Size * Scale within
	// 64 bits, as the shifts are not guarded.
	static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
	                                            int Scale) {
	  // Run of Scale set bits, positioned per selected element below.
	  const uint64_t EltBits = (1ull << Scale) - 1;
	  uint64_t ScaledMask = 0;
	  for (int i = 0; i != Size; ++i)
	    if (BlendMask & (1ull << i))
	      ScaledMask |= EltBits << (i * Scale);
	  return ScaledMask;
	}

	/// Try to emit a blend instruction for a shuffle.
	///
	/// This doesn't do any checks for the availability of instructions for blending
	/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
	/// be matched in the backend with the type given. What it does check for is
	/// that the shuffle mask is a blend, or convertible into a blend with zero.
	///
	/// \p Original is copied into a local mask first, because matching may rewrite
	/// zeroable lanes. \returns the blended value, or an empty SDValue if the
	/// mask is not a blend.
	static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
	                                   SDValue V2, ArrayRef<int> Original,
	                                   const APInt &Zeroable,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  uint64_t BlendMask = 0;
	  bool ForceV1Zero = false, ForceV2Zero = false;
	  SmallVector<int, 64> Mask(Original.begin(), Original.end());
	  if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
	                           BlendMask))
	    return SDValue();

	  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
	  if (ForceV1Zero)
	    V1 = getZeroVector(VT, Subtarget, DAG, DL);
	  if (ForceV2Zero)
	    V2 = getZeroVector(VT, Subtarget, DAG, DL);

	  // The cases deliberately fall through so that each wider type also runs the
	  // feature asserts of the narrower ones before reaching the shared lowering.
	  switch (VT.SimpleTy) {
	  case MVT::v4i64:
	  case MVT::v8i32:
	    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
	    LLVM_FALLTHROUGH;
	  case MVT::v4f64:
	  case MVT::v8f32:
	    assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
	    LLVM_FALLTHROUGH;
	  case MVT::v2f64:
	  case MVT::v2i64:
	  case MVT::v4f32:
	  case MVT::v4i32:
	  case MVT::v8i16:
	    assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
	    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
	                       DAG.getTargetConstant(BlendMask, DL, MVT::i8));
	  case MVT::v16i16: {
	    assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
	    SmallVector<int, 8> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
	      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
	      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
	      BlendMask = 0;
	      for (int i = 0; i < 8; ++i)
	        if (RepeatedMask[i] >= 8)
	          BlendMask |= 1ull << i;
	      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
	                         DAG.getTargetConstant(BlendMask, DL, MVT::i8));
	    }
	    // Use PBLENDW for lower/upper lanes and then blend lanes.
	    // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
	    // merge to VSELECT where useful.
	    uint64_t LoMask = BlendMask & 0xFF;
	    uint64_t HiMask = (BlendMask >> 8) & 0xFF;
	    if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
	      SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
	                               DAG.getTargetConstant(LoMask, DL, MVT::i8));
	      SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
	                               DAG.getTargetConstant(HiMask, DL, MVT::i8));
	      return DAG.getVectorShuffle(
	          MVT::v16i16, DL, Lo, Hi,
	          {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
	    }
	    LLVM_FALLTHROUGH;
	  }
	  case MVT::v32i8:
	    assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
	    LLVM_FALLTHROUGH;
	  case MVT::v16i8: {
	    assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");

	    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
	    if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
	                                               Subtarget, DAG))
	      return Masked;

	    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
	      MVT IntegerType =
	          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
	      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
	      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
	    }

	    // If we have VPTERNLOG, we can use that as a bit blend.
	    if (Subtarget.hasVLX())
	      if (SDValue BitBlend =
	              lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
	        return BitBlend;

	    // Scale the blend by the number of bytes per element.
	    int Scale = VT.getScalarSizeInBits() / 8;

	    // This form of blend is always done on bytes. Compute the byte vector
	    // type.
	    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

	    // x86 allows load folding with blendvb from the 2nd source operand. But
	    // we are still using LLVM select here (see comment below), so that's V1.
	    // If V2 can be load-folded and V1 cannot be load-folded, then commute to
	    // allow that load-folding possibility.
	    if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
	      ShuffleVectorSDNode::commuteMask(Mask);
	      std::swap(V1, V2);
	    }

	    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
	    // mix of LLVM's code generator and the x86 backend. We tell the code
	    // generator that boolean values in the elements of an x86 vector register
	    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
	    // mapping a select to operand #1, and 'false' mapping to operand #2. The
	    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
	    // of the element (the remaining are ignored) and 0 in that high bit would
	    // mean operand #1 while 1 in the high bit would mean operand #2. So while
	    // the LLVM model for boolean values in vector elements gets the relevant
	    // bit set, it is set backwards and over constrained relative to x86's
	    // actual model.
	    SmallVector<SDValue, 32> VSELECTMask;
	    for (int i = 0, Size = Mask.size(); i < Size; ++i)
	      for (int j = 0; j < Scale; ++j)
	        VSELECTMask.push_back(
	            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
	                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
	                                          MVT::i8));

	    V1 = DAG.getBitcast(BlendVT, V1);
	    V2 = DAG.getBitcast(BlendVT, V2);
	    return DAG.getBitcast(
	        VT,
	        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
	                      V1, V2));
	  }
	  case MVT::v16f32:
	  case MVT::v8f64:
	  case MVT::v8i64:
	  case MVT::v16i32:
	  case MVT::v32i16:
	  case MVT::v64i8: {
	    // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
	    bool OptForSize = DAG.shouldOptForSize();
	    if (!OptForSize) {
	      if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
	                                                 Subtarget, DAG))
	        return Masked;
	    }

	    // Otherwise load an immediate into a GPR, cast to k-register, and use a
	    // masked move.
	    MVT IntegerType =
	        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
	    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
	    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
	  }
	  default:
	    llvm_unreachable("Not a supported integer vector type!");
	  }
	}

	/// Lower a two-input shuffle as a blend of both inputs followed by a
	/// single-input permutation of the blended vector.
	///
	/// This works whenever no two mask entries need different inputs at the same
	/// in-place position, so one blend can gather every required element.
	/// With \p ImmBlends set, byte-element blends are only accepted if the blend
	/// mask can be widened. \returns an empty SDValue if no such blend exists.
	static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
	                                             SDValue V1, SDValue V2,
	                                             ArrayRef<int> Mask,
	                                             SelectionDAG &DAG,
	                                             bool ImmBlends = false) {
	  // Both masks are built in one pass; the pass also decides whether a blend
	  // is viable at all.
	  SmallVector<int, 32> BlendMask(Mask.size(), -1);
	  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

	  const int NumElts = Mask.size();
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M < 0)
	      continue;

	    assert(M < NumElts * 2 && "Shuffle input is out of bounds.");

	    // Position the requested element occupies within its own input.
	    const int Slot = M % NumElts;
	    int &BlendElt = BlendMask[Slot];
	    if (BlendElt < 0)
	      BlendElt = M;
	    else if (BlendElt != M)
	      return SDValue(); // Can't blend in the needed input!

	    PermuteMask[Idx] = Slot;
	  }

	  // If only immediate blends, then bail if the blend mask can't be widened to
	  // i16.
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  const bool NeedsWidening = ImmBlends && EltBits == 8;
	  if (NeedsWidening && !canWidenShuffleElements(BlendMask))
	    return SDValue();

	  SDValue Blended = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
	  return DAG.getVectorShuffle(VT, DL, Blended, DAG.getUNDEF(VT), PermuteMask);
	}

	/// Try to lower as an unpack of elements from two inputs followed by
	/// a single-input permutation.
	///
	/// This matches the pattern where we can unpack elements from two inputs and
	/// then reduce the shuffle to a single-input (wider) permutation.
	///
	/// \returns the permuted unpack, or an empty SDValue if the mask cannot be
	/// expressed this way.
	static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
	                                             SDValue V1, SDValue V2,
	                                             ArrayRef<int> Mask,
	                                             SelectionDAG &DAG) {
	  int NumElts = Mask.size();
	  int NumLanes = VT.getSizeInBits() / 128;
	  int NumLaneElts = NumElts / NumLanes;
	  int NumHalfLaneElts = NumLaneElts / 2;

	  bool MatchLo = true, MatchHi = true;
	  // Ops[0] feeds the even result elements, Ops[1] the odd ones.
	  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};

	  // Determine UNPCKL/UNPCKH type and operand order.
	  for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
	    for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
	      int M = Mask[Lane + Elt];
	      if (M < 0)
	        continue;

	      // All even (resp. odd) elements have to come from one single input.
	      SDValue &Op = Ops[Elt & 1];
	      if (M < NumElts && (Op.isUndef() || Op == V1))
	        Op = V1;
	      else if (NumElts <= M && (Op.isUndef() || Op == V2))
	        Op = V2;
	      else
	        return SDValue();

	      // Every referenced element must sit in the low half of its lane (for
	      // UNPCKL) or in the high half (for UNPCKH), in either input.
	      int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
	      MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
	                 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
	      MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
	                 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
	      if (!MatchLo && !MatchHi)
	        return SDValue();
	    }
	  }
	  assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");

	  // Now check that each pair of elts come from the same unpack pair
	  // and set the permute mask based on each pair.
	  // TODO - Investigate cases where we permute individual elements.
	  SmallVector<int, 32> PermuteMask(NumElts, -1);
	  for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
	    for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
	      int M0 = Mask[Lane + Elt + 0];
	      int M1 = Mask[Lane + Elt + 1];
	      if (0 <= M0 && 0 <= M1 &&
	          (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
	        return SDValue();
	      if (0 <= M0)
	        PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
	      if (0 <= M1)
	        PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
	    }
	  }

	  unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
	  SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
	  return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
	}

	/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
	/// permuting the elements of the result in place.
	///
	/// The in-lane index ranges used from V1 and V2 are collected; if they do not
	/// overlap, a byte rotate brings both ranges into one vector, which is then
	/// permuted. \returns an empty SDValue when the subtarget lacks the required
	/// features, the mask crosses 128-bit lanes, or the ranges overlap.
	static SDValue lowerShuffleAsByteRotateAndPermute(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	  if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
	      (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
	      (VT.is512BitVector() && !Subtarget.hasBWI()))
	    return SDValue();

	  // We don't currently support lane crossing permutes.
	  if (is128BitLaneCrossingShuffleMask(VT, Mask))
	    return SDValue();

	  int Scale = VT.getScalarSizeInBits() / 8;
	  int NumLanes = VT.getSizeInBits() / 128;
	  int NumElts = VT.getVectorNumElements();
	  int NumEltsPerLane = NumElts / NumLanes;

	  // Determine range of mask elts.
	  // Blend1/Blend2 stay true while every element used from V1/V2 is in place.
	  bool Blend1 = true;
	  bool Blend2 = true;
	  std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
	  std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
	  for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
	    for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
	      int M = Mask[Lane + Elt];
	      if (M < 0)
	        continue;
	      if (M < NumElts) {
	        Blend1 &= (M == (Lane + Elt));
	        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
	        M = M % NumEltsPerLane;
	        Range1.first = std::min(Range1.first, M);
	        Range1.second = std::max(Range1.second, M);
	      } else {
	        M -= NumElts;
	        Blend2 &= (M == (Lane + Elt));
	        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
	        M = M % NumEltsPerLane;
	        Range2.first = std::min(Range2.first, M);
	        Range2.second = std::max(Range2.second, M);
	      }
	    }
	  }

	  // Bail if we don't need both elements.
	  // TODO - it might be worth doing this for unary shuffles if the permute
	  // can be widened.
	  if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
	      !(0 <= Range2.first && Range2.second < NumEltsPerLane))
	    return SDValue();

	  if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
	    return SDValue();

	  // Rotate the 2 ops so we can access both ranges, then permute the result.
	  auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
	    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
	    SDValue Rotate = DAG.getBitcast(
	        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
	                        DAG.getBitcast(ByteVT, Lo),
	                        DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
	    SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
	    for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
	      for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
	        int M = Mask[Lane + Elt];
	        if (M < 0)
	          continue;
	        if (M < NumElts)
	          PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
	        else
	          PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
	      }
	    }
	    return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
	  };

	  // Check if the ranges are small enough to rotate from either direction.
	  if (Range2.second < Range1.first)
	    return RotateAndPermute(V1, V2, Range1.first, 0);
	  if (Range1.second < Range2.first)
	    return RotateAndPermute(V2, V1, Range2.first, NumElts);
	  return SDValue();
	}

	/// Generic fallback that splits a two-input shuffle into independent
	/// per-input permutes plus a blend.
	///
	/// This matches the extremely common pattern for handling combined
	/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
	/// operations. Cheaper blend/unpack/rotate + permute forms are tried first
	/// when both inputs would otherwise need a real shuffle.
	static SDValue lowerShuffleAsDecomposedShuffleBlend(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	  // Per-input masks that move each element to its final position, plus the
	  // in-place blend that merges the two shuffled inputs.
	  SmallVector<int, 32> V1Mask(Mask.size(), -1);
	  SmallVector<int, 32> V2Mask(Mask.size(), -1);
	  SmallVector<int, 32> BlendMask(Mask.size(), -1);

	  const int NumElts = Mask.size();
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M < 0)
	      continue;
	    if (M < NumElts) {
	      V1Mask[Idx] = M;
	      BlendMask[Idx] = Idx;
	    } else {
	      V2Mask[Idx] = M - NumElts;
	      BlendMask[Idx] = Idx + NumElts;
	    }
	  }

	  // Try to lower with the simpler initial blend/unpack/rotate strategies unless
	  // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
	  // the shuffle may be able to fold with a load or other benefit. However, when
	  // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
	  // pre-shuffle first is a better strategy.
	  const bool BothInputsShuffled =
	      !isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask);
	  if (BothInputsShuffled) {
	    // Only prefer immediate blends to unpack/rotate.
	    SDValue BlendPerm =
	        lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true);
	    if (BlendPerm)
	      return BlendPerm;

	    SDValue UnpackPerm =
	        lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG);
	    if (UnpackPerm)
	      return UnpackPerm;

	    SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
	        DL, VT, V1, V2, Mask, Subtarget, DAG);
	    if (RotatePerm)
	      return RotatePerm;

	    // Unpack/rotate failed - try again with variable blends.
	    BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG);
	    if (BlendPerm)
	      return BlendPerm;
	  }

	  SDValue ShuffledV1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
	  SDValue ShuffledV2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
	  return DAG.getVectorShuffle(VT, DL, ShuffledV1, ShuffledV2, BlendMask);
	}

	/// Try to match a vector shuffle as a bit rotation.
	///
	/// The mask is split into groups of \p NumSubElts elements; every defined
	/// entry must stay inside its own group and all of them must agree on one
	/// rotation amount.
	/// Returns a ISD::ROTL element rotation amount or -1 if failed (also -1 when
	/// every mask entry is undef).
	static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
	  const int NumElts = Mask.size();
	  assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");

	  int RotateAmt = -1;
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M < 0)
	      continue;

	    // First element of the group that Idx belongs to.
	    const int GroupBase = Idx - (Idx % NumSubElts);
	    if (!isInRange(M, GroupBase, GroupBase + NumSubElts))
	      return -1;

	    const int Offset = (NumSubElts - (M - Idx)) % NumSubElts;
	    const bool HaveAmt = 0 <= RotateAmt;
	    if (HaveAmt && Offset != RotateAmt)
	      return -1;
	    RotateAmt = Offset;
	  }
	  return RotateAmt;
	}

	/// Search for the narrowest element grouping under which the mask is a bit
	/// rotation. On success \p RotateVT is set to the integer vector type whose
	/// elements are rotated and the rotation amount in bits is returned;
	/// otherwise returns -1 and leaves \p RotateVT untouched.
	static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
	                                   const X86Subtarget &Subtarget,
	                                   ArrayRef<int> Mask) {
	  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
	  assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");

	  // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
	  int MinSubElts = 2;
	  if (Subtarget.hasAVX512())
	    MinSubElts = std::max(32 / EltSizeInBits, 2);
	  const int MaxSubElts = 64 / EltSizeInBits;
	  const int NumElts = Mask.size();

	  for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
	    const int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
	    if (RotateAmt >= 0) {
	      // Each group of NumSubElts elements forms one wider integer to rotate.
	      const MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
	      RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
	      return RotateAmt * EltSizeInBits;
	    }
	  }

	  return -1;
	}

	/// Lower shuffle using X86ISD::VROTLI rotations.
	///
	/// Without a native rotate the rotation is expanded to OR(VSHLI, VSRLI).
	/// \returns the rotated value bitcast back to VT, or an empty SDValue if the
	/// mask is not a bit rotation or this lowering is not used on the subtarget.
	static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
	                                       ArrayRef<int> Mask,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  // Only XOP + AVX512 targets have bit rotation instructions.
	  // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
	  // NOTE(review): the comment above names SSSE3, but the check below tests
	  // hasSSE3() - confirm which feature level is intended.
	  bool IsLegal =
	      (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
	  if (!IsLegal && Subtarget.hasSSE3())
	    return SDValue();

	  MVT RotateVT;
	  int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
	                                          Subtarget, Mask);
	  if (RotateAmt < 0)
	    return SDValue();

	  // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
	  // expanded to OR(SRL,SHL), will be more efficient, but if they can
	  // widen to vXi16 or more then existing lowering will be better.
	  if (!IsLegal) {
	    if ((RotateAmt % 16) == 0)
	      return SDValue();
	    // TODO: Use getTargetVShiftByConstNode.
	    unsigned ShlAmt = RotateAmt;
	    unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
	    V1 = DAG.getBitcast(RotateVT, V1);
	    SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
	                              DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
	    SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
	                              DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
	    SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
	    return DAG.getBitcast(VT, Rot);
	  }

	  SDValue Rot =
	      DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
	                  DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
	  return DAG.getBitcast(VT, Rot);
	}

	/// Try to match a vector shuffle as an element rotation.
	///
	/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
	///
	/// \returns the rotation amount in elements, or -1 if the mask is not a
	/// rotation. On success \p V1 and \p V2 are overwritten with the inputs that
	/// supply the low and the high part of the rotation; if only one part is
	/// referenced, both are set to that same input.
	static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
	                                       ArrayRef<int> Mask) {
	  int NumElts = Mask.size();

	  // We need to detect various ways of spelling a rotation:
	  // [11, 12, 13, 14, 15, 0, 1, 2]
	  // [-1, 12, 13, 14, -1, -1, 1, -1]
	  // [-1, -1, -1, -1, -1, -1, 1, 2]
	  // [ 3, 4, 5, 6, 7, 8, 9, 10]
	  // [-1, 4, 5, 6, -1, -1, 9, -1]
	  // [-1, 4, 5, 6, -1, -1, -1, -1]
	  int Rotation = 0;
	  SDValue Lo, Hi;
	  for (int i = 0; i < NumElts; ++i) {
	    int M = Mask[i];
	    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
	           "Unexpected mask index.");
	    if (M < 0)
	      continue;

	    // Determine where a rotated vector would have started.
	    int StartIdx = i - (M % NumElts);
	    if (StartIdx == 0)
	      // The identity rotation isn't interesting, stop.
	      return -1;

	    // If we found the tail of a vector the rotation must be the missing
	    // front. If we found the head of a vector, it must be how much of the
	    // head.
	    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

	    if (Rotation == 0)
	      Rotation = CandidateRotation;
	    else if (Rotation != CandidateRotation)
	      // The rotations don't match, so we can't match this mask.
	      return -1;

	    // Compute which value this mask is pointing at.
	    SDValue MaskV = M < NumElts ? V1 : V2;

	    // Compute which of the two target values this index should be assigned
	    // to. This reflects whether the high elements are remaining or the low
	    // elements are remaining.
	    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

	    // Either set up this value if we've not encountered it before, or check
	    // that it remains consistent.
	    if (!TargetV)
	      TargetV = MaskV;
	    else if (TargetV != MaskV)
	      // This may be a rotation, but it pulls from the inputs in some
	      // unsupported interleaving.
	      return -1;
	  }

	  // Check that we successfully analyzed the mask, and normalize the results.
	  assert(Rotation != 0 && "Failed to locate a viable rotation!");
	  assert((Lo || Hi) && "Failed to find a rotated input vector!");
	  if (!Lo)
	    Lo = Hi;
	  else if (!Hi)
	    Hi = Lo;

	  V1 = Lo;
	  V2 = Hi;

	  return Rotation;
	}

	/// Try to match a vector shuffle as a byte rotation.
	///
	/// SSSE3 provides PALIGNR, which performs an arbitrary byte rotation of the
	/// concatenation of two vectors; earlier targets can get a similar effect
	/// from a PSRLDQ/PSLLDQ/POR sequence. This routine only decides whether the
	/// mask can be lowered in that form; it makes no profitability judgement
	/// between the two. It matches shuffles such as:
	///
	///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
	///
	/// Conceptually V1 and V2 are concatenated, shifted right by some number of
	/// elements, and the low elements are kept. Although described as a right
	/// shift (x86 being little-endian), this is a *left rotate* of the vector
	/// lanes.
	///
	/// Returns the rotation in bytes, or -1 if the mask does not match.
	static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
	                                    ArrayRef<int> Mask) {
	  // Masks containing zero elements are not accepted.
	  if (isAnyZero(Mask))
	    return -1;

	  // PALIGNR operates per 128-bit lane, so the mask has to repeat per lane.
	  SmallVector<int, 16> LaneMask;
	  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask))
	    return -1;

	  const int EltRotation = matchShuffleAsElementRotate(V1, V2, LaneMask);
	  if (EltRotation <= 0)
	    return -1;

	  // Convert the element rotation into bytes using the number of bytes each
	  // element occupies within a 16-byte lane.
	  const int EltsPerLane = LaneMask.size();
	  const int BytesPerElt = 16 / EltsPerLane;
	  return EltRotation * BytesPerElt;
	}

	/// Lower a shuffle matched by matchShuffleAsByteRotate, using PALIGNR when
	/// SSSE3 is available and a VSHLDQ/VSRLDQ/OR sequence otherwise. Returns an
	/// empty SDValue when the mask is not a byte rotation.
	static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
	                                        SDValue V2, ArrayRef<int> Mask,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

	  SDValue LoV = V1, HiV = V2;
	  const int NumBytes = matchShuffleAsByteRotate(VT, LoV, HiV, Mask);
	  if (NumBytes <= 0)
	    return SDValue();

	  // Both PALIGNR and PSLLDQ/PSRLDQ work on byte vectors, so reinterpret the
	  // inputs as an i8 vector of the same total width.
	  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
	  LoV = DAG.getBitcast(ByteVT, LoV);
	  HiV = DAG.getBitcast(ByteVT, HiV);

	  // With SSSE3 a single PALIGNR does the job.
	  if (Subtarget.hasSSSE3()) {
	    assert((!VT.is512BitVector() \|\| Subtarget.hasBWI()) &&
	           "512-bit PALIGNR requires BWI instructions");
	    SDValue Amt = DAG.getTargetConstant(NumBytes, DL, MVT::i8);
	    SDValue Aligned =
	        DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, LoV, HiV, Amt);
	    return DAG.getBitcast(VT, Aligned);
	  }

	  assert(VT.is128BitVector() &&
	         "Rotate-based lowering only supports 128-bit lowering!");
	  assert(Mask.size() <= 16 &&
	         "Can shuffle at most 16 bytes in a 128-bit vector!");
	  assert(ByteVT == MVT::v16i8 &&
	         "SSE2 rotate lowering only needed for v16i8!");

	  // Default SSE2 implementation: shift each input towards its final
	  // position and OR the two halves together.
	  auto ShiftBytes = [&](unsigned Opc, SDValue Src, int Amt) {
	    return DAG.getNode(Opc, DL, MVT::v16i8, Src,
	                       DAG.getTargetConstant(Amt, DL, MVT::i8));
	  };
	  SDValue ShiftedLo = ShiftBytes(X86ISD::VSHLDQ, LoV, 16 - NumBytes);
	  SDValue ShiftedHi = ShiftBytes(X86ISD::VSRLDQ, HiV, NumBytes);
	  SDValue Merged =
	      DAG.getNode(ISD::OR, DL, MVT::v16i8, ShiftedLo, ShiftedHi);
	  return DAG.getBitcast(VT, Merged);
	}

	/// Try to lower a vector shuffle as a dword/qword rotation.
	///
	/// AVX512 provides VALIGND/VALIGNQ, which perform an arbitrary element
	/// rotation of the concatenation of two vectors. This routine lowers a
	/// shuffle through that pattern when the mask allows it.
	///
	/// Conceptually V1 and V2 are concatenated, shifted right by some number of
	/// elements, and the low elements are kept. Although described as a right
	/// shift (x86 being little-endian), this is a *left rotate* of the vector
	/// lanes.
	///
	/// Returns an empty SDValue when the mask is not an element rotation.
	static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
	                                    SDValue V2, ArrayRef<int> Mask,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  assert((VT.getScalarType() == MVT::i32 \|\| VT.getScalarType() == MVT::i64) &&
	         "Only 32-bit and 64-bit elements are supported!");

	  // The 128-bit and 256-bit forms exist only with VLX.
	  assert((Subtarget.hasVLX() \|\| (!VT.is128BitVector() && !VT.is256BitVector()))
	         && "VLX required for 128/256-bit vectors");

	  SDValue LoV = V1, HiV = V2;
	  const int EltRotation = matchShuffleAsElementRotate(LoV, HiV, Mask);
	  if (EltRotation > 0) {
	    SDValue Amt = DAG.getTargetConstant(EltRotation, DL, MVT::i8);
	    return DAG.getNode(X86ISD::VALIGN, DL, VT, LoV, HiV, Amt);
	  }

	  return SDValue();
	}

	/// Try to lower a vector shuffle as a byte shift sequence.
	///
	/// Handles 128-bit shuffles whose result has zeroable elements at one or
	/// both ends and a sequential run of elements from a single source in
	/// between. Returns an empty SDValue when the mask does not have that shape
	/// or when the three-shift form is not used (both ends zero with SSSE3).
	static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
	                                           SDValue V2, ArrayRef<int> Mask,
	                                           const APInt &Zeroable,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
	  assert(VT.is128BitVector() && "Only 128-bit vectors supported");

	  // Count the zeroable elements at the low and high ends of the result; at
	  // least one end must have some.
	  const unsigned ZeroLo = Zeroable.countTrailingOnes();
	  const unsigned ZeroHi = Zeroable.countLeadingOnes();
	  if (ZeroLo == 0 && ZeroHi == 0)
	    return SDValue();

	  // The elements between the zero runs must be sequential (or undef).
	  const unsigned NumElts = Mask.size();
	  const unsigned Len = NumElts - (ZeroLo + ZeroHi);
	  if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
	    return SDValue();

	  // Bytes per element, used to turn element counts into byte shifts.
	  const unsigned Scale = VT.getScalarSizeInBits() / 8;

	  // The inner run must come entirely from V1 or entirely from V2.
	  ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
	  const bool AllFromV1 = isUndefOrInRange(StubMask, 0, NumElts);
	  if (!AllFromV1 && !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
	    return SDValue();

	  SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
	  Res = DAG.getBitcast(MVT::v16i8, Res);

	  // Apply one whole-register byte shift to Res, by a number of elements.
	  auto ShiftElts = [&](unsigned Opc, unsigned Elts) {
	    Res = DAG.getNode(Opc, DL, MVT::v16i8, Res,
	                      DAG.getTargetConstant(Scale * Elts, DL, MVT::i8));
	  };
	  // Elements above the last referenced source element.
	  auto EltsAboveRun = [&]() -> unsigned {
	    return (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
	  };
	  // Elements below the first referenced source element.
	  auto EltsBelowRun = [&]() -> unsigned { return Mask[ZeroLo] % NumElts; };

	  // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
	  // inner sequential set of elements, possibly offset:
	  //   01234567 --> zzzzzz01 --> 1zzzzzzz
	  //   01234567 --> 4567zzzz --> zzzzz456
	  //   01234567 --> z0123456 --> 3456zzzz --> zz3456zz
	  if (ZeroLo == 0) {
	    // Only the high end is zero: push the run to the top, then pull it down.
	    ShiftElts(X86ISD::VSHLDQ, EltsAboveRun());
	    ShiftElts(X86ISD::VSRLDQ, ZeroHi);
	  } else if (ZeroHi == 0) {
	    // Only the low end is zero: pull the run to the bottom, then push it up.
	    ShiftElts(X86ISD::VSRLDQ, EltsBelowRun());
	    ShiftElts(X86ISD::VSHLDQ, ZeroLo);
	  } else if (!Subtarget.hasSSSE3()) {
	    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
	    // by performing 3 byte shifts. Shuffle combining can kick in above that.
	    // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
	    unsigned Shift = EltsAboveRun();
	    ShiftElts(X86ISD::VSHLDQ, Shift);
	    Shift += EltsBelowRun();
	    ShiftElts(X86ISD::VSRLDQ, Shift);
	    ShiftElts(X86ISD::VSHLDQ, ZeroLo);
	  } else {
	    return SDValue();
	  }

	  return DAG.getBitcast(VT, Res);
	}

	/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
	///
	/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
	/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
	/// matches elements from one of the input vectors shuffled to the left or
	/// right with zeroable elements 'shifted in'. It handles both the strictly
	/// bit-wise element shifts and the byte shift across an entire 128-bit double
	/// quad word lane.
	///
	/// PSHL : (little-endian) left bit shift.
	/// [ zz, 0, zz, 2 ]
	/// [ -1, 4, zz, -1 ]
	/// PSRL : (little-endian) right bit shift.
	/// [ 1, zz, 3, zz]
	/// [ -1, -1, 7, zz]
	/// PSLLDQ : (little-endian) left byte shift
	/// [ zz, 0, 1, 2, 3, 4, 5, 6]
	/// [ zz, zz, -1, -1, 2, 3, 4, -1]
	/// [ zz, zz, zz, zz, zz, zz, -1, 1]
	/// PSRLDQ : (little-endian) right byte shift
	/// [ 5, 6, 7, zz, zz, zz, zz, zz]
	/// [ -1, 5, 6, 7, zz, zz, zz, zz]
	/// [ 1, 2, -1, -1, -1, -1, zz, zz]
	///
	/// \param ShiftVT [out] vector type to perform the shift in; written only
	/// when a candidate mask match succeeds.
	/// \param Opcode [out] one of X86ISD::VSHLI, VSRLI, VSHLDQ or VSRLDQ; written
	/// only when a candidate mask match succeeds.
	/// \param ScalarSizeInBits width in bits of one mask element.
	/// \param MaskOffset value added to the expected source indices (the caller
	/// lowerShuffleAsShift passes 0 to test V1 and Mask.size() to test V2).
	/// \param Zeroable bit i set means result element i may be treated as zero.
	/// \returns the shift amount (in bits for VSHLI/VSRLI, in bytes for
	/// VSHLDQ/VSRLDQ), or -1 if no shift matches.
	static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
	unsigned ScalarSizeInBits, ArrayRef<int> Mask,
	int MaskOffset, const APInt &Zeroable,
	const X86Subtarget &Subtarget) {
	int Size = Mask.size();
	unsigned SizeInBits = Size * ScalarSizeInBits;

	// For every group of Scale elements, check that the Shift elements which
	// would be shifted in (at the start of the group for a left shift, at the
	// end for a right shift) are all zeroable.
	auto CheckZeros = [&](int Shift, int Scale, bool Left) {
	for (int i = 0; i < Size; i += Scale)
	for (int j = 0; j < Shift; ++j)
	if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
	return false;

	return true;
	};

	// For every group of Scale elements, check that the remaining
	// (Scale - Shift) elements are sequential (or undef) source elements in the
	// shifted position. On success, fill in Opcode and ShiftVT and return the
	// shift amount; on failure return -1 and leave the outputs untouched.
	auto MatchShift = [&](int Shift, int Scale, bool Left) {
	for (int i = 0; i != Size; i += Scale) {
	unsigned Pos = Left ? i + Shift : i;
	unsigned Low = Left ? i : i + Shift;
	unsigned Len = Scale - Shift;
	if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
	return -1;
	}

	// Groups wider than 64 bits are shifted with the byte-shift opcodes and
	// the amount is expressed in bytes; narrower groups use bit shifts.
	int ShiftEltBits = ScalarSizeInBits * Scale;
	bool ByteShift = ShiftEltBits > 64;
	Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
	: (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
	int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

	// Normalize the scale for byte shifts to still produce an i64 element
	// type.
	Scale = ByteShift ? Scale / 2 : Scale;

	// We need to round trip through the appropriate type for the shift.
	MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
	ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
	: MVT::getVectorVT(ShiftSVT, Size / Scale);
	return (int)ShiftAmt;
	};

	// SSE/AVX supports logical shifts up to 64-bit integers - so we can just
	// keep doubling the size of the integer elements up to that. We can
	// then shift the elements of the integer vector by whole multiples of
	// their width within the elements of the larger integer vector. Test each
	// multiple to see if we can find a match with the moved element indices
	// and that the shifted in elements are all zeroable.
	// The 128-bit (byte shift) group width is not tried for 512-bit vectors
	// unless the subtarget has BWI.
	unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
	// Search order: smallest group first, then smallest shift, then left before
	// right; the first positive match is returned.
	for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
	for (int Shift = 1; Shift != Scale; ++Shift)
	for (bool Left : {true, false})
	if (CheckZeros(Shift, Scale, Left)) {
	int ShiftAmt = MatchShift(Shift, Scale, Left);
	if (0 < ShiftAmt)
	return ShiftAmt;
	}

	// no match
	return -1;
	}

	/// Lower a shuffle as a single logical shift of V1 or V2, using the opcode,
	/// type and amount chosen by matchShuffleAsShift. Returns an empty SDValue
	/// when neither input matches.
	static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
	                                   SDValue V2, ArrayRef<int> Mask,
	                                   const APInt &Zeroable,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  const int NumElts = Mask.size();
	  assert(NumElts == (int)VT.getVectorNumElements() && "Unexpected mask size");

	  MVT ShiftVT;
	  unsigned Opcode;

	  // First see whether the mask is a shift of V1 (source indices start at 0).
	  SDValue Src = V1;
	  int Amt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
	                                Mask, 0, Zeroable, Subtarget);
	  if (Amt < 0) {
	    // Otherwise retry against V2, whose indices start at NumElts.
	    Amt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
	                              Mask, NumElts, Zeroable, Subtarget);
	    Src = V2;
	    if (Amt < 0)
	      return SDValue();
	  }

	  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
	         "Illegal integer vector type");

	  // Round-trip through the shift type.
	  SDValue Casted = DAG.getBitcast(ShiftVT, Src);
	  SDValue Shifted = DAG.getNode(Opcode, DL, ShiftVT, Casted,
	                                DAG.getTargetConstant(Amt, DL, MVT::i8));
	  return DAG.getBitcast(VT, Shifted);
	}

	// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
	// Remainder of lower half result is zero and upper half is all undef.
	//
	// On success V1 is replaced by the matched source and BitLen/BitIdx receive
	// the extraction length and start position in bits, each masked to 6 bits.
	static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
	                                ArrayRef<int> Mask, uint64_t &BitLen,
	                                uint64_t &BitIdx, const APInt &Zeroable) {
	  const int Size = Mask.size();
	  const int HalfSize = Size / 2;
	  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
	  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

	  // The instruction leaves the upper half undefined, so the mask must too.
	  if (!isUndefUpperHalf(Mask))
	    return false;

	  // The extraction length is the lower half with its trailing zeroable
	  // elements trimmed off.
	  int Len = HalfSize;
	  while (Len > 0 && Zeroable[Len - 1])
	    --Len;
	  assert(Len > 0 && "Zeroable shuffle mask");

	  // The first Len elements must be sequential elements of one source.
	  SDValue Src;
	  int Idx = -1;
	  for (int i = 0; i != Len; ++i) {
	    if (Mask[i] == SM_SentinelUndef)
	      continue;

	    SDValue &V = (Mask[i] < Size ? V1 : V2);
	    const int Elt = Mask[i] % Size;
	    const int Start = Elt - i;

	    // The run must begin at a valid (non-negative) index and every
	    // referenced element must sit in the lower half of the source.
	    if (Start < 0 \|\| Elt >= HalfSize)
	      return false;

	    // Once a start has been chosen, all later elements must agree on both
	    // the source and the start index.
	    if (Idx >= 0 && !(Src == V && Idx == Start))
	      return false;

	    Src = V;
	    Idx = Start;
	  }

	  if (!Src \|\| Idx < 0)
	    return false;

	  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
	  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
	  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
	  V1 = Src;
	  return true;
	}

	// INSERTQ: Extract lowest Len elements from lower half of second source and
	// insert over first source, starting at Idx.
	// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
	//
	// On success V1 is set to the base vector A (left empty if the mask never
	// references it), V2 is set to the inserted vector B, and BitLen/BitIdx
	// receive the insertion length and position in bits, each masked to 6 bits.
	// Returns false, leaving all outputs untouched, if no (Idx, Len) pair fits.
	static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
	ArrayRef<int> Mask, uint64_t &BitLen,
	uint64_t &BitIdx) {
	int Size = Mask.size();
	int HalfSize = Size / 2;
	assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

	// Upper half must be undefined.
	if (!isUndefUpperHalf(Mask))
	return false;

	// Try every insertion point in the lower half, lowest first.
	for (int Idx = 0; Idx != HalfSize; ++Idx) {
	// Base vector implied by the elements seen so far; empty while only undef
	// elements have been seen.
	SDValue Base;

	// Attempt to match first source from mask before insertion point.
	if (isUndefInRange(Mask, 0, Idx)) {
	/* EMPTY */
	} else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
	Base = V1;
	} else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
	Base = V2;
	} else {
	continue;
	}

	// Extend the extraction length looking to match both the insertion of
	// the second source and the remaining elements of the first.
	for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
	SDValue Insert;
	int Len = Hi - Idx;

	// Match insertion.
	// The inserted elements must be the lowest Len elements (starting at
	// index 0) of either input.
	if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
	Insert = V1;
	} else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
	Insert = V2;
	} else {
	continue;
	}

	// Match the remaining elements of the lower half.
	// They must be in-place elements of the base vector, and must agree with
	// any base already chosen from the elements before the insertion point.
	if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
	/* EMPTY */
	} else if ((!Base \|\| (Base == V1)) &&
	isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
	Base = V1;
	} else if ((!Base \|\| (Base == V2)) &&
	isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
	Size + Hi)) {
	Base = V2;
	} else {
	continue;
	}

	// First fit wins: report it through the output parameters.
	BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
	BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
	V1 = Base;
	V2 = Insert;
	return true;
	}
	}

	return false;
	}

	/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
	///
	/// EXTRQ is tried first, then INSERTQ. Returns an empty SDValue when the
	/// mask matches neither form.
	static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
	                                     SDValue V2, ArrayRef<int> Mask,
	                                     const APInt &Zeroable, SelectionDAG &DAG) {
	  uint64_t BitLen, BitIdx;

	  // Extraction of a run from the lower half of one source.
	  if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) {
	    SDValue LenOp = DAG.getTargetConstant(BitLen, DL, MVT::i8);
	    SDValue IdxOp = DAG.getTargetConstant(BitIdx, DL, MVT::i8);
	    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, LenOp, IdxOp);
	  }

	  if (!matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
	    return SDValue();

	  // The matcher may leave either operand empty; substitute undef for it.
	  SDValue BaseOp = V1 ? V1 : DAG.getUNDEF(VT);
	  SDValue InsertOp = V2 ? V2 : DAG.getUNDEF(VT);
	  SDValue LenOp = DAG.getTargetConstant(BitLen, DL, MVT::i8);
	  SDValue IdxOp = DAG.getTargetConstant(BitIdx, DL, MVT::i8);
	  return DAG.getNode(X86ISD::INSERTQI, DL, VT, BaseOp, InsertOp, LenOp, IdxOp);
	}

	/// Lower a vector shuffle as a zero or any extension.
	///
	/// Given a specific number of elements, element bit width, and extension
	/// stride, produce either a zero or any extension based on the available
	/// features of the subtarget. The extended elements are consecutive and
	/// can start from an offset element index in the input; to avoid excess
	/// shuffling the offset must either be in the bottom lane or at the start
	/// of a higher lane. All extended elements must be from the same lane.
	///
	/// \param Scale number of result elements occupied by each extended element;
	/// must be greater than 1 and Scale * element bits must not exceed 64.
	/// \param Offset index in \p InputV of the first element to extend.
	/// \param AnyExt true if the upper bits of each extended element may be
	/// anything (any-extend) rather than zero.
	static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
	const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
	ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	assert(Scale > 1 && "Need a scale to extend.");
	int EltBits = VT.getScalarSizeInBits();
	int NumElements = VT.getVectorNumElements();
	int NumEltsPerLane = 128 / EltBits;
	int OffsetLane = Offset / NumEltsPerLane;
	assert((EltBits == 8 \|\| EltBits == 16 \|\| EltBits == 32) &&
	"Only 8, 16, and 32 bit elements can be extended.");
	assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
	assert(0 <= Offset && "Extension offset must be positive.");
	assert((Offset < NumEltsPerLane \|\| Offset % NumEltsPerLane == 0) &&
	"Extension offset must be in the first lane or start an upper lane.");

	// Check that an index is in same lane as the base offset.
	auto SafeOffset = [&](int Idx) {
	return OffsetLane == (Idx / NumEltsPerLane);
	};

	// Shift along an input so that the offset base moves to the first element.
	// Only the first NumElements / Scale result elements are populated, and any
	// source index that leaves the offset's lane is left undef.
	auto ShuffleOffset = [&](SDValue V) {
	if (!Offset)
	return V;

	SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
	for (int i = 0; i * Scale < NumElements; ++i) {
	int SrcIdx = i + Offset;
	ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
	}
	return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
	};

	// Found a valid a/zext mask! Try various lowering strategies based on the
	// input type and available ISA extensions.
	if (Subtarget.hasSSE41()) {
	// Not worth offsetting 128-bit vectors if scale == 2, a pattern using
	// PUNPCK will catch this in a later shuffle match.
	if (Offset && Scale == 2 && VT.is128BitVector())
	return SDValue();
	// Move the offset base to element 0, then emit an in-vector any/zero
	// extend to the wider element type.
	MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
	NumElements / Scale);
	InputV = ShuffleOffset(InputV);
	InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL,
	ExtVT, InputV, DAG);
	return DAG.getBitcast(VT, InputV);
	}

	// Everything below handles targets without SSE4.1.
	assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

	// For any extends we can cheat for larger element sizes and use shuffle
	// instructions that can fold with a load and/or copy.
	if (AnyExt && EltBits == 32) {
	int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
	-1};
	return DAG.getBitcast(
	VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
	DAG.getBitcast(MVT::v4i32, InputV),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
	}
	if (AnyExt && EltBits == 16 && Scale > 2) {
	// First place the dwords containing the wanted i16 elements with PSHUFD,
	// then fix up the word position within the dword with PSHUFLW/PSHUFHW.
	int PSHUFDMask[4] = {Offset / 2, -1,
	SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
	InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
	DAG.getBitcast(MVT::v4i32, InputV),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
	int PSHUFWMask[4] = {1, -1, -1, -1};
	unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
	return DAG.getBitcast(
	VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
	DAG.getBitcast(MVT::v8i16, InputV),
	getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
	}

	// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
	// to 64-bits.
	if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
	assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
	assert(VT.is128BitVector() && "Unexpected vector width!");

	// Extract element Offset (EltBits wide, at bit position LoIdx) into the low
	// 64 bits.
	int LoIdx = Offset * EltBits;
	SDValue Lo = DAG.getBitcast(
	MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
	DAG.getTargetConstant(EltBits, DL, MVT::i8),
	DAG.getTargetConstant(LoIdx, DL, MVT::i8)));

	// If the upper half of the result is unused, or the next element would
	// come from a different lane, the low extraction alone is enough.
	if (isUndefUpperHalf(Mask) \|\| !SafeOffset(Offset + 1))
	return DAG.getBitcast(VT, Lo);

	// Otherwise extract element Offset + 1 as well and combine both 64-bit
	// results with UNPCKL.
	int HiIdx = (Offset + 1) * EltBits;
	SDValue Hi = DAG.getBitcast(
	MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
	DAG.getTargetConstant(EltBits, DL, MVT::i8),
	DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
	return DAG.getBitcast(VT,
	DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
	}

	// If this would require more than 2 unpack instructions to expand, use
	// pshufb when available. We can only use more than 2 unpack instructions
	// when zero extending i8 elements which also makes it easier to use pshufb.
	if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
	assert(NumElements == 16 && "Unexpected byte vector width!");
	SDValue PSHUFBMask[16];
	for (int i = 0; i < 16; ++i) {
	int Idx = Offset + (i / Scale);
	// The first byte of each extended element selects the source byte; the
	// remaining bytes are undef (any-extend) or 0x80 (zero-extend).
	if ((i % Scale == 0 && SafeOffset(Idx))) {
	PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
	continue;
	}
	PSHUFBMask[i] =
	AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
	}
	InputV = DAG.getBitcast(MVT::v16i8, InputV);
	return DAG.getBitcast(
	VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
	DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
	}

	// If we are extending from an offset, ensure we start on a boundary that
	// we can unpack from.
	int AlignToUnpack = Offset % (NumElements / Scale);
	if (AlignToUnpack) {
	// Shuffle the input down by AlignToUnpack elements; the vacated top
	// elements are left undef.
	SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
	for (int i = AlignToUnpack; i < NumElements; ++i)
	ShMask[i - AlignToUnpack] = i;
	InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
	Offset -= AlignToUnpack;
	}

	// Otherwise emit a sequence of unpacks.
	// Each iteration interleaves the input with undef (any-extend) or zero
	// (zero-extend), doubling the element width and halving Scale and the
	// element count until Scale reaches 1.
	do {
	unsigned UnpackLoHi = X86ISD::UNPCKL;
	if (Offset >= (NumElements / 2)) {
	UnpackLoHi = X86ISD::UNPCKH;
	Offset -= (NumElements / 2);
	}

	MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
	SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
	: getZeroVector(InputVT, Subtarget, DAG, DL);
	InputV = DAG.getBitcast(InputVT, InputV);
	InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
	Scale /= 2;
	EltBits *= 2;
	NumElements /= 2;
	} while (Scale > 1);
	return DAG.getBitcast(VT, InputV);
	}

	/// Try to lower a vector shuffle as a zero extension on any microarch.
	///
	/// This routine will try to do everything in its power to cleverly lower
	/// a shuffle which happens to match the pattern of a zero extend. It doesn't
	/// check for the profitability of this lowering, it tries to aggressively
	/// match this pattern. It will use all of the micro-architectural details it
	/// can to emit an efficient lowering. It handles both blends with all-zero
	/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
	/// masking out later).
	///
	/// The reason we have dedicated lowering for zext-style shuffles is that they
	/// are both incredibly common and often quite performance sensitive.
	///
	/// Returns an empty SDValue when no extension (or MOVQ-style low-half zero
	/// extension for 128-bit vectors) matches the mask.
	static SDValue lowerShuffleAsZeroOrAnyExtend(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	int Bits = VT.getSizeInBits();
	int NumLanes = Bits / 128;
	int NumElements = VT.getVectorNumElements();
	int NumEltsPerLane = NumElements / NumLanes;
	assert(VT.getScalarSizeInBits() <= 32 &&
	"Exceeds 32-bit integer zero extension limit");
	assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

	// Define a helper function to check a particular ext-scale and lower to it if
	// valid.
	auto Lower = [&](int Scale) -> SDValue {
	// InputV: the single source the base elements come from.
	// AnyExt: stays true while every in-between element is undef.
	// Offset: source index of the first extended element.
	// Matches: number of defined base elements seen.
	SDValue InputV;
	bool AnyExt = true;
	int Offset = 0;
	int Matches = 0;
	for (int i = 0; i < NumElements; ++i) {
	int M = Mask[i];
	if (M < 0)
	continue; // Valid anywhere but doesn't tell us anything.
	if (i % Scale != 0) {
	// Each of the extended elements need to be zeroable.
	if (!Zeroable[i])
	return SDValue();

	// We no longer are in the anyext case.
	AnyExt = false;
	continue;
	}

	// Each of the base elements needs to be consecutive indices into the
	// same input vector.
	SDValue V = M < NumElements ? V1 : V2;
	M = M % NumElements;
	if (!InputV) {
	InputV = V;
	Offset = M - (i / Scale);
	} else if (InputV != V)
	return SDValue(); // Flip-flopping inputs.

	// Offset must start in the lowest 128-bit lane or at the start of an
	// upper lane.
	// FIXME: Is it ever worth allowing a negative base offset?
	if (!((0 <= Offset && Offset < NumEltsPerLane) \|\|
	(Offset % NumEltsPerLane) == 0))
	return SDValue();

	// If we are offsetting, all referenced entries must come from the same
	// lane.
	if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
	return SDValue();

	// M was already reduced modulo NumElements above, so the modulo here does
	// not change its value.
	if ((M % NumElements) != (Offset + (i / Scale)))
	return SDValue(); // Non-consecutive strided elements.
	Matches++;
	}

	// If we fail to find an input, we have a zero-shuffle which should always
	// have already been handled.
	// FIXME: Maybe handle this here in case during blending we end up with one?
	if (!InputV)
	return SDValue();

	// If we are offsetting, don't extend if we only match a single input, we
	// can always do better by using a basic PSHUF or PUNPCK.
	if (Offset != 0 && Matches < 2)
	return SDValue();

	return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
	InputV, Mask, Subtarget, DAG);
	};

	// The widest scale possible for extending is to a 64-bit integer.
	assert(Bits % 64 == 0 &&
	"The number of bits in a vector must be divisible by 64 on x86!");
	int NumExtElements = Bits / 64;

	// Each iteration, try extending the elements half as much, but into twice as
	// many elements.
	for (; NumExtElements < NumElements; NumExtElements *= 2) {
	assert(NumElements % NumExtElements == 0 &&
	"The input vector size must be divisible by the extended size.");
	if (SDValue V = Lower(NumElements / NumExtElements))
	return V;
	}

	// General extends failed, but 128-bit vectors may be able to use MOVQ.
	if (Bits != 128)
	return SDValue();

	// Returns one of the source operands if the shuffle can be reduced to a
	// MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
	auto CanZExtLowHalf = [&]() {
	// The whole upper half must be zeroable...
	for (int i = NumElements / 2; i != NumElements; ++i)
	if (!Zeroable[i])
	return SDValue();
	// ...and the lower half must be the in-place lower half of V1 or of V2.
	if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
	return V1;
	if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
	return V2;
	return SDValue();
	};

	if (SDValue V = CanZExtLowHalf()) {
	V = DAG.getBitcast(MVT::v2i64, V);
	V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
	return DAG.getBitcast(VT, V);
	}

	// No viable ext lowering found.
	return SDValue();
	}

	/// Try to get a scalar value for a specific element of a vector.
	///
	/// Looks through bitcasts to a BUILD_VECTOR (any index) or SCALAR_TO_VECTOR
	/// (index 0 only) node and returns the operand at \p Idx bitcast to the
	/// element type of \p V. Returns an empty SDValue if no such scalar is found.
	static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
	                                              SelectionDAG &DAG) {
	  MVT OrigVT = V.getSimpleValueType();
	  MVT EltVT = OrigVT.getVectorElementType();
	  SDValue Src = peekThroughBitcasts(V);

	  // If the bitcasts changed the element size, there is no equivalent
	  // element to extract.
	  MVT SrcVT = Src.getSimpleValueType();
	  if (!SrcVT.isVector())
	    return SDValue();
	  if (SrcVT.getScalarSizeInBits() != OrigVT.getScalarSizeInBits())
	    return SDValue();

	  const unsigned Opc = Src.getOpcode();
	  const bool IsBuildVector = Opc == ISD::BUILD_VECTOR;
	  const bool IsLowScalarToVector =
	      Idx == 0 && Opc == ISD::SCALAR_TO_VECTOR;
	  if (!IsBuildVector && !IsLowScalarToVector)
	    return SDValue();

	  // Ensure the scalar operand is the same size as the destination.
	  // FIXME: Add support for scalar truncation where possible.
	  SDValue Scalar = Src.getOperand(Idx);
	  if (EltVT.getSizeInBits() != Scalar.getSimpleValueType().getSizeInBits())
	    return SDValue();

	  return DAG.getBitcast(EltVT, Scalar);
	}

	/// Helper to test for a load that can be folded with x86 shuffles.
	///
	/// This matters because the set of usable instructions varies significantly
	/// depending on whether the operand is a load. Bitcasts are looked through
	/// before testing for a non-extending load.
	static bool isShuffleFoldableLoad(SDValue V) {
	  SDValue Src = peekThroughBitcasts(V);
	  return ISD::isNON_EXTLoad(Src.getNode());
	}

	/// Try to lower insertion of a single element into a zero vector.
	///
	/// This is a common pattern that we have especially efficient patterns to lower
	/// across all subtarget feature sets.
	///
	/// The inserted element is the first mask entry that refers to V2. Returns an
	/// empty SDValue when the shuffle cannot be lowered this way.
	static SDValue lowerShuffleAsElementInsertion(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT ExtVT = VT;
	MVT EltVT = VT.getVectorElementType();

	// Position in the result of the first element taken from V2.
	// NOTE(review): if no mask entry refers to V2 this equals Mask.size() and
	// Mask[V2Index] below reads past the end - presumably callers guarantee a
	// V2 element is present; confirm.
	int V2Index =
	find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
	Mask.begin();
	// V1 counts as a zero vector if every other result element is zeroable.
	bool IsV1Zeroable = true;
	for (int i = 0, Size = Mask.size(); i < Size; ++i)
	if (i != V2Index && !Zeroable[i]) {
	IsV1Zeroable = false;
	break;
	}

	// Check for a single input from a SCALAR_TO_VECTOR node.
	// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
	// all the smarts here sunk into that routine. However, the current
	// lowering of BUILD_VECTOR makes that nearly impossible until the old
	// vector shuffle lowering is dead.
	SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
	DAG);
	if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
	// We need to zext the scalar if it is smaller than an i32.
	V2S = DAG.getBitcast(EltVT, V2S);
	if (EltVT == MVT::i8 \|\| EltVT == MVT::i16) {
	// Using zext to expand a narrow element won't work for non-zero
	// insertions.
	if (!IsV1Zeroable)
	return SDValue();

	// Zero-extend directly to i32.
	ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
	V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
	}
	// Rebuild V2 as a vector holding just the scalar in element 0.
	V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
	} else if (Mask[V2Index] != (int)Mask.size() \|\| EltVT == MVT::i8 \|\|
	EltVT == MVT::i16) {
	// Either not inserting from the low element of the input or the input
	// element size is too small to use VZEXT_MOVL to clear the high bits.
	return SDValue();
	}

	if (!IsV1Zeroable) {
	// If V1 can't be treated as a zero vector we have fewer options to lower
	// this. We can't support integer vectors or non-zero targets cheaply, and
	// the V1 elements can't be permuted in any way.
	assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
	if (!VT.isFloatingPoint() \|\| V2Index != 0)
	return SDValue();
	// Apart from the inserted element, the mask must leave V1 in place.
	SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
	V1Mask[V2Index] = -1;
	if (!isNoopShuffleMask(V1Mask))
	return SDValue();
	if (!VT.is128BitVector())
	return SDValue();

	// Otherwise, use MOVSD or MOVSS.
	assert((EltVT == MVT::f32 \|\| EltVT == MVT::f64) &&
	"Only two types of floating point element types to handle!");
	return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
	ExtVT, V1, V2);
	}

	// This lowering only works for the low element with floating point vectors.
	if (VT.isFloatingPoint() && V2Index != 0)
	return SDValue();

	// Keep only the low element of V2 and zero everything above it.
	V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
	if (ExtVT != VT)
	V2 = DAG.getBitcast(VT, V2);

	if (V2Index != 0) {
	// If we have 4 or fewer lanes we can cheaply shuffle the element into
	// the desired position. Otherwise it is more efficient to do a vector
	// shift left. We know that we can do a vector shift left because all
	// the inputs are zero.
	if (VT.isFloatingPoint() \|\| VT.getVectorNumElements() <= 4) {
	// Every result element reads element 1 of the zero-extended V2 except
	// V2Index, which reads element 0 (the inserted value).
	SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
	V2Shuffle[V2Index] = 0;
	V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
	} else {
	// Byte-shift the element up by V2Index elements.
	V2 = DAG.getBitcast(MVT::v16i8, V2);
	V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
	DAG.getTargetConstant(
	V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
	V2 = DAG.getBitcast(VT, V2);
	}
	}
	return V2;
	}

	/// Lower a broadcast whose source element must be narrowed before it can be
	/// splatted.
	///
	/// \p V0 is expected to be a scalar_to_vector or build_vector node whose
	/// integer elements are wider than the elements of \p VT. \p BroadcastIdx is
	/// counted in \p VT sized elements. The wide scalar containing that element is
	/// shifted right when the wanted bits are not the least significant ones,
	/// truncated to the element type of \p VT and then broadcast.
	///
	/// AVX2 is required (asserted). An empty SDValue is returned whenever \p V0
	/// does not have the expected shape.
	static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
	                                            int BroadcastIdx,
	                                            const X86Subtarget &Subtarget,
	                                            SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX2() &&
	         "We can only lower integer broadcasts with AVX2!");
	  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");

	  MVT SrcVT = V0.getSimpleValueType();
	  assert(SrcVT.isVector() && "Unexpected non-vector vector-sized value!");

	  // Only integer sources can be truncated.
	  MVT SrcEltVT = SrcVT.getVectorElementType();
	  if (!SrcEltVT.isInteger())
	    return SDValue();

	  MVT DstEltVT = VT.getVectorElementType();
	  const unsigned DstBits = DstEltVT.getSizeInBits();
	  const unsigned SrcBits = SrcEltVT.getSizeInBits();

	  // Without a strictly wider source element there is nothing to truncate.
	  if (SrcBits <= DstBits)
	    return SDValue();

	  assert(((SrcBits % DstBits) == 0) &&
	         "Scalar type sizes must all be powers of 2 on x86!");

	  // Translate the narrow element index into (wide element, sub-element).
	  const unsigned Ratio = SrcBits / DstBits;
	  const unsigned SrcIdx = BroadcastIdx / Ratio;
	  const int SubIdx = BroadcastIdx % Ratio;

	  // A build_vector may supply any element; a scalar_to_vector only defines
	  // element zero. Every other producer is rejected.
	  switch (V0.getOpcode()) {
	  case ISD::BUILD_VECTOR:
	    break;
	  case ISD::SCALAR_TO_VECTOR:
	    if (SrcIdx != 0)
	      return SDValue();
	    break;
	  default:
	    return SDValue();
	  }

	  SDValue Elt = V0.getOperand(SrcIdx);

	  // When the wanted bits are not the least significant ones, shift them down
	  // so that the truncate picks them up. Hopefully the trunc/srl/load can be
	  // folded into the broadcast. Even if it can't (and
	  // !isShuffleFoldableLoad(Scalar)), prefer vpbroadcast+vmovd+shr to
	  // vpshufb(m)+vmovd.
	  if (SubIdx != 0) {
	    SDValue ShiftAmt = DAG.getConstant(SubIdx * DstBits, DL, MVT::i8);
	    Elt = DAG.getNode(ISD::SRL, DL, Elt.getValueType(), Elt, ShiftAmt);
	  }

	  SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, DL, DstEltVT, Elt);
	  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Narrowed);
	}

	/// Return true when a four-element mask can be lowered with one SHUFPS.
	///
	/// A single SHUFPS is enough when the low half of the mask and the high half of
	/// the mask each draw from only one input. Undef (-1) entries never conflict.
	/// Callers use this to skip more specialized lowerings when the plain SHUFPS
	/// lowering is already efficient.
	static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
	  // Only the 128-bit (four element) form is handled.
	  assert(Mask.size() == 4 && "Unsupported mask size!");
	  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
	  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
	  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
	  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

	  // A half mixes inputs when both of its entries are defined and exactly one
	  // of them refers to the second input (index 4 and above).
	  auto MixesInputs = [](int A, int B) {
	    if (A < 0 || B < 0)
	      return false;
	    return (A < 4) != (B < 4);
	  };

	  return !MixesInputs(Mask[0], Mask[1]) && !MixesInputs(Mask[2], Mask[3]);
	}

	/// Fold a shuffle of the two 128-bit halves of one 256-bit vector into a single
	/// 256-bit shuffle of that vector followed by an extract of its low half. This
	/// lets the wide shuffle match an AVX2 vperm* instruction instead of going
	/// through a multi-shuffle lowering.
	///
	/// \p N0 and \p N1 are the (128-bit, 32- or 64-bit element) shuffle inputs and
	/// \p Mask is the shuffle mask over them. Returns an empty SDValue when the
	/// pattern does not match or a narrow shuffle is the better choice.
	static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
	                                             SDValue N1, ArrayRef<int> Mask,
	                                             SelectionDAG &DAG) {
	  MVT VT = N0.getSimpleValueType();
	  assert((VT.is128BitVector() &&
	          (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
	         "VPERM* family of shuffles requires 32-bit or 64-bit elements");

	  // Both inputs have to be single-use subvector extracts from one common
	  // source vector.
	  if (!N0.hasOneUse() || !N1.hasOneUse())
	    return SDValue();
	  if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
	      N1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
	    return SDValue();
	  SDValue WideVec = N0.getOperand(0);
	  if (WideVec != N1.getOperand(0))
	    return SDValue();

	  // Only 256-bit sources are handled.
	  MVT WideVT = WideVec.getSimpleValueType();
	  if (!WideVT.is256BitVector())
	    return SDValue();

	  // The two extracts must cover the low and the high half, in either order.
	  unsigned NumElts = VT.getVectorNumElements();
	  const APInt &Idx0 = N0.getConstantOperandAPInt(1);
	  const APInt &Idx1 = N1.getConstantOperandAPInt(1);
	  const bool LowThenHigh = Idx0 == 0 && Idx1 == NumElts;
	  const bool HighThenLow = Idx1 == 0 && Idx0 == NumElts;
	  if (!LowThenHigh && !HighThenLow)
	    return SDValue();

	  // If N1 is the extract of the low half, commute the mask so that it indexes
	  // the wide source directly.
	  SmallVector<int, 4> WideMask(Mask.begin(), Mask.end());
	  if (HighThenLow)
	    ShuffleVectorSDNode::commuteMask(WideMask);

	  // Final bailout: if the mask is simple, we are better off using an extract
	  // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
	  // because that avoids a constant load from memory.
	  if (NumElts == 4) {
	    if (isSingleSHUFPSMask(WideMask) || is128BitUnpackShuffleMask(WideMask))
	      return SDValue();
	  }

	  // The upper half of the wide result is never read, so leave it undef.
	  WideMask.append(NumElts, -1);

	  // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
	  SDValue WideUndef = DAG.getUNDEF(WideVT);
	  SDValue WideShuf =
	      DAG.getVectorShuffle(WideVT, DL, WideVec, WideUndef, WideMask);

	  // Taking the low half is free: ymm -> xmm.
	  SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideShuf, ZeroIdx);
	}

	/// Try to lower broadcast of a single element.
	///
	/// For convenience, this code also bundles all of the subtarget feature set
	/// filtering. While a little annoying to re-dispatch on type here, there isn't
	/// a convenient way to factor it out.
	///
	/// \param DL        Debug location attached to every node created here.
	/// \param VT        Result type of the shuffle being lowered.
	/// \param V1        Shuffle input the broadcast element is taken from.
	/// \param V2        Second shuffle input; not referenced by this function.
	/// \param Mask      Shuffle mask; it must be a splat (per getSplatIndex) of an
	///                  element of \p V1, otherwise nothing is lowered.
	/// \param Subtarget Queried for SSE3 / AVX / AVX2 availability.
	/// \param DAG       DAG in which the replacement nodes are built.
	/// \returns the broadcast value (built from MOVDDUP, VBROADCAST or
	/// VBROADCAST_LOAD, possibly wrapped in a bitcast), or an empty SDValue when no
	/// broadcast lowering applies.
	static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
	SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	// Feature filter: v2f64 needs SSE3, other floating point types need AVX and
	// integer types need AVX2.
	if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) \|\|
	(Subtarget.hasAVX() && VT.isFloatingPoint()) \|\|
	(Subtarget.hasAVX2() && VT.isInteger())))
	return SDValue();

	// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
	// we can only broadcast from a register with AVX2.
	unsigned NumEltBits = VT.getScalarSizeInBits();
	unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
	? X86ISD::MOVDDUP
	: X86ISD::VBROADCAST;
	bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) \|\| Subtarget.hasAVX2();

	// Check that the mask is a broadcast.
	int BroadcastIdx = getSplatIndex(Mask);
	if (BroadcastIdx < 0)
	return SDValue();
	assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
	"a sorted mask where the broadcast "
	"comes from V1.");

	// Go up the chain of (vector) values to find a scalar load that we can
	// combine with the broadcast.
	// TODO: Combine this logic with findEltLoadSrc() used by
	// EltsFromConsecutiveLoads().
	// BitOffset tracks the position of the broadcast element, in bits, relative
	// to the start of the value V currently being inspected.
	int BitOffset = BroadcastIdx * NumEltBits;
	SDValue V = V1;
	for (;;) {
	switch (V.getOpcode()) {
	case ISD::BITCAST: {
	// A bitcast leaves the bit offset untouched; just look through it.
	V = V.getOperand(0);
	continue;
	}
	case ISD::CONCAT_VECTORS: {
	// Step into the operand that contains the offset and make the offset
	// relative to the start of that operand.
	int OpBitWidth = V.getOperand(0).getValueSizeInBits();
	int OpIdx = BitOffset / OpBitWidth;
	V = V.getOperand(OpIdx);
	BitOffset %= OpBitWidth;
	continue;
	}
	case ISD::EXTRACT_SUBVECTOR: {
	// The extraction index adds to the existing offset.
	unsigned EltBitWidth = V.getScalarValueSizeInBits();
	unsigned Idx = V.getConstantOperandVal(1);
	unsigned BeginOffset = Idx * EltBitWidth;
	BitOffset += BeginOffset;
	V = V.getOperand(0);
	continue;
	}
	case ISD::INSERT_SUBVECTOR: {
	// Follow the inserted subvector when the offset lies inside the inserted
	// bit range [BeginOffset, EndOffset); otherwise follow the base vector.
	SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
	int EltBitWidth = VOuter.getScalarValueSizeInBits();
	int Idx = (int)V.getConstantOperandVal(2);
	int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
	int BeginOffset = Idx * EltBitWidth;
	int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
	if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
	BitOffset -= BeginOffset;
	V = VInner;
	} else {
	V = VOuter;
	}
	continue;
	}
	}
	// Any other opcode ends the walk.
	break;
	}
	assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
	// Re-express the offset as an index in units of VT's element size.
	BroadcastIdx = BitOffset / NumEltBits;

	// Do we need to bitcast the source to retrieve the original broadcast index?
	bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;

	// Check if this is a broadcast of a scalar. We special case lowering
	// for scalars so that we can more effectively fold with loads.
	// If the original value has a larger element type than the shuffle, the
	// broadcast element is in essence truncated. Make that explicit to ease
	// folding.
	if (BitCastSrc && VT.isInteger())
	if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
	DL, VT, V, BroadcastIdx, Subtarget, DAG))
	return TruncBroadcast;

	// Also check the simpler case, where we can directly reuse the scalar.
	if (!BitCastSrc &&
	((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) \|\|
	(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
	V = V.getOperand(BroadcastIdx);

	// If we can't broadcast from a register, check that the input is a load.
	if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
	return SDValue();
	} else if (ISD::isNormalLoad(V.getNode()) &&
	cast<LoadSDNode>(V)->isSimple()) {
	// We do not check for one-use of the vector load because a broadcast load
	// is expected to be a win for code size, register pressure, and possibly
	// uops even if the original vector load is not eliminated.

	// Reduce the vector load and shuffle to a broadcasted scalar load.
	LoadSDNode *Ld = cast<LoadSDNode>(V);
	SDValue BaseAddr = Ld->getOperand(1);
	MVT SVT = VT.getScalarType();
	// Byte offset of the broadcast element from the load's base address.
	unsigned Offset = BroadcastIdx * SVT.getStoreSize();
	assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
	SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);

	// Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
	// than MOVDDUP.
	// FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
	if (Opcode == X86ISD::VBROADCAST) {
	SDVTList Tys = DAG.getVTList(VT, MVT::Other);
	SDValue Ops[] = {Ld->getChain(), NewAddr};
	V = DAG.getMemIntrinsicNode(
	X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
	DAG.getMachineFunction().getMachineMemOperand(
	Ld->getMemOperand(), Offset, SVT.getStoreSize()));
	DAG.makeEquivalentMemoryOrdering(Ld, V);
	return DAG.getBitcast(VT, V);
	}
	// MOVDDUP path: emit a scalar f64 load of just the broadcast element; it is
	// wrapped in SCALAR_TO_VECTOR further down.
	assert(SVT == MVT::f64 && "Unexpected VT!");
	V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
	DAG.getMachineFunction().getMachineMemOperand(
	Ld->getMemOperand(), Offset, SVT.getStoreSize()));
	DAG.makeEquivalentMemoryOrdering(Ld, V);
	} else if (!BroadcastFromReg) {
	// We can't broadcast from a vector register.
	return SDValue();
	} else if (BitOffset != 0) {
	// We can only broadcast from the zero-element of a vector register,
	// but it can be advantageous to broadcast from the zero-element of a
	// subvector.
	if (!VT.is256BitVector() && !VT.is512BitVector())
	return SDValue();

	// VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
	if (VT == MVT::v4f64 \|\| VT == MVT::v4i64)
	return SDValue();

	// Only broadcast the zero-element of a 128-bit subvector.
	if ((BitOffset % 128) != 0)
	return SDValue();

	assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
	"Unexpected bit-offset");
	assert((V.getValueSizeInBits() == 256 \|\| V.getValueSizeInBits() == 512) &&
	"Unexpected vector size");
	// Convert the bit offset into an element index of V for the extract.
	unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
	V = extract128BitVector(V, ExtractIdx, DAG, DL);
	}

	// MOVDDUP operates on a v2f64 value, so wrap a scalar source in a vector.
	if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
	V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
	DAG.getBitcast(MVT::f64, V));

	// If this is a scalar, do the broadcast on this type and bitcast.
	if (!V.getValueType().isVector()) {
	assert(V.getScalarValueSizeInBits() == NumEltBits &&
	"Unexpected scalar size");
	MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
	VT.getVectorNumElements());
	return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
	}

	// We only support broadcasting from 128-bit vectors to minimize the
	// number of patterns we need to deal with in isel. So extract down to
	// 128-bits, removing as many bitcasts as possible.
	if (V.getValueSizeInBits() > 128)
	V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);

	// Otherwise cast V to a vector with the same element type as VT, but
	// possibly narrower than VT. Then perform the broadcast.
	unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
	MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
	return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
	}

	// Decide whether a four-element shuffle can be done with one INSERTPS.
	//
	// INSERTPS is only used when the V1 elements are already in the correct
	// locations, because otherwise two SHUFPS instructions always work and are
	// much smaller to encode than a SHUFPS plus an INSERTPS. INSERTPS is also
	// usable when a single V1 element is out of place and all V2 elements are
	// zeroable.
	//
	// On success V1, V2 and InsertPSMask are rewritten to the operands and the
	// immediate of the INSERTPS and true is returned. On failure they are left
	// untouched and false is returned. The operands are tried in the given order
	// first and then commuted.
	static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
	                                   unsigned &InsertPSMask,
	                                   const APInt &Zeroable,
	                                   ArrayRef<int> Mask, SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
	  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	  // Try to match an INSERTPS that inserts one element of VA or VB into VA (or
	  // into undef). The outputs are written only when the match succeeds.
	  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
	                             ArrayRef<int> CandidateMask) {
	    unsigned ZeroBits = 0;     // Result lanes that INSERTPS should zero.
	    bool KeepsVA = false;      // Some lane takes its own element of VA.
	    int InsertDst = -1;        // The one lane that needs an insertion.
	    bool InsertFromVA = false; // The inserted element comes from VA.

	    for (int Lane = 0; Lane != 4; ++Lane) {
	      // Zeroable lanes (this includes undefs) go into the zero mask.
	      if (Zeroable[Lane]) {
	        ZeroBits |= 1 << Lane;
	        continue;
	      }

	      // A VA element that is already in place needs no work.
	      const int Src = CandidateMask[Lane];
	      if (Src == Lane) {
	        KeepsVA = true;
	        continue;
	      }

	      // Only one non-zeroable element can be inserted.
	      if (InsertDst >= 0)
	        return false;

	      // Remember the destination and whether the element is an out-of-place
	      // VA element or a VB element.
	      InsertDst = Lane;
	      InsertFromVA = Src < 4;
	    }

	    // Without a (non-zeroable) element to insert there is nothing to match.
	    if (InsertDst < 0)
	      return false;

	    // The source index is relative to the inserted vector, not to the
	    // concatenation of both inputs. An out-of-place VA element is inserted
	    // from VA itself and the original VB is then not used at all.
	    const unsigned InsertSrc = InsertFromVA ? CandidateMask[InsertDst]
	                                            : CandidateMask[InsertDst] - 4;
	    SDValue Inserted = InsertFromVA ? VA : VB;

	    // If no VA element stays in place the result only consists of zeros and
	    // the inserted element, so drop the dependency on VA.
	    V1 = KeepsVA ? VA : DAG.getUNDEF(MVT::v4f32);
	    V2 = Inserted;

	    // Immediate layout: source index, destination index, zero mask.
	    InsertPSMask = InsertSrc << 6 | InsertDst << 4 | ZeroBits;
	    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
	    return true;
	  };

	  if (matchAsInsertPS(V1, V2, Mask))
	    return true;

	  // Commute and try again.
	  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
	  ShuffleVectorSDNode::commuteMask(CommutedMask);
	  return matchAsInsertPS(V2, V1, CommutedMask);
	}

	/// Try to lower a v4f32 shuffle to a single X86ISD::INSERTPS node.
	///
	/// The operands and the immediate are chosen by matchShuffleAsInsertPS, which
	/// may replace \p V1 and \p V2. Returns an empty SDValue if the mask does not
	/// match the INSERTPS pattern.
	static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
	                                      ArrayRef<int> Mask, const APInt &Zeroable,
	                                      SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

	  // Let the matcher pick the operands and compute the immediate.
	  unsigned Imm = 0;
	  const bool Matched =
	      matchShuffleAsInsertPS(V1, V2, Imm, Zeroable, Mask, DAG);
	  if (Matched) {
	    // Insert the V2 element into the desired position.
	    SDValue ImmOp = DAG.getTargetConstant(Imm, DL, MVT::i8);
	    return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, ImmOp);
	  }

	  return SDValue();
	}

	/// Try to lower a shuffle as a permute of the inputs followed by an
	/// UNPCK instruction.
	///
	/// This specifically targets cases where we end up with alternating between
	/// the two inputs, and so can permute them into something that feeds a single
	/// UNPCK instruction. Note that this routine only targets integer vectors
	/// because for floating point vectors we have a generalized SHUFPS lowering
	/// strategy that handles everything that doesn't exactly match an unpack,
	/// making this clever lowering unnecessary.
	///
	/// \param DL        Debug location for the created nodes.
	/// \param VT        128-bit integer vector type of the shuffle (asserted).
	/// \param V1        First shuffle input.
	/// \param V2        Second shuffle input; must not be undef (asserted).
	/// \param Mask      Shuffle mask with at least two elements; entries below
	///                  Mask.size() select from \p V1, the others from \p V2.
	/// \param Subtarget Not referenced by this function.
	/// \param DAG       DAG in which the nodes are created.
	/// \returns the lowered value, or an empty SDValue if neither the
	/// permute-then-unpack nor the unpack-then-permute form is used.
	static SDValue lowerShuffleAsPermuteAndUnpack(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	assert(!VT.isFloatingPoint() &&
	"This routine only supports integer vectors.");
	assert(VT.is128BitVector() &&
	"This routine only works on 128-bit vectors.");
	assert(!V2.isUndef() &&
	"This routine should only be used when blending two inputs.");
	assert(Mask.size() >= 2 && "Single element masks are invalid.");

	int Size = Mask.size();

	// Count how many mask entries read the low half and how many read the high
	// half of their input (the input itself is ignored via M % Size). A negative
	// M yields a negative remainder, so undef entries are counted in neither.
	int NumLoInputs =
	count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
	int NumHiInputs =
	count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

	// Unpack whichever half supplies more elements; ties go to the low half.
	bool UnpackLo = NumLoInputs >= NumHiInputs;

	// Try to express the shuffle as one permute per input followed by an unpack
	// with elements of ScalarSize bits; Scale is the number of mask elements per
	// unpack element. Returns an empty SDValue when this width does not fit.
	// V1 and V2 are only overwritten on the path that returns a value.
	auto TryUnpack = [&](int ScalarSize, int Scale) {
	SmallVector<int, 16> V1Mask((unsigned)Size, -1);
	SmallVector<int, 16> V2Mask((unsigned)Size, -1);

	for (int i = 0; i < Size; ++i) {
	if (Mask[i] < 0)
	continue;

	// Each element of the unpack contains Scale elements from this mask.
	int UnpackIdx = i / Scale;

	// We only handle the case where V1 feeds the first slots of the unpack.
	// We rely on canonicalization to ensure this is the case.
	if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
	return SDValue();

	// Setup the mask for this input. The indexing is tricky as we have to
	// handle the unpack stride.
	SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
	VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
	Mask[i] % Size;
	}

	// If we will have to shuffle both inputs to use the unpack, check whether
	// we can just unpack first and shuffle the result. If so, skip this unpack.
	if ((NumLoInputs == 0 \|\| NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
	!isNoopShuffleMask(V2Mask))
	return SDValue();

	// Shuffle the inputs into place.
	V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
	V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

	// Cast the inputs to the type we will use to unpack them.
	MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
	V1 = DAG.getBitcast(UnpackVT, V1);
	V2 = DAG.getBitcast(UnpackVT, V2);

	// Unpack the inputs and cast the result back to the desired type.
	return DAG.getBitcast(
	VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
	UnpackVT, V1, V2));
	};

	// We try each unpack from the largest to the smallest to try and find one
	// that fits this mask.
	int OrigScalarSize = VT.getScalarSizeInBits();
	for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
	if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
	return Unpack;

	// If we're shuffling with a zero vector then we're better off not doing
	// VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
	if (ISD::isBuildVectorAllZeros(V1.getNode()) \|\|
	ISD::isBuildVectorAllZeros(V2.getNode()))
	return SDValue();

	// If none of the unpack-rooted lowerings worked (or were profitable) try an
	// initial unpack.
	if (NumLoInputs == 0 \|\| NumHiInputs == 0) {
	assert((NumLoInputs > 0 \|\| NumHiInputs > 0) &&
	"We have to have some inputs!");
	// First element index of the half that all inputs come from.
	int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

	// FIXME: We could consider the total complexity of the permute of each
	// possible unpacking. Or at the least we should consider how many
	// half-crossings are created.
	// FIXME: We could consider commuting the unpacks.

	SmallVector<int, 32> PermMask((unsigned)Size, -1);
	for (int i = 0; i < Size; ++i) {
	if (Mask[i] < 0)
	continue;

	assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

	// After the unpack, element k of the used half of V1 sits at index 2 * k
	// and the matching V2 element at index 2 * k + 1.
	PermMask[i] =
	2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
	}
	return DAG.getVectorShuffle(
	VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
	DL, VT, V1, V2),
	DAG.getUNDEF(VT), PermMask);
	}

	return SDValue();
	}

	/// Lower a shuffle of two v2f64 vectors.
	///
	/// This is the basis function for the 2-lane 64-bit shuffles, because floating
	/// point shuffles are fully supported while integer shuffles are not. These
	/// instructions incur a domain crossing penalty on some chips, so integer
	/// vectors should avoid being lowered through here where possible.
	///
	/// The strategies are tried in a fixed order and the first one that succeeds
	/// wins; the final SHUFPD fallback always produces a value.
	static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  const MVT VT = MVT::v2f64;
	  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
	  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

	  if (V2.isUndef()) {
	    // A splat of one element is best done as a broadcast.
	    if (SDValue Splat =
	            lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
	      return Splat;

	    // Plain single-input shuffle: one immediate bit per result lane selects
	    // the high element of the source.
	    const unsigned UnaryImm = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
	    SDValue UnaryImmOp = DAG.getTargetConstant(UnaryImm, DL, MVT::i8);

	    // With AVX the single-operand permute is used, which allows folding a
	    // load into the shuffle.
	    if (Subtarget.hasAVX())
	      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1, UnaryImmOp);

	    // Otherwise simulate the unary shuffle by feeding the input to both
	    // operands of SHUFPD. A lane whose mask entry is undef gets an undef
	    // operand instead.
	    auto OperandFor = [&](int M) {
	      return M == SM_SentinelUndef ? DAG.getUNDEF(VT) : V1;
	    };
	    return DAG.getNode(X86ISD::SHUFP, DL, VT, OperandFor(Mask[0]),
	                       OperandFor(Mask[1]), UnaryImmOp);
	  }

	  assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
	  assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
	  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
	  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

	  // Shuffles of the two halves of one wide vector can become a wide permute.
	  if (Subtarget.hasAVX2()) {
	    if (SDValue Wide = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
	      return Wide;
	  }

	  // When loading a scalar and then shuffling it into a vector the insertion
	  // can often be done cheaply.
	  if (SDValue Inserted = lowerShuffleAsElementInsertion(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Inserted;

	  // For v2 masks the inverse insertion is easy to form, and the mask cannot
	  // be reliably sorted one way or the other, so try it as well.
	  auto SwapInput = [](int M) { return M < 0 ? -1 : (M ^ 2); };
	  int InverseMask[2] = {SwapInput(Mask[0]), SwapInput(Mask[1])};
	  if (SDValue Inserted = lowerShuffleAsElementInsertion(
	          DL, VT, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
	    return Inserted;

	  // Two common blend patterns have special instruction forms; use them if a
	  // zero-blend above didn't work.
	  const bool KeepsHighOfV2 = isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
	                             isShuffleEquivalent(V1, V2, Mask, {1, 3});
	  if (KeepsHighOfV2) {
	    if (SDValue LowScalar = getScalarValueForVectorElement(V1, Mask[0], DAG)) {
	      // Either a special instruction loads over the low double or just the
	      // low double is moved.
	      SDValue LowVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, LowScalar);
	      return DAG.getNode(X86ISD::MOVSD, DL, VT, V2, LowVec);
	    }
	  }

	  if (Subtarget.hasSSE41()) {
	    if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG))
	      return Blend;
	  }

	  // Use dedicated unpack instructions for masks that match their pattern.
	  if (SDValue Unpack = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpack;

	  // Fallback: a two-input SHUFPD.
	  const unsigned BinaryImm = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
	  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
	                     DAG.getTargetConstant(BinaryImm, DL, MVT::i8));
	}

	/// Lower a shuffle of two v2i64 vectors.
	///
	/// Shuffle operations of the integer unit are preferred in order to minimize
	/// domain crossing penalties. Blends that none of them can handle fall back to
	/// the floating point shuffle with the appropriate bitcasts.
	///
	/// The strategies are tried in a fixed order and the first one that succeeds
	/// wins.
	static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  const MVT VT = MVT::v2i64;
	  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
	  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

	  if (V2.isUndef()) {
	    // A splat of one element is best done as a broadcast.
	    if (SDValue Splat =
	            lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
	      return Splat;

	    // Straight shuffle of a single input vector. For everything from SSE2
	    // onward this has a single fast instruction with no scary immediates.
	    // That instruction is a v4i32 shuffle, so every 64-bit mask entry is
	    // widened to a pair of 32-bit entries (undef is treated as element 0).
	    const int LoPair = std::max(Mask[0], 0) * 2;
	    const int HiPair = std::max(Mask[1], 0) * 2;
	    int WidenedMask[4] = {LoPair, LoPair + 1, HiPair, HiPair + 1};

	    SDValue AsV4I32 = DAG.getBitcast(MVT::v4i32, V1);
	    SDValue Permuted =
	        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, AsV4I32,
	                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG));
	    return DAG.getBitcast(VT, Permuted);
	  }

	  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
	  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
	  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
	  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

	  // Shuffles of the two halves of one wide vector can become a wide permute.
	  if (Subtarget.hasAVX2()) {
	    if (SDValue Wide = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
	      return Wide;
	  }

	  // Try to use shift instructions.
	  if (SDValue Shifted = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG))
	    return Shifted;

	  // When loading a scalar and then shuffling it into a vector the insertion
	  // can often be done cheaply.
	  if (SDValue Inserted = lowerShuffleAsElementInsertion(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Inserted;

	  // For v2 masks the inverse insertion is easy to form, and the mask cannot
	  // be reliably sorted one way or the other, so try it as well.
	  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
	  if (SDValue Inserted = lowerShuffleAsElementInsertion(
	          DL, VT, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
	    return Inserted;

	  // There are different paths for blend lowering, but they all must use the
	  // exact same predicate.
	  const bool IsBlendSupported = Subtarget.hasSSE41();
	  if (IsBlendSupported) {
	    if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG))
	      return Blend;
	  }

	  // Use dedicated unpack instructions for masks that match their pattern.
	  if (SDValue Unpack = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpack;

	  // Try to use byte rotation instructions.
	  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
	  if (Subtarget.hasSSSE3()) {
	    if (Subtarget.hasVLX()) {
	      if (SDValue Aligned =
	              lowerShuffleAsVALIGN(DL, VT, V1, V2, Mask, Subtarget, DAG))
	        return Aligned;
	    }

	    if (SDValue Rotated =
	            lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
	      return Rotated;
	  }

	  // If there is direct support for blends, lower by decomposing into a
	  // permute. That will be faster than the domain cross.
	  if (IsBlendSupported)
	    return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
	                                                Subtarget, DAG);

	  // This is implemented with SHUFPD, which is pretty lame because it will
	  // likely incur 2 cycles of stall for integer vectors on Nehalem and older
	  // chips. However, all the alternatives are still more cycles and newer
	  // chips don't have this problem. It would be really nice if x86 had better
	  // shuffles here.
	  V1 = DAG.getBitcast(MVT::v2f64, V1);
	  V2 = DAG.getBitcast(MVT::v2f64, V2);
	  SDValue FPShuffle = DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask);
	  return DAG.getBitcast(VT, FPShuffle);
	}

	/// Lower a vector shuffle using the SHUFPS instruction.
	///
	/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
	/// It makes no assumptions about whether this is the best lowering, it simply
	/// uses it.
	///
	/// \param DL   Debug location for the created nodes.
	/// \param VT   Type used for every X86ISD::SHUFP node that is built.
	/// \param Mask Shuffle mask; entries of 4 and above select from \p V2, smaller
	///             non-negative entries select from \p V1.
	/// \param V1   First shuffle input.
	/// \param V2   Second shuffle input.
	/// \param DAG  DAG in which the nodes are created.
	/// \returns the final X86ISD::SHUFP node. At most one extra SHUFP is emitted
	/// beforehand to blend the inputs. Masks with zero or more than two \p V2
	/// elements skip all fix-ups and are emitted with the mask unchanged.
	static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
	ArrayRef<int> Mask, SDValue V1,
	SDValue V2, SelectionDAG &DAG) {
	// LowV / HighV become the first / second operand of the final SHUFP and
	// NewMask becomes its mask.
	SDValue LowV = V1, HighV = V2;
	SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
	int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

	if (NumV2Elements == 1) {
	// Position of the single V2 element within the mask.
	int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

	// Compute the index adjacent to V2Index and in the same half by toggling
	// the low bit.
	int V2AdjIndex = V2Index ^ 1;

	if (Mask[V2AdjIndex] < 0) {
	// Handles all the cases where we have a single V2 element and an undef.
	// This will only ever happen in the high lanes because we commute the
	// vector otherwise.
	if (V2Index < 2)
	std::swap(LowV, HighV);
	// Make the entry relative to V2.
	NewMask[V2Index] -= 4;
	} else {
	// Handle the case where the V2 element ends up adjacent to a V1 element.
	// To make this work, blend them together as the first step.
	int V1Index = V2AdjIndex;
	// Entries 1 and 3 of the blend result are never read by NewMask below.
	int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
	V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
	getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

	// Now proceed to reconstruct the final blend as we have the necessary
	// high or low half formed.
	if (V2Index < 2) {
	LowV = V2;
	HighV = V1;
	} else {
	HighV = V2;
	}
	NewMask[V1Index] = 2; // We put the V1 element in V2[2].
	NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
	}
	} else if (NumV2Elements == 2) {
	if (Mask[0] < 4 && Mask[1] < 4) {
	// Handle the easy case where we have V1 in the low lanes and V2 in the
	// high lanes.
	NewMask[2] -= 4;
	NewMask[3] -= 4;
	} else if (Mask[2] < 4 && Mask[3] < 4) {
	// We also handle the reversed case because this utility may get called
	// when we detect a SHUFPS pattern but can't easily commute the shuffle to
	// arrange things in the right direction.
	NewMask[0] -= 4;
	NewMask[1] -= 4;
	HighV = V1;
	LowV = V2;
	} else {
	// We have a mixture of V1 and V2 in both low and high lanes. Rather than
	// trying to place elements directly, just blend them and set up the final
	// shuffle to place them.

	// The first two blend mask elements are for V1, the second two are for
	// V2.
	int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
	Mask[2] < 4 ? Mask[2] : Mask[3],
	(Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
	(Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
	V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
	getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

	// Now we do a normal shuffle of V1 by giving V1 as both operands to
	// a blend.
	LowV = HighV = V1;
	// The blended vector holds the V1 elements in lanes 0 and 1 and the V2
	// elements in lanes 2 and 3; pick them back in their original order.
	NewMask[0] = Mask[0] < 4 ? 0 : 2;
	NewMask[1] = Mask[0] < 4 ? 2 : 0;
	NewMask[2] = Mask[2] < 4 ? 1 : 3;
	NewMask[3] = Mask[2] < 4 ? 3 : 1;
	}
	}
	return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
	getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
	}

	/// Lower a 4-element shuffle of 32-bit floating point lanes (v4f32).
	///
	/// Only floating-point-unit instructions are emitted so that no domain
	/// crossing penalty is introduced; those instructions are enough to express
	/// every v4f32 shuffle.
	///
	/// \param Mask      Four shuffle indices; values >= 4 select from \p V2.
	/// \param Zeroable  Forwarded to the blend / insertion helpers.
	/// \returns the lowered node; the final SHUFPS strategy always succeeds.
	static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	  const MVT VT = MVT::v4f32;
	  const int V2Refs = count_if(Mask, [](int Elt) { return Elt >= 4; });

	  // ---- Single-input shuffles -------------------------------------------
	  if (V2Refs == 0) {
	    // Small helpers for the one- and two-operand nodes built from V1 only.
	    auto UnaryNode = [&](unsigned Opcode) {
	      return DAG.getNode(Opcode, DL, VT, V1);
	    };
	    auto SelfBinaryNode = [&](unsigned Opcode) {
	      return DAG.getNode(Opcode, DL, VT, V1, V1);
	    };

	    // A splat of one element is handled by the broadcast helper.
	    if (SDValue Splat =
	            lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	      return Splat;
	    }

	    // SSE3 provides dedicated even/odd lane duplication.
	    if (Subtarget.hasSSE3()) {
	      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
	        return UnaryNode(X86ISD::MOVSLDUP);
	      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
	        return UnaryNode(X86ISD::MOVSHDUP);
	    }

	    // With AVX, VPERMILPS is preferred since a load can fold into it.
	    if (Subtarget.hasAVX()) {
	      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
	                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
	    }

	    // MOVLHPS/MOVHLPS can stand in for unary shuffles, but only on SSE1:
	    // with SSE2 these masks are widened to v2f64 and never arrive here.
	    if (!Subtarget.hasSSE2()) {
	      if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
	        return SelfBinaryNode(X86ISD::MOVLHPS);
	      if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
	        return SelfBinaryNode(X86ISD::MOVHLPS);
	    }

	    // Generic fallback: SHUFPS with the same vector as both operands.
	    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V1,
	                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
	  }

	  // ---- Two-input shuffles ----------------------------------------------
	  if (Subtarget.hasAVX2()) {
	    if (SDValue Perm = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
	      return Perm;
	  }

	  // Single-element blends have special lowerings. More complex ones are
	  // handled further down if this and BLENDPS both fail, so only take the
	  // fast path here when the lone V2 element lands in result lane 0.
	  const bool V2FeedsLaneZeroOnly = V2Refs == 1 && Mask[0] >= 4;
	  if (V2FeedsLaneZeroOnly) {
	    if (SDValue Inserted = lowerShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	      return Inserted;
	    }
	  }

	  if (Subtarget.hasSSE41()) {
	    if (SDValue Blended =
	            lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	      return Blended;
	    }

	    // INSERTPS, when it can finish the shuffle efficiently.
	    if (SDValue InsertPS =
	            lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) {
	      return InsertPS;
	    }

	    // Blend followed by a permute, unless one SHUFPS already suffices.
	    if (!isSingleSHUFPSMask(Mask)) {
	      if (SDValue BlendThenPermute =
	              lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) {
	        return BlendThenPermute;
	      }
	    }
	  }

	  // Low/high half moves. As above, these are SSE1-only because SSE2 targets
	  // widen such masks to v2f64 before reaching this function.
	  if (!Subtarget.hasSSE2()) {
	    if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
	      return DAG.getNode(X86ISD::MOVLHPS, DL, VT, V1, V2);
	    if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
	      return DAG.getNode(X86ISD::MOVHLPS, DL, VT, V2, V1);
	  }

	  // Masks that are exactly an unpack pattern get the unpack instruction.
	  if (SDValue Unpacked = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Nothing cheaper matched: use the general SHUFPS-based strategy.
	  return lowerShuffleWithSHUFPS(DL, VT, Mask, V1, V2, DAG);
	}

	/// Lower a 4-element shuffle of 32-bit integer lanes (v4i32).
	///
	/// Integer-domain shuffles are used wherever one fits; for blends the
	/// floating point domain blend instructions are used instead.
	///
	/// \param Mask      Four shuffle indices; values >= 4 select from \p V2.
	/// \param Zeroable  Forwarded to the extend / shift / blend helpers.
	/// \returns the lowered node; the final SHUFPS-based path always succeeds.
	static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	  const MVT VT = MVT::v4i32;

	  // A zero/any-extend is strictly faster than every alternative and often
	  // lets a memory operand fold into the shuffle, so it is tried first.
	  if (SDValue Extended = lowerShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	    return Extended;
	  }

	  const int V2Refs = count_if(Mask, [](int Elt) { return Elt >= 4; });

	  // ---- Single-input shuffles -------------------------------------------
	  if (V2Refs == 0) {
	    // Broadcast is only attempted when more than one lane is defined.
	    const auto DefinedLanes =
	        count_if(Mask, [](int Elt) { return Elt >= 0 && Elt < 4; });
	    if (DefinedLanes > 1) {
	      if (SDValue Splat =
	              lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	        return Splat;
	      }
	    }

	    // From SSE2 on, PSHUFD does any single-input permute in one fast
	    // instruction. The mask is nudged into UNPCK-compatible form when it is
	    // equivalent to one, but UNPCK itself is not emitted because that would
	    // block folding a load into the instruction or making a copy.
	    const int LoPairs[] = {0, 0, 1, 1};
	    const int HiPairs[] = {2, 2, 3, 3};
	    ArrayRef<int> PermMask = Mask;
	    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1})) {
	      PermMask = LoPairs;
	    } else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3})) {
	      PermMask = HiPairs;
	    }

	    return DAG.getNode(X86ISD::PSHUFD, DL, VT, V1,
	                       getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
	  }

	  // ---- Two-input shuffles ----------------------------------------------
	  if (Subtarget.hasAVX2()) {
	    if (SDValue Perm = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
	      return Perm;
	  }

	  // Shift instructions.
	  if (SDValue Shifted = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG)) {
	    return Shifted;
	  }

	  // Blends that take exactly one element from V2 have special lowerings.
	  if (V2Refs == 1) {
	    if (SDValue Inserted = lowerShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	      return Inserted;
	    }
	  }

	  // Every blend-related decision below must use this one predicate.
	  const bool HasBlend = Subtarget.hasSSE41();
	  if (HasBlend) {
	    if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	                                              Subtarget, DAG)) {
	      return Blended;
	    }
	  }

	  if (SDValue BitMasked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
	                                                Subtarget, DAG)) {
	    return BitMasked;
	  }

	  // Masks that are exactly an unpack pattern get the unpack instruction.
	  if (SDValue Unpacked = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Byte rotation. Before SSSE3 it is more profitable to use
	  // shuffles/unpacks, so this is gated on SSSE3.
	  if (Subtarget.hasSSSE3()) {
	    if (Subtarget.hasVLX()) {
	      if (SDValue Aligned =
	              lowerShuffleAsVALIGN(DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	        return Aligned;
	      }
	    }

	    if (SDValue Rotated =
	            lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	      return Rotated;
	    }
	  }

	  // A single SHUFPS is assumed to beat any multi-instruction alternative,
	  // even with a domain penalty; a later pass can fix CPUs harmed by the
	  // domain switch. So the alternatives are only tried when one SHUFPS is
	  // not enough.
	  if (!isSingleSHUFPSMask(Mask)) {
	    // With direct blend support, decomposing into permutes plus a blend is
	    // faster than crossing domains.
	    if (HasBlend) {
	      return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
	                                                  Subtarget, DAG);
	    }

	    // Otherwise try permuting the inputs so that an unpack finishes the job.
	    if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
	            DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	      return PermUnpack;
	    }
	  }

	  // Final fallback: SHUFPS, since it can blend from two vectors. The inputs
	  // are built up with SHUFPS as well (rather than PSHUFD) to avoid domain
	  // shift penalties on Nehalem and older; newer chips do not care.
	  SDValue FloatV1 = DAG.getBitcast(MVT::v4f32, V1);
	  SDValue FloatV2 = DAG.getBitcast(MVT::v4f32, V2);
	  SDValue FloatShuffle =
	      DAG.getVectorShuffle(MVT::v4f32, DL, FloatV1, FloatV2, Mask);
	  return DAG.getBitcast(VT, FloatShuffle);
	}

	/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
	/// shuffle lowering, and the most complex part.
	///
	/// The lowering strategy is to try to form pairs of input lanes which are
	/// targeted at the same half of the final vector, and then use a dword shuffle
	/// to place them onto the right half, and finally unpack the paired lanes into
	/// their final position.
	///
	/// The exact breakdown of how to form these dword pairs and align them on the
	/// correct sides is really tricky. See the comments within the function for
	/// more of the details.
	///
	/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
	/// lane must shuffle the exact same way. In fact, you must pass a v8 Mask to
	/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
	/// vector, form the analogous 128-bit 8-element Mask.
	///
	/// \p Mask is rewritten in place while lowering (both directly and through
	/// the LoMask/HiMask slices, and again when this routine recurses), so callers
	/// must pass a scratch copy. The returned value is built from PSHUFLW,
	/// PSHUFHW and PSHUFD nodes applied to \p V.
	static SDValue lowerV8I16GeneralSingleInputShuffle(
	const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
	// i32 vector type with half as many elements as VT; V is bitcast to this
	// type for every PSHUFD dword shuffle below.
	MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

	assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
	// LoMask/HiMask are slices of Mask (elements 0-3 and 4-7); writes through
	// them modify Mask itself.
	MutableArrayRef<int> LoMask = Mask.slice(0, 4);
	MutableArrayRef<int> HiMask = Mask.slice(4, 4);

	// Attempt to directly match PSHUFLW or PSHUFHW.
	if (isUndefOrInRange(LoMask, 0, 4) &&
	isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
	return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
	}
	if (isUndefOrInRange(HiMask, 4, 8) &&
	isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
	// Rebase the defined high-half indices to 0-3 for the PSHUFHW immediate.
	for (int i = 0; i != 4; ++i)
	HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
	return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
	}

	// Gather the distinct, sorted, non-negative source indices referenced by
	// each half of the mask.
	SmallVector<int, 4> LoInputs;
	copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
	array_pod_sort(LoInputs.begin(), LoInputs.end());
	LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
	SmallVector<int, 4> HiInputs;
	copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
	array_pod_sort(HiInputs.begin(), HiInputs.end());
	HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
	// Split each sorted list at index 4: entries below 4 come from the low half
	// of V, the remainder from the high half. "XToY" means inputs read from
	// half X and used by mask half Y.
	int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
	int NumHToL = LoInputs.size() - NumLToL;
	int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
	int NumHToH = HiInputs.size() - NumLToH;
	MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
	MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
	MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
	MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

	// If we are shuffling values from one half - check how many different DWORD
	// pairs we need to create. If only 1 or 2 then we can perform this as a
	// PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
	// Note: this lambda overwrites the captured V at each step.
	auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
	ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
	V = DAG.getNode(ShufWOp, DL, VT, V,
	getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
	V = DAG.getBitcast(PSHUFDVT, V);
	V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
	return DAG.getBitcast(VT, V);
	};

	if ((NumHToL + NumHToH) == 0 \|\| (NumLToL + NumLToH) == 0) {
	int PSHUFDMask[4] = { -1, -1, -1, -1 };
	SmallVector<std::pair<int, int>, 4> DWordPairs;
	// Dword index base of the half that supplies all inputs: 0 when everything
	// comes from the low half, 2 when everything comes from the high half.
	int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);

	// Collect the different DWORD pairs.
	for (int DWord = 0; DWord != 4; ++DWord) {
	int M0 = Mask[2 * DWord + 0];
	int M1 = Mask[2 * DWord + 1];
	M0 = (M0 >= 0 ? M0 % 4 : M0);
	M1 = (M1 >= 0 ? M1 % 4 : M1);
	if (M0 < 0 && M1 < 0)
	continue;

	// Reuse an already collected pair when it is compatible with (M0, M1),
	// filling in any element of that pair that was still undefined.
	bool Match = false;
	for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
	auto &DWordPair = DWordPairs[j];
	if ((M0 < 0 \|\| isUndefOrEqual(DWordPair.first, M0)) &&
	(M1 < 0 \|\| isUndefOrEqual(DWordPair.second, M1))) {
	DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
	DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
	PSHUFDMask[DWord] = DOffset + j;
	Match = true;
	break;
	}
	}
	if (!Match) {
	PSHUFDMask[DWord] = DOffset + DWordPairs.size();
	DWordPairs.push_back(std::make_pair(M0, M1));
	}
	}

	if (DWordPairs.size() <= 2) {
	// Pad to exactly two pairs with undef so the half mask has four entries.
	DWordPairs.resize(2, std::make_pair(-1, -1));
	int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
	DWordPairs[1].first, DWordPairs[1].second};
	if ((NumHToL + NumHToH) == 0)
	return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
	if ((NumLToL + NumLToH) == 0)
	return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
	}
	}

	// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
	// such inputs we can swap two of the dwords across the half mark and end up
	// with <=2 inputs to each half in each half. Once there, we can fall through
	// to the generic code below. For example:
	//
	// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
	// Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
	//
	// However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
	// and an existing 2-into-2 on the other half. In this case we may have to
	// pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
	// 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
	// Fortunately, we don't have to handle anything but a 2-into-2 pattern
	// because any other situation (including a 3-into-1 or 1-into-3 in the other
	// half than the one we target for fixing) will be fixed when we re-enter this
	// path. We will also combine away any sequence of PSHUFD instructions that
	// result into a single instruction. Here is an example of the tricky case:
	//
	// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
	// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
	//
	// This now has a 1-into-3 in the high half! Instead, we do two shuffles:
	//
	// Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
	// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
	//
	// Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
	// Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
	//
	// The result is fine to be handled by the generic logic.
	auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
	ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
	int AOffset, int BOffset) {
	assert((AToAInputs.size() == 3 \|\| AToAInputs.size() == 1) &&
	"Must call this with A having 3 or 1 inputs from the A half.");
	assert((BToAInputs.size() == 1 \|\| BToAInputs.size() == 3) &&
	"Must call this with B having 1 or 3 inputs from the B half.");
	assert(AToAInputs.size() + BToAInputs.size() == 4 &&
	"Must call this with either 3:1 or 1:3 inputs (summing to 4).");

	bool ThreeAInputs = AToAInputs.size() == 3;

	// Compute the index of dword with only one word among the three inputs in
	// a half by taking the sum of the half with three inputs and subtracting
	// the sum of the actual three inputs. The difference is the remaining
	// slot.
	int ADWord = 0, BDWord = 0;
	int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
	int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
	int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
	ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
	int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
	int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
	int TripleNonInputIdx =
	TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
	TripleDWord = TripleNonInputIdx / 2;

	// We use xor with one to compute the adjacent DWord to whichever one the
	// OneInput is in.
	OneInputDWord = (OneInput / 2) ^ 1;

	// Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
	// and BToA inputs. If there is also such a problem with the BToB and AToB
	// inputs, we don't try to fix it necessarily -- we'll recurse and see it in
	// the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
	// is essential that we don't create a 3<-1 as then we might oscillate.
	if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
	// Compute how many inputs will be flipped by swapping these DWords. We
	// need
	// to balance this to ensure we don't form a 3-1 shuffle in the other
	// half.
	int NumFlippedAToBInputs =
	std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
	std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
	int NumFlippedBToBInputs =
	std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
	std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
	if ((NumFlippedAToBInputs == 1 &&
	(NumFlippedBToBInputs == 0 \|\| NumFlippedBToBInputs == 2)) \|\|
	(NumFlippedBToBInputs == 1 &&
	(NumFlippedAToBInputs == 0 \|\| NumFlippedAToBInputs == 2))) {
	// We choose whether to fix the A half or B half based on whether that
	// half has zero flipped inputs. At zero, we may not be able to fix it
	// with that half. We also bias towards fixing the B half because that
	// will more commonly be the high half, and we have to bias one way.
	auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
	ArrayRef<int> Inputs) {
	int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
	bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
	// Determine whether the free index is in the flipped dword or the
	// unflipped dword based on where the pinned index is. We use this bit
	// in an xor to conditionally select the adjacent dword.
	int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
	bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
	if (IsFixIdxInput == IsFixFreeIdxInput)
	FixFreeIdx += 1;
	IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
	assert(IsFixIdxInput != IsFixFreeIdxInput &&
	"We need to be changing the number of flipped inputs!");
	// Swap the two words with a half-word shuffle, choosing PSHUFLW or
	// PSHUFHW by which half FixIdx lies in.
	int PSHUFHalfMask[] = {0, 1, 2, 3};
	std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
	V = DAG.getNode(
	FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
	MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
	getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

	// Mirror the swap in the mask so it keeps referring to the same data.
	for (int &M : Mask)
	if (M >= 0 && M == FixIdx)
	M = FixFreeIdx;
	else if (M >= 0 && M == FixFreeIdx)
	M = FixIdx;
	};
	if (NumFlippedBToBInputs != 0) {
	int BPinnedIdx =
	BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
	FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
	} else {
	assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
	int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
	FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
	}
	}
	}

	// Swap dwords ADWord and BDWord with a single PSHUFD.
	int PSHUFDMask[] = {0, 1, 2, 3};
	PSHUFDMask[ADWord] = BDWord;
	PSHUFDMask[BDWord] = ADWord;
	V = DAG.getBitcast(
	VT,
	DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

	// Adjust the mask to match the new locations of A and B.
	for (int &M : Mask)
	if (M >= 0 && M/2 == ADWord)
	M = 2 * BDWord + M % 2;
	else if (M >= 0 && M/2 == BDWord)
	M = 2 * ADWord + M % 2;

	// Recurse back into this routine to re-compute state now that this isn't
	// a 3 and 1 problem.
	return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
	};
	if ((NumLToL == 3 && NumHToL == 1) \|\| (NumLToL == 1 && NumHToL == 3))
	return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
	if ((NumHToH == 3 && NumLToH == 1) \|\| (NumHToH == 1 && NumLToH == 3))
	return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

	// At this point there are at most two inputs to the low and high halves from
	// each half. That means the inputs can always be grouped into dwords and
	// those dwords can then be moved to the correct half with a dword shuffle.
	// We use at most one low and one high word shuffle to collect these paired
	// inputs into dwords, and finally a dword shuffle to place them.
	int PSHUFLMask[4] = {-1, -1, -1, -1};
	int PSHUFHMask[4] = {-1, -1, -1, -1};
	int PSHUFDMask[4] = {-1, -1, -1, -1};

	// First fix the masks for all the inputs that are staying in their
	// original halves. This will then dictate the targets of the cross-half
	// shuffles.
	auto fixInPlaceInputs =
	[&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
	MutableArrayRef<int> SourceHalfMask,
	MutableArrayRef<int> HalfMask, int HalfOffset) {
	if (InPlaceInputs.empty())
	return;
	if (InPlaceInputs.size() == 1) {
	// A lone in-place input keeps its word and its dword where they are.
	SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
	InPlaceInputs[0] - HalfOffset;
	PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
	return;
	}
	if (IncomingInputs.empty()) {
	// Just fix all of the in place inputs.
	for (int Input : InPlaceInputs) {
	SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
	PSHUFDMask[Input / 2] = Input / 2;
	}
	return;
	}

	assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
	SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
	InPlaceInputs[0] - HalfOffset;
	// Put the second input next to the first so that they are packed into
	// a dword. We find the adjacent index by toggling the low bit.
	int AdjIndex = InPlaceInputs[0] ^ 1;
	SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
	std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
	PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
	};
	fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
	fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

	// Now gather the cross-half inputs and place them into a free dword of
	// their target half.
	// FIXME: This operation could almost certainly be simplified dramatically to
	// look more like the 3-1 fixing operation.
	auto moveInputsToRightHalf = [&PSHUFDMask](
	MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
	MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
	MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
	int DestOffset) {
	// A word is "clobbered" when the source half mask already routes a
	// different word into its slot.
	auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
	return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
	};
	auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
	int Word) {
	int LowWord = Word & ~1;
	int HighWord = Word \| 1;
	return isWordClobbered(SourceHalfMask, LowWord) \|\|
	isWordClobbered(SourceHalfMask, HighWord);
	};

	if (IncomingInputs.empty())
	return;

	if (ExistingInputs.empty()) {
	// Map any dwords with inputs from them into the right half.
	for (int Input : IncomingInputs) {
	// If the source half mask maps over the inputs, turn those into
	// swaps and use the swapped lane.
	if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
	if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
	SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
	Input - SourceOffset;
	// We have to swap the uses in our half mask in one sweep.
	for (int &M : HalfMask)
	if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
	M = Input;
	else if (M == Input)
	M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
	} else {
	assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
	Input - SourceOffset &&
	"Previous placement doesn't match!");
	}
	// Note that this correctly re-maps both when we do a swap and when
	// we observe the other side of the swap above. We rely on that to
	// avoid swapping the members of the input list directly.
	Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
	}

	// Map the input's dword into the correct half.
	if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
	PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
	else
	assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
	Input / 2 &&
	"Previous placement doesn't match!");
	}

	// And just directly shift any other-half mask elements to be same-half
	// as we will have mirrored the dword containing the element into the
	// same position within that half.
	for (int &M : HalfMask)
	if (M >= SourceOffset && M < SourceOffset + 4) {
	M = M - SourceOffset + DestOffset;
	assert(M >= 0 && "This should never wrap below zero!");
	}
	return;
	}

	// Ensure we have the input in a viable dword of its current half. This
	// is particularly tricky because the original position may be clobbered
	// by inputs being moved and staying in that half.
	if (IncomingInputs.size() == 1) {
	if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
	// Relocate the input to the first unused slot of the source half mask.
	int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
	SourceOffset;
	SourceHalfMask[InputFixed - SourceOffset] =
	IncomingInputs[0] - SourceOffset;
	std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
	InputFixed);
	IncomingInputs[0] = InputFixed;
	}
	} else if (IncomingInputs.size() == 2) {
	if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 \|\|
	isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
	// We have two non-adjacent or clobbered inputs we need to extract from
	// the source half. To do this, we need to map them into some adjacent
	// dword slot in the source mask.
	int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
	IncomingInputs[1] - SourceOffset};

	// If there is a free slot in the source half mask adjacent to one of
	// the inputs, place the other input in it. We use (Index XOR 1) to
	// compute an adjacent index.
	if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
	SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
	SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
	SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
	InputsFixed[1] = InputsFixed[0] ^ 1;
	} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
	SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
	SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
	SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
	InputsFixed[0] = InputsFixed[1] ^ 1;
	} else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
	SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
	// The two inputs are in the same DWord but it is clobbered and the
	// adjacent DWord isn't used at all. Move both inputs to the free
	// slot.
	SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
	SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
	// NOTE(review): InputsFixed[1] is computed from the already updated
	// InputsFixed[0], so the xor flips back and it evaluates to the odd
	// word of the ORIGINAL dword rather than the odd word of the free
	// dword. The code below only matches mask entries against it and uses
	// its parity, so the result appears unaffected -- TODO confirm, and
	// consider computing both values from the pre-update InputsFixed[0].
	InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
	InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
	} else {
	// The only way we hit this point is if there is no clobbering
	// (because there are no off-half inputs to this half) and there is no
	// free slot adjacent to one of the inputs. In this case, we have to
	// swap an input with a non-input.
	for (int i = 0; i < 4; ++i)
	assert((SourceHalfMask[i] < 0 \|\| SourceHalfMask[i] == i) &&
	"We can't handle any clobbers here!");
	assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
	"Cannot have adjacent inputs here!");

	SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
	SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

	// We also have to update the final source mask in this case because
	// it may need to undo the above swap.
	for (int &M : FinalSourceHalfMask)
	if (M == (InputsFixed[0] ^ 1) + SourceOffset)
	M = InputsFixed[1] + SourceOffset;
	else if (M == InputsFixed[1] + SourceOffset)
	M = (InputsFixed[0] ^ 1) + SourceOffset;

	InputsFixed[1] = InputsFixed[0] ^ 1;
	}

	// Point everything at the fixed inputs.
	for (int &M : HalfMask)
	if (M == IncomingInputs[0])
	M = InputsFixed[0] + SourceOffset;
	else if (M == IncomingInputs[1])
	M = InputsFixed[1] + SourceOffset;

	IncomingInputs[0] = InputsFixed[0] + SourceOffset;
	IncomingInputs[1] = InputsFixed[1] + SourceOffset;
	}
	} else {
	llvm_unreachable("Unhandled input size!");
	}

	// Now hoist the DWord down to the right half.
	// Pick the first dword of the destination half if it is still unassigned,
	// otherwise the second one.
	int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
	assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
	PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
	for (int &M : HalfMask)
	for (int Input : IncomingInputs)
	if (M == Input)
	M = FreeDWord * 2 + Input % 2;
	};
	moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
	/SourceOffset/ 4, /DestOffset/ 0);
	moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
	/SourceOffset/ 0, /DestOffset/ 4);

	// Now enact all the shuffles we've computed to move the inputs into their
	// target half.
	if (!isNoopShuffleMask(PSHUFLMask))
	V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
	if (!isNoopShuffleMask(PSHUFHMask))
	V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
	if (!isNoopShuffleMask(PSHUFDMask))
	V = DAG.getBitcast(
	VT,
	DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

	// At this point, each half should contain all its inputs, and we can then
	// just shuffle them into their final position.
	assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
	"Failed to lift all the high half inputs to the low mask!");
	assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
	"Failed to lift all the low half inputs to the high mask!");

	// Do a half shuffle for the low mask.
	if (!isNoopShuffleMask(LoMask))
	V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

	// Do a half shuffle with the high mask after shifting its values down.
	for (int &M : HiMask)
	if (M >= 0)
	M -= 4;
	if (!isNoopShuffleMask(HiMask))
	V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

	return V;
	}

	/// Build a shuffle+blend out of PSHUFB nodes, skipping the blend when only
	/// one of the two inputs actually contributes bytes.
	///
	/// Each input gets its own byte-selector vector. A selector byte of 0x80 is
	/// used for positions that input does not supply (and for zeroable lanes);
	/// the two shuffled inputs are then combined with an OR when both are live.
	///
	/// \param Mask      Element-granular shuffle mask; must not cross 128-bit
	///                  lanes.
	/// \param Zeroable  Per-element flags; a set bit forces 0x80 selectors.
	/// \param V1InUse   Out: set when any selector byte for \p V1 is not 0x80.
	/// \param V2InUse   Out: set when any selector byte for \p V2 is not 0x80.
	/// \returns the combined value bitcast back to \p VT.
	static SDValue lowerShuffleAsBlendOfPSHUFBs(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
	  assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
	         "Lane crossing shuffle masks not supported");

	  const int NumBytes = VT.getSizeInBits() / 8;
	  const int Size = Mask.size();
	  // Number of bytes covered by one mask element.
	  const int Scale = NumBytes / Size;

	  // Selector value used for bytes an input does not provide.
	  const int ZeroMask = 0x80;

	  SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
	  SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
	  V1InUse = false;
	  V2InUse = false;

	  for (int Byte = 0; Byte < NumBytes; ++Byte) {
	    const int Elt = Byte / Scale;
	    const int Src = Mask[Elt];
	    // Undefined elements keep their undef selector bytes.
	    if (Src < 0)
	      continue;

	    // Start with both selectors "zero" and fill in the one live side, unless
	    // the element is zeroable, in which case both stay zero.
	    int V1Idx = ZeroMask;
	    int V2Idx = ZeroMask;
	    if (!Zeroable[Elt]) {
	      const int ByteInElt = Byte % Scale;
	      if (Src < Size)
	        V1Idx = Src * Scale + ByteInElt;
	      else
	        V2Idx = (Src - Size) * Scale + ByteInElt;
	    }

	    V1Mask[Byte] = DAG.getConstant(V1Idx, DL, MVT::i8);
	    V2Mask[Byte] = DAG.getConstant(V2Idx, DL, MVT::i8);
	    if (V1Idx != ZeroMask)
	      V1InUse = true;
	    if (V2Idx != ZeroMask)
	      V2InUse = true;
	  }

	  // Shuffle each live input with its selector vector.
	  const MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
	  if (V1InUse) {
	    V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
	                     DAG.getBuildVector(ShufVT, DL, V1Mask));
	  }
	  if (V2InUse) {
	    V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
	                     DAG.getBuildVector(ShufVT, DL, V2Mask));
	  }

	  // With a single live input that input is the answer (V2 when neither is
	  // live); with two live inputs OR the shuffled halves together.
	  SDValue Combined = V1InUse ? V1 : V2;
	  if (V1InUse && V2InUse)
	    Combined = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);

	  // Return in the caller's requested type.
	  return DAG.getBitcast(VT, Combined);
	}

	/// Generic lowering of 8-lane i16 shuffles.
	///
	/// This handles both single-input shuffles and combined shuffle/blends with
	/// two inputs. The single input shuffles are immediately delegated to
	/// a dedicated lowering routine.
	///
	/// The blends are lowered in one of three fundamental ways. If there are few
	/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
	/// of the input is significantly cheaper when lowered as an interleaving of
	/// the two inputs, try to interleave them. Otherwise, blend the low and high
	/// halves of the inputs separately (making them have relatively few inputs)
	/// and then concatenate them.
	///
	/// Both operands must be v8i16 and \p Mask must have exactly 8 entries (this
	/// is asserted). Mask entries in [0, 8) select from \p V1 and entries >= 8
	/// select from \p V2. The strategies below are tried in priority order and
	/// the first one that produces a value wins, so the order of the attempts is
	/// part of the behavior of this routine.
	static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
	assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative.
	if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return ZExt;

	// Number of mask entries that read from the second operand.
	int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

	// Single-input path: nothing is taken from V2, so every strategy below
	// either returns or falls through to the general single-input lowering at
	// the end of this block.
	if (NumV2Inputs == 0) {
	// Try to use shift instructions.
	if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	// Try to use bit rotation instructions.
	if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
	Subtarget, DAG))
	return Rotate;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
	return V;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
	Subtarget))
	return V;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
	Subtarget, DAG))
	return Rotate;

	// Make a copy of the mask so it can be modified.
	SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
	return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
	Subtarget, DAG);
	}

	// From here on at least one element comes from V2; the assert documents
	// that at least one element must also come from V1.
	assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
	"All single-input shuffles should be canonicalized to be V1-input "
	"shuffles.");

	// Try to use shift instructions.
	if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// See if we can use SSE4A Extraction / Insertion.
	if (Subtarget.hasSSE4A())
	if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
	Zeroable, DAG))
	return V;

	// There are special ways we can lower some single-element blends.
	if (NumV2Inputs == 1)
	if (SDValue V = lowerShuffleAsElementInsertion(
	DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return V;

	// We have different paths for blend lowering, but they all must use the
	// exact same predicate.
	bool IsBlendSupported = Subtarget.hasSSE41();
	if (IsBlendSupported)
	if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Masked;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
	return V;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
	Subtarget))
	return V;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
	Subtarget, DAG))
	return Rotate;

	if (SDValue BitBlend =
	lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
	return BitBlend;

	// Try to use byte shift instructions to mask.
	if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return V;

	// Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
	// We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
	// be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
	int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
	if ((NumEvenDrops == 1 \|\| NumEvenDrops == 2) && Subtarget.hasSSE41() &&
	!Subtarget.hasVLX()) {
	// Build a v4i32 AND mask. Every dword starts as zero; the loop then sets
	// each (1 << (NumEvenDrops - 1))-th dword to 0xFFFF, i.e. all four dwords
	// when NumEvenDrops == 1 and dwords 0 and 2 when NumEvenDrops == 2.
	SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
	for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
	DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
	SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
	// Apply the mask to both inputs, viewed as v4i32.
	V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
	DWordClearMask);
	V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
	DWordClearMask);
	// Now pack things back together.
	SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
	// A second drop needs a second pack, this time of the result with itself.
	if (NumEvenDrops == 2) {
	Result = DAG.getBitcast(MVT::v4i32, Result);
	Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
	}
	return Result;
	}

	// Try to lower by permuting the inputs into an unpack instruction.
	if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
	Mask, Subtarget, DAG))
	return Unpack;

	// If we can't directly blend but can use PSHUFB, that will be better as it
	// can both shuffle and set up the inefficient blend.
	if (!IsBlendSupported && Subtarget.hasSSSE3()) {
	// The helper reports which inputs it used through these out-parameters;
	// this caller does not read them afterwards.
	bool V1InUse, V2InUse;
	return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
	Zeroable, DAG, V1InUse, V2InUse);
	}

	// We can always bit-blend if we have to so the fallback strategy is to
	// decompose into single-input permutes and blends.
	return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
	Mask, Subtarget, DAG);
	}

	/// Lower a shuffle as a variable permute node.
	///
	/// The shuffle mask is materialized as a constant vector whose integer
	/// elements have the same bit width and count as the elements of \p VT.
	/// When \p V2 is undef a single-source X86ISD::VPERMV node is produced;
	/// otherwise a two-source X86ISD::VPERMV3 node is produced.
	static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
	                                     ArrayRef<int> Mask, SDValue V1,
	                                     SDValue V2, SelectionDAG &DAG) {
	  // Integer vector type that carries the permute indices.
	  MVT IndexVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
	                                 VT.getVectorNumElements());
	  SDValue Indices = getConstVector(Mask, IndexVT, DAG, DL, true);

	  // Note the operand order differs between the two node kinds: VPERMV takes
	  // the indices first, VPERMV3 takes them between the two sources.
	  return V2.isUndef()
	             ? DAG.getNode(X86ISD::VPERMV, DL, VT, Indices, V1)
	             : DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, Indices, V2);
	}

	/// Generic lowering of v16i8 shuffles.
	///
	/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
	/// detect any complexity reducing interleaving. If that doesn't help, it uses
	/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
	/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
	/// back together.
	///
	/// Both operands must be v16i8 and \p Mask must have exactly 16 entries (this
	/// is asserted). Mask entries in [0, 16) select from \p V1 and entries >= 16
	/// select from \p V2. Strategies are tried in priority order and the first
	/// one that produces a value wins.
	static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
	assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	// Try to use shift instructions.
	if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
	Subtarget, DAG))
	return Rotate;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
	Subtarget))
	return V;

	// Try to use a zext lowering.
	if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return ZExt;

	// See if we can use SSE4A Extraction / Insertion.
	if (Subtarget.hasSSE4A())
	if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, DAG))
	return V;

	// Number of mask entries that read from the second operand.
	int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

	// For single-input shuffles, there are some nicer lowering tricks we can use.
	if (NumV2Elements == 0) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	// Try to use bit rotation instructions.
	if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
	Subtarget, DAG))
	return Rotate;

	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
	return V;

	// Check whether we can widen this to an i16 shuffle by duplicating bytes.
	// Notably, this handles splat and partial-splat shuffles more efficiently.
	// However, it only makes sense if the pre-duplication shuffle simplifies
	// things significantly. Currently, this means we need to be able to
	// express the pre-duplication shuffle as an i16 shuffle.
	//
	// FIXME: We should check for other patterns which can be widened into an
	// i16 shuffle as well.
	//
	// The predicate rejects the mask as soon as some byte pair (2k, 2k+1) has
	// two defined entries that differ; pairs with an undefined entry pass.
	auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
	for (int i = 0; i < 16; i += 2)
	if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
	return false;

	return true;
	};
	// Returns the lowered value, or an empty SDValue when this strategy does
	// not apply. Captures by reference and overwrites V1 on the success path.
	auto tryToWidenViaDuplication = [&]() -> SDValue {
	if (!canWidenViaDuplication(Mask))
	return SDValue();
	// Sorted, de-duplicated source bytes taken from the low half (0..7) ...
	SmallVector<int, 4> LoInputs;
	copy_if(Mask, std::back_inserter(LoInputs),
	[](int M) { return M >= 0 && M < 8; });
	array_pod_sort(LoInputs.begin(), LoInputs.end());
	LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
	LoInputs.end());
	// ... and from the high half (8 and up).
	SmallVector<int, 4> HiInputs;
	copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
	array_pod_sort(HiInputs.begin(), HiInputs.end());
	HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
	HiInputs.end());

	// Keep the half with at least as many distinct inputs in place and move
	// the inputs of the other half into it.
	bool TargetLo = LoInputs.size() >= HiInputs.size();
	ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
	ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

	// PreDupI16Shuffle is the v8i16 shuffle applied before duplication.
	// LaneMap records, for each source byte, the byte position it occupies
	// after that shuffle.
	int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
	SmallDenseMap<int, int, 8> LaneMap;
	// In-place inputs keep their i16 slot and therefore their byte position.
	for (int I : InPlaceInputs) {
	PreDupI16Shuffle[I/2] = I/2;
	LaneMap[I] = I;
	}
	// Candidate i16 slots for the moving inputs: [0, 4) when targeting the low
	// half, [4, 8) when targeting the high half.
	int j = TargetLo ? 0 : 4, je = j + 4;
	for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
	// Check if j is already a shuffle of this input. This happens when
	// there are two adjacent bytes after we move the low one.
	if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
	// If we haven't yet mapped the input, search for a slot into which
	// we can map it.
	while (j < je && PreDupI16Shuffle[j] >= 0)
	++j;

	if (j == je)
	// We can't place the inputs into a single half with a simple i16 shuffle, so bail.
	return SDValue();

	// Map this input with the i16 shuffle.
	PreDupI16Shuffle[j] = MovingInputs[i] / 2;
	}

	// Update the lane map based on the mapping we ended up with.
	LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
	}
	// Apply the pre-duplication shuffle to V1, viewed as v8i16.
	V1 = DAG.getBitcast(
	MVT::v16i8,
	DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
	DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

	// Unpack the bytes to form the i16s that will be shuffled into place.
	// An unpack operand is only supplied if some even (first operand) or odd
	// (second operand) result byte is actually defined; otherwise undef is
	// used for that operand.
	bool EvenInUse = false, OddInUse = false;
	for (int i = 0; i < 16; i += 2) {
	EvenInUse \|= (Mask[i + 0] >= 0);
	OddInUse \|= (Mask[i + 1] >= 0);
	if (EvenInUse && OddInUse)
	break;
	}
	V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
	MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
	OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));

	// Build the final v8i16 shuffle. Each defined result byte i contributes
	// its mapped position to i16 slot i / 2; when the high half was the
	// target, positions are rebased by subtracting 8.
	int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
	for (int i = 0; i < 16; ++i)
	if (Mask[i] >= 0) {
	int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
	assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
	if (PostDupI16Shuffle[i / 2] < 0)
	PostDupI16Shuffle[i / 2] = MappedMask;
	else
	assert(PostDupI16Shuffle[i / 2] == MappedMask &&
	"Conflicting entries in the original shuffle!");
	}
	return DAG.getBitcast(
	MVT::v16i8,
	DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
	DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
	};
	if (SDValue V = tryToWidenViaDuplication())
	return V;
	}

	if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Masked;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
	return V;

	// Try to use byte shift instructions to mask.
	if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return V;

	// Check for compaction patterns.
	// NumEvenDrops is computed here but also consumed further down, after the
	// PSHUFB path, so it must stay ahead of both uses.
	bool IsSingleInput = V2.isUndef();
	int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);

	// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
	// with PSHUFB. It is important to do this before we attempt to generate any
	// blends but after all of the single-input lowerings. If the single input
	// lowerings can find an instruction sequence that is faster than a PSHUFB, we
	// want to preserve that and we can DAG combine any longer sequences into
	// a PSHUFB in the end. But once we start blending from multiple inputs,
	// the complexity of DAG combining bad patterns back into PSHUFB is too high,
	// and there are very few patterns that would actually be faster than the
	// PSHUFB approach because of its ability to zero lanes.
	//
	// If the mask is a binary compaction, we can more efficiently perform this
	// as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
	//
	// FIXME: The only exceptions to the above are blends which are exact
	// interleavings with direct instructions supporting them. We currently don't
	// handle those well here.
	if (Subtarget.hasSSSE3() && (IsSingleInput \|\| NumEvenDrops != 1)) {
	// Set by the helper below to report which inputs its result reads.
	bool V1InUse = false;
	bool V2InUse = false;

	// Build the PSHUFB-based lowering up front; it is returned at the end of
	// this block unless one of the two-input alternatives below succeeds.
	SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

	// If both V1 and V2 are in use and we can use a direct blend or an unpack,
	// do so. This avoids using them to handle blends-with-zero which is
	// important as a single pshufb is significantly faster for that.
	if (V1InUse && V2InUse) {
	if (Subtarget.hasSSE41())
	if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// We can use an unpack to do the blending rather than an or in some
	// cases. Even though the or may be (very minorly) more efficient, we
	// preference this lowering because there are common cases where part of
	// the complexity of the shuffles goes away when we do the final blend as
	// an unpack.
	// FIXME: It might be worth trying to detect if the unpack-feeding
	// shuffles will both be pshufb, in which case we shouldn't bother with
	// this.
	if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
	DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
	return Unpack;

	// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
	if (Subtarget.hasVBMI() && Subtarget.hasVLX())
	return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);

	// Use PALIGNR+Permute if possible - permute might become PSHUFB but the
	// PALIGNR will be cheaper than the second PSHUFB+OR.
	if (SDValue V = lowerShuffleAsByteRotateAndPermute(
	DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
	return V;
	}

	return PSHUFB;
	}

	// There are special ways we can lower some single-element blends.
	if (NumV2Elements == 1)
	if (SDValue V = lowerShuffleAsElementInsertion(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return V;

	if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
	return Blend;

	// Check whether a compaction lowering can be done. This handles shuffles
	// which take every Nth element for some even N. See the helper function for
	// details.
	//
	// We special case these as they can be particularly efficiently handled with
	// the PACKUSB instruction on x86 and they show up in common patterns of
	// rearranging bytes to truncate wide elements.
	if (NumEvenDrops) {
	// NumEvenDrops is the power of two stride of the elements. Another way of
	// thinking about it is that we need to drop the even elements this many
	// times to get the original input.

	// First we need to zero all the dropped bytes.
	assert(NumEvenDrops <= 3 &&
	"No support for dropping even elements more than 3 times.");
	// Build a v8i16 AND mask: all words start as zero and every
	// (1 << (NumEvenDrops - 1))-th word is set to 0xFF. The stride is 1, 2 or
	// 4 given the assert above, so the `i != 8` loop bound is always hit.
	SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
	for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
	WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
	SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
	V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
	WordClearMask);
	// V2 is only masked when it is a real input.
	if (!IsSingleInput)
	V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
	WordClearMask);

	// Now pack things back together.
	// For a single input, the masked V1 is packed with itself.
	SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
	IsSingleInput ? V1 : V2);
	// Each additional drop packs the previous result with itself once more.
	for (int i = 1; i < NumEvenDrops; ++i) {
	Result = DAG.getBitcast(MVT::v8i16, Result);
	Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
	}
	return Result;
	}

	// Handle multi-input cases by blending single-input shuffles.
	if (NumV2Elements > 0)
	return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask,
	Subtarget, DAG);

	// The fallback path for single-input shuffles widens this into two v8i16
	// vectors with unpacks, shuffles those, and then pulls them back together
	// with a pack.
	SDValue V = V1;

	// LoBlendMask holds the sources for result bytes 0..7 and HiBlendMask those
	// for result bytes 8..15; undefined entries stay -1.
	std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
	std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
	for (int i = 0; i < 16; ++i)
	if (Mask[i] >= 0)
	(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

	SDValue VLoHalf, VHiHalf;
	// Check if any of the odd lanes in the v16i8 are used. If not, we can mask
	// them out and avoid using UNPCK{L,H} to extract the elements of V as
	// i16s.
	if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
	none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
	// Use a mask to drop the high bytes.
	VLoHalf = DAG.getBitcast(MVT::v8i16, V);
	VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
	DAG.getConstant(0x00FF, DL, MVT::v8i16));

	// This will be a single vector shuffle instead of a blend so nuke VHiHalf.
	VHiHalf = DAG.getUNDEF(MVT::v8i16);

	// Squash the masks to point directly into VLoHalf.
	// Every defined entry is an even byte index here, so halving it gives the
	// index of the i16 element that contains that byte.
	for (int &M : LoBlendMask)
	if (M >= 0)
	M /= 2;
	for (int &M : HiBlendMask)
	if (M >= 0)
	M /= 2;
	} else {
	// Otherwise just unpack the low half of V into VLoHalf and the high half into
	// VHiHalf so that we can blend them as i16s.
	SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

	VLoHalf = DAG.getBitcast(
	MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
	VHiHalf = DAG.getBitcast(
	MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
	}

	// Shuffle each half as v8i16, then pack the two results into one v16i8.
	SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
	SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

	return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
	}

	/// Entry point for lowering 128-bit x86 vector shuffles.
	///
	/// Looks at the simple value type of the shuffle and forwards all arguments
	/// to the lowering routine written for that element layout. Types other than
	/// the six handled below are not supported and hit llvm_unreachable.
	static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                  MVT VT, SDValue V1, SDValue V2,
	                                  const APInt &Zeroable,
	                                  const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  const auto Ty = VT.SimpleTy;

	  // 64-bit elements.
	  if (Ty == MVT::v2i64)
	    return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  if (Ty == MVT::v2f64)
	    return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  // 32-bit elements.
	  if (Ty == MVT::v4i32)
	    return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  if (Ty == MVT::v4f32)
	    return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  // 16-bit and 8-bit integer elements.
	  if (Ty == MVT::v8i16)
	    return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  if (Ty == MVT::v16i8)
	    return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  llvm_unreachable("Unimplemented!");
	}

	/// Generic routine to split vector shuffle into half-sized shuffles.
	///
	/// This routine just extracts two subvectors, shuffles them independently, and
	/// then concatenates them back together. This should work effectively with all
	/// AVX vector shuffle types.
	///
	/// \p VT must be at least 256 bits wide and both operands must have type
	/// \p VT (this is asserted). The result is a CONCAT_VECTORS of the two
	/// half-width results.
	static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
	SDValue V2, ArrayRef<int> Mask,
	SelectionDAG &DAG) {
	assert(VT.getSizeInBits() >= 256 &&
	"Only for 256-bit or wider vector shuffles!");
	assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	assert(V2.getSimpleValueType() == VT && "Bad operand type!");

	// The first half of the mask describes the low result half, the second half
	// of the mask describes the high result half.
	ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
	ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

	int NumElements = VT.getVectorNumElements();
	int SplitNumElements = NumElements / 2;
	MVT ScalarVT = VT.getVectorElementType();
	MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);

	// Use splitVector/extractSubVector so that split build-vectors just build two
	// narrower build vectors. This helps shuffling with splats and zeros.
	// Both halves are bitcast to SplitVT because the split is performed on the
	// value found underneath any bitcasts.
	auto SplitVector = [&](SDValue V) {
	SDValue LoV, HiV;
	std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
	return std::make_pair(DAG.getBitcast(SplitVT, LoV),
	DAG.getBitcast(SplitVT, HiV));
	};

	SDValue LoV1, HiV1, LoV2, HiV2;
	std::tie(LoV1, HiV1) = SplitVector(V1);
	std::tie(LoV2, HiV2) = SplitVector(V2);

	// Now create two 4-way blends of these half-width vectors.
	// HalfBlend lowers one half of the result. It may read from any of the four
	// half-width inputs (LoV1, HiV1, LoV2, HiV2).
	auto HalfBlend = [&](ArrayRef<int> HalfMask) {
	bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
	// V1BlendMask / V2BlendMask: shuffle of (LoV1, HiV1) / (LoV2, HiV2) that
	// puts each needed element into its final position.
	// BlendMask: final blend that picks position i from the V1 side (index i)
	// or from the V2 side (index SplitNumElements + i).
	SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
	SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
	SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
	for (int i = 0; i < SplitNumElements; ++i) {
	int M = HalfMask[i];
	if (M >= NumElements) {
	// Element comes from V2; record which half of V2 it lives in.
	if (M >= NumElements + SplitNumElements)
	UseHiV2 = true;
	else
	UseLoV2 = true;
	V2BlendMask[i] = M - NumElements;
	BlendMask[i] = SplitNumElements + i;
	} else if (M >= 0) {
	// Element comes from V1; record which half of V1 it lives in.
	if (M >= SplitNumElements)
	UseHiV1 = true;
	else
	UseLoV1 = true;
	V1BlendMask[i] = M;
	BlendMask[i] = i;
	}
	}

	// Because the lowering happens after all combining takes place, we need to
	// manually combine these blend masks as much as possible so that we create
	// a minimal number of high-level vector shuffle nodes.

	// First try just blending the halves of V1 or V2.
	// No input used at all: the whole half is undef.
	if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
	return DAG.getUNDEF(SplitVT);
	// Only V1 used, or only V2 used: a single shuffle of its two halves.
	if (!UseLoV2 && !UseHiV2)
	return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
	if (!UseLoV1 && !UseHiV1)
	return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

	SDValue V1Blend, V2Blend;
	if (UseLoV1 && UseHiV1) {
	V1Blend =
	DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
	} else {
	// We only use half of V1 so map the usage down into the final blend mask.
	// Entries that currently select the V1 side are rewritten to index
	// directly into the single used half (rebased when that is the high half).
	V1Blend = UseLoV1 ? LoV1 : HiV1;
	for (int i = 0; i < SplitNumElements; ++i)
	if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
	BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
	}
	if (UseLoV2 && UseHiV2) {
	V2Blend =
	DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
	} else {
	// We only use half of V2 so map the usage down into the final blend mask.
	// The rewritten entries stay in the second-operand index range
	// [SplitNumElements, 2 * SplitNumElements).
	V2Blend = UseLoV2 ? LoV2 : HiV2;
	for (int i = 0; i < SplitNumElements; ++i)
	if (BlendMask[i] >= SplitNumElements)
	BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
	}
	return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
	};
	// Lower each half and glue the two results back together.
	SDValue Lo = HalfBlend(LoMask);
	SDValue Hi = HalfBlend(HiMask);
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
	}

	/// Choose between splitting a two-input shuffle and decomposing it into
	/// single-input shuffles followed by a blend.
	///
	/// Intended as a fallback for two-input shuffles that span more than one
	/// 128-bit lane. The decomposed form is used when each input contributes at
	/// most one distinct element (a blend of two broadcasts), the split form is
	/// used when each input is read from at most one 128-bit lane, and the
	/// decomposed form is used again for everything else.
	///
	/// \p V2 must not be undef (this is asserted).
	static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
	                                          SDValue V2, ArrayRef<int> Mask,
	                                          const X86Subtarget &Subtarget,
	                                          SelectionDAG &DAG) {
	  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
	                          "shuffles as it could then recurse on itself.");
	  const int NumElts = Mask.size();

	  // A shuffle that reads at most one distinct element from each input can be
	  // modeled as two broadcasts plus a blend. That form is preferred because
	  // broadcasts can often fold with memory operands.
	  auto IsBlendOfTwoSplats = [&]() -> bool {
	    // SplatIdx[0] / SplatIdx[1]: the element picked from V1 / V2, or -1 when
	    // that input has not been seen yet.
	    int SplatIdx[2] = {-1, -1};
	    for (int M : Mask) {
	      if (M < 0)
	        continue;
	      const bool FromV2 = M >= NumElts;
	      const int Idx = FromV2 ? M - NumElts : M;
	      int &Seen = SplatIdx[FromV2 ? 1 : 0];
	      if (Seen < 0)
	        Seen = Idx;
	      else if (Seen != Idx)
	        return false;
	    }
	    return true;
	  };
	  if (IsBlendOfTwoSplats())
	    return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
	                                                Subtarget, DAG);

	  // Record which 128-bit lanes of each input are read. When each input is
	  // read from at most one lane, splitting decomposes to unusually few
	  // instructions, so prefer it over blending.
	  const int NumLanes = VT.getSizeInBits() / 128;
	  const int EltsPerLane = NumElts / NumLanes;
	  SmallBitVector LanesRead[2];
	  for (SmallBitVector &Lanes : LanesRead)
	    Lanes.resize(NumLanes, false);
	  for (int M : Mask) {
	    if (M < 0)
	      continue;
	    LanesRead[M / NumElts][(M % NumElts) / EltsPerLane] = true;
	  }
	  const bool OneLanePerInput =
	      LanesRead[0].count() <= 1 && LanesRead[1].count() <= 1;
	  if (OneLanePerInput)
	    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);

	  // Everything else becomes decomposed shuffles and a blend. This relies on
	  // the decomposed single-input shuffles not being routed back here.
	  return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
	                                              DAG);
	}

	// Lower a v4f64 shuffle as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)):
	// two shuffles of (V1, V2) position the needed elements, and one SHUFP node
	// then selects between them.
	// TODO: Extend to support v8f32 (+ 512-bit shuffles).
	static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
	                                                 SDValue V1, SDValue V2,
	                                                 ArrayRef<int> Mask,
	                                                 SelectionDAG &DAG) {
	  assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");

	  // OperandMasks[0] builds the SHUFP left-hand operand (feeds even result
	  // elements), OperandMasks[1] the right-hand operand (feeds odd ones).
	  int OperandMasks[2][4] = {{-1, -1, -1, -1}, {-1, -1, -1, -1}};
	  unsigned Imm = 0;

	  // SHUFPD reads one LHS and one RHS element per lane, so once every source
	  // element sits in the right lane slot the final shuffle is always possible.
	  for (int Elt = 0; Elt != 4; ++Elt) {
	    const int Src = Mask[Elt];
	    if (Src >= 0) {
	      const int Parity = Src & 1;    // slot within the pair, also the imm bit
	      const int PairStart = Elt & ~1; // first index of the destination pair
	      OperandMasks[Elt & 1][PairStart + Parity] = Src;
	      Imm |= Parity << Elt;
	    }
	  }

	  SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, OperandMasks[0]);
	  SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, OperandMasks[1]);
	  return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
	                     DAG.getTargetConstant(Imm, DL, MVT::i8));
	}

	/// Lower a shuffle that crosses 128-bit lanes as two steps: first a shuffle
	/// that moves whole lanes, then a permute within each lane.
	///
	/// This is mainly useful when the in-lane permutes differ from lane to lane.
	/// Returns an empty SDValue when some destination lane would need elements
	/// from more than one source lane, or when the only non-identity in-lane
	/// permutes are in lanes sourced from the lowest lane of an input and all but
	/// one lane is an identity.
	///
	/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
	/// we should investigate merging them.
	static SDValue lowerShuffleAsLanePermuteAndPermute(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
	  const int EltCount = VT.getVectorNumElements();
	  const int LaneCount = VT.getSizeInBits() / 128;
	  const int LaneWidth = EltCount / LaneCount;

	  // LaneSource[D] is the source lane feeding destination lane D (undef until
	  // an element of D is seen). InLanePerm is the per-element permute that is
	  // applied after the lanes have been moved.
	  SmallVector<int, 4> LaneSource(LaneCount, SM_SentinelUndef);
	  SmallVector<int, 16> InLanePerm(EltCount, SM_SentinelUndef);

	  for (int Elt = 0; Elt != EltCount; ++Elt) {
	    const int Src = Mask[Elt];
	    if (Src >= 0) {
	      const int From = Src / LaneWidth;
	      const int To = Elt / LaneWidth;
	      // Every destination lane must draw from exactly one source lane.
	      if (!isUndefOrEqual(LaneSource[To], From))
	        return SDValue();
	      LaneSource[To] = From;
	      InLanePerm[Elt] = To * LaneWidth + Src % LaneWidth;
	    }
	  }

	  // Expand the lane assignment into a full element mask. All elements of a
	  // used lane are set, to avoid undef propagation.
	  SmallVector<int, 16> LaneShuffle(EltCount, SM_SentinelUndef);
	  for (int To = 0; To != LaneCount; ++To) {
	    const int From = LaneSource[To];
	    if (From < 0)
	      continue;
	    const int DstBase = To * LaneWidth;
	    const int SrcBase = From * LaneWidth;
	    for (int Off = 0; Off != LaneWidth; ++Off)
	      LaneShuffle[DstBase + Off] = SrcBase + Off;
	  }

	  // If we would only be shuffling a single lowest lane while every other
	  // lane is an identity, this lowering is not worthwhile.
	  // TODO - isShuffleMaskInputInPlace could be extended to something like this.
	  int IdentityLanes = 0;
	  bool TouchesOnlyLowestLane = true;
	  for (int Lane = 0; Lane != LaneCount; ++Lane) {
	    const int Base = Lane * LaneWidth;
	    if (isSequentialOrUndefInRange(InLanePerm, Base, LaneWidth, Base)) {
	      ++IdentityLanes;
	      continue;
	    }
	    // Source lane 0 is the lowest lane of V1, LaneCount the lowest of V2.
	    const int From = LaneSource[Lane];
	    if (From != 0 && From != LaneCount)
	      TouchesOnlyLowestLane = false;
	  }
	  if (TouchesOnlyLowestLane && IdentityLanes == LaneCount - 1)
	    return SDValue();

	  SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneShuffle);
	  return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT),
	                              InLanePerm);
	}

	/// Lower a 256-bit lane-crossing shuffle by pairing the source with a copy of
	/// itself whose two 128-bit lanes are swapped, so that the final shuffle of
	/// the pair is in-lane.
	///
	/// In the worst case this takes four instructions for a single-input
	/// cross-lane shuffle, which the original author notes is lower than any
	/// other fully general cross-lane strategy they were aware of. More specific
	/// shuffle patterns should be tried before reaching this routine.
	static SDValue lowerShuffleAsLanePermuteAndShuffle(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
	  // FIXME: This should probably be generalized for 512-bit vectors as well.
	  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
	  const int NumElts = Mask.size();
	  const int HalfElts = NumElts / 2;

	  // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)), but only when
	  // some element comes from outside the lower lane; otherwise a split is
	  // (probably) the better choice.
	  if (VT == MVT::v4f64 &&
	      llvm::any_of(Mask, [HalfElts](int M) { return M >= HalfElts; })) {
	    SDValue Shufp =
	        lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
	    if (Shufp)
	      return Shufp;
	  }

	  // Decide whether splitting is cheaper. Without AVX2 a source lane is only
	  // flagged when it supplies an element to the other lane; with AVX2 it is
	  // flagged whenever it is read at all. Unless both lanes end up flagged,
	  // split.
	  const bool HasAVX2 = Subtarget.hasAVX2();
	  bool LaneFlag[2] = {false, false};
	  for (int Elt = 0; Elt < NumElts; ++Elt) {
	    const int M = Mask[Elt];
	    if (M < 0)
	      continue;
	    const int SrcLane = (M % NumElts) / HalfElts;
	    if (HasAVX2 || SrcLane != Elt / HalfElts)
	      LaneFlag[SrcLane] = true;
	  }
	  if (!(LaneFlag[0] && LaneFlag[1]))
	    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);

	  // TODO - we could support shuffling V2 in the Flipped input.
	  assert(V2.isUndef() &&
	         "This last part of this routine only works on single input shuffles");

	  // Redirect every lane-crossing element to the second shuffle operand (the
	  // lane-swapped copy), at the same in-lane offset within the destination
	  // lane.
	  SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
	  for (int Elt = 0; Elt < NumElts; ++Elt) {
	    int &M = InLaneMask[Elt];
	    if (M < 0)
	      continue;
	    const int DstLane = Elt / HalfElts;
	    if ((M % NumElts) / HalfElts != DstLane)
	      M = (M % HalfElts) + (DstLane * HalfElts) + NumElts;
	  }
	  assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
	         "In-lane shuffle mask expected");

	  // Swap the two lanes of V1 through a 4 x 64-bit view, then do the final
	  // shuffle of V1 against that swapped copy.
	  const MVT WideVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
	  SDValue AsWide = DAG.getBitcast(WideVT, V1);
	  SDValue Swapped = DAG.getVectorShuffle(WideVT, DL, AsWide,
	                                         DAG.getUNDEF(WideVT), {2, 3, 0, 1});
	  SDValue Flipped = DAG.getBitcast(VT, Swapped);
	  return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
	}

	/// Handle lowering 2-lane 128-bit shuffles.
	///
	/// Treats the shuffle as a permutation of whole 128-bit halves and tries, in
	/// order: inserting the low half of V1 into a zero vector, a blend, a single
	/// 128-bit subvector insert, SHUF128 (when VLX is available) and finally a
	/// VPERM2X128 node.
	///
	/// \returns the lowered node, or an empty SDValue when the shuffle is unary on
	/// an AVX2 subtarget or when canWidenShuffleElements() rejects \p Mask.
	static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
	SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
	if (Subtarget.hasAVX2() && V2.isUndef())
	return SDValue();

	bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());

	// WidenedMask receives one entry per 128-bit half of the result.
	SmallVector<int, 4> WidenedMask;
	if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
	return SDValue();

	// Zeroable bits 0-1 describe the low half of the result, bits 2-3 the high
	// half; a half counts as zero only when both of its bits are set.
	bool IsLowZero = (Zeroable & 0x3) == 0x3;
	bool IsHighZero = (Zeroable & 0xc) == 0xc;

	// Try to use an insert into a zero vector.
	if (WidenedMask[0] == 0 && IsHighZero) {
	MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
	SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
	getZeroVector(VT, Subtarget, DAG, DL), LoV,
	DAG.getIntPtrConstant(0, DL));
	}

	// TODO: If minimizing size and one of the inputs is a zero vector and the
	// zero vector has only one use, we could use a VPERM2X128 to save the
	// instruction bytes needed to explicitly generate the zero vector.

	// Blends are faster and handle all the non-lane-crossing cases.
	if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	Subtarget, DAG))
	return Blend;

	// If either input operand is a zero vector, use VPERM2X128 because its mask
	// allows us to replace the zero input with an implicit zero.
	// Consequently the insert and SHUF128 attempts below are only made when
	// neither half of the result is zeroable.
	if (!IsLowZero && !IsHighZero) {
	// Check for patterns which can be matched with a single insert of a 128-bit
	// subvector.
	bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
	if (OnlyUsesV1 \|\| isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {

	// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
	// this will likely become vinsertf128 which can't fold a 256-bit memop.
	if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
	MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
	// The inserted subvector is the low half of V1 (<0,1,0,1>) or of V2
	// (<0,1,4,5>); it is placed at element 2, i.e. the high half of V1.
	SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
	OnlyUsesV1 ? V1 : V2,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
	DAG.getIntPtrConstant(2, DL));
	}
	}

	// Try to use SHUF128 if possible.
	// The check below requires the low result half to come from V1 and the
	// high result half from V2; each immediate bit then picks which 128-bit
	// half of that source is used.
	if (Subtarget.hasVLX()) {
	if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
	unsigned PermMask = ((WidenedMask[0] % 2) << 0) \|
	((WidenedMask[1] % 2) << 1);
	return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
	DAG.getTargetConstant(PermMask, DL, MVT::i8));
	}
	}
	}

	// Otherwise form a 128-bit permutation. After accounting for undefs,
	// convert the 64-bit shuffle mask selection values into 128-bit
	// selection bits by dividing the indexes by 2 and shifting into positions
	// defined by a vperm2*128 instruction's immediate control byte.

	// The immediate permute control byte looks like this:
	// [1:0] - select 128 bits from sources for low half of destination
	// [2] - ignore
	// [3] - zero low half of destination
	// [5:4] - select 128 bits from sources for high half of destination
	// [6] - ignore
	// [7] - zero high half of destination

	assert((WidenedMask[0] >= 0 \|\| IsLowZero) &&
	(WidenedMask[1] >= 0 \|\| IsHighZero) && "Undef half?");

	// A zeroable half sets its "zero" bit instead of a source selector.
	unsigned PermMask = 0;
	PermMask \|= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
	PermMask \|= IsHighZero ? 0x80 : (WidenedMask[1] << 4);

	// Check the immediate mask and replace unused sources with undef.
	// Within each nibble, bit 1 distinguishes V2 (set) from V1 (clear) and bit 3
	// zeroes the half. V1 is unused when neither half is a non-zeroed V1
	// selection; V2 is unused when neither half is a non-zeroed V2 selection.
	if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
	V1 = DAG.getUNDEF(VT);
	if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
	V2 = DAG.getUNDEF(VT);

	return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
	DAG.getTargetConstant(PermMask, DL, MVT::i8));
	}

	/// Lower a vector shuffle by first fixing the 128-bit lanes and then
	/// shuffling each lane.
	///
	/// This attempts to create a repeated lane shuffle where each lane uses one
	/// or two of the lanes of the inputs. The lanes of the input vectors are
	/// shuffled in one or two independent shuffles to get the lanes into the
	/// position needed by the final shuffle.
	///
	/// \returns the final shuffle of the two lane-permuted vectors, or an empty
	/// SDValue when \p Mask is already lane-repeated, when a result lane needs
	/// more than two source lanes, when the lanes cannot agree on one repeated
	/// mask, when a result lane has no source at all, or when one of the lane
	/// permutes comes back as a shuffle with the original mask.
	static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	assert(!V2.isUndef() && "This is only useful with multiple inputs.");

	if (is128BitLaneRepeatedShuffleMask(VT, Mask))
	return SDValue();

	int NumElts = Mask.size();
	int NumLanes = VT.getSizeInBits() / 128;
	int NumLaneElts = 128 / VT.getScalarSizeInBits();
	// RepeatMask is the per-lane mask shared by all lanes; LaneSrcs[L] holds the
	// (up to two) source lane indices feeding result lane L, -1 when unused.
	SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
	SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});

	// First pass will try to fill in the RepeatMask from lanes that need two
	// sources.
	for (int Lane = 0; Lane != NumLanes; ++Lane) {
	int Srcs[2] = {-1, -1};
	SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
	for (int i = 0; i != NumLaneElts; ++i) {
	int M = Mask[(Lane * NumLaneElts) + i];
	if (M < 0)
	continue;
	// Determine which of the possible input lanes (NumLanes from each source)
	// this element comes from. Assign that as one of the sources for this
	// lane. We can assign up to 2 sources for this lane. If we run out
	// sources we can't do anything.
	int LaneSrc = M / NumLaneElts;
	int Src;
	if (Srcs[0] < 0 \|\| Srcs[0] == LaneSrc)
	Src = 0;
	else if (Srcs[1] < 0 \|\| Srcs[1] == LaneSrc)
	Src = 1;
	else
	return SDValue();

	Srcs[Src] = LaneSrc;
	// Record the element relative to its lane; entries that use this lane's
	// second source are offset by NumElts.
	InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
	}

	// If this lane has two sources, see if it fits with the repeat mask so far.
	if (Srcs[1] < 0)
	continue;

	LaneSrcs[Lane][0] = Srcs[0];
	LaneSrcs[Lane][1] = Srcs[1];

	// Two masks match when every position defined in both holds the same value.
	auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
	assert(M1.size() == M2.size() && "Unexpected mask size");
	for (int i = 0, e = M1.size(); i != e; ++i)
	if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
	return false;
	return true;
	};

	// Copy every defined entry of Mask into MergedMask.
	auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
	assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
	for (int i = 0, e = MergedMask.size(); i != e; ++i) {
	int M = Mask[i];
	if (M < 0)
	continue;
	assert((MergedMask[i] < 0 \|\| MergedMask[i] == M) &&
	"Unexpected mask element");
	MergedMask[i] = M;
	}
	};

	if (MatchMasks(InLaneMask, RepeatMask)) {
	// Merge this lane mask into the final repeat mask.
	MergeMasks(InLaneMask, RepeatMask);
	continue;
	}

	// Didn't find a match. Swap the operands and try again.
	std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
	ShuffleVectorSDNode::commuteMask(InLaneMask);

	if (MatchMasks(InLaneMask, RepeatMask)) {
	// Merge this lane mask into the final repeat mask.
	MergeMasks(InLaneMask, RepeatMask);
	continue;
	}

	// Couldn't find a match with the operands in either order.
	return SDValue();
	}

	// Now handle any lanes with only one source.
	for (int Lane = 0; Lane != NumLanes; ++Lane) {
	// If this lane has already been processed, skip it.
	if (LaneSrcs[Lane][0] >= 0)
	continue;

	for (int i = 0; i != NumLaneElts; ++i) {
	int M = Mask[(Lane * NumLaneElts) + i];
	if (M < 0)
	continue;

	// If RepeatMask isn't defined yet we can define it ourself.
	if (RepeatMask[i] < 0)
	RepeatMask[i] = M % NumLaneElts;

	// The repeat mask entry decides which of the two permuted vectors must
	// carry this lane's source: entries below NumElts read the first one,
	// the rest read the second one.
	if (RepeatMask[i] < NumElts) {
	if (RepeatMask[i] != M % NumLaneElts)
	return SDValue();
	LaneSrcs[Lane][0] = M / NumLaneElts;
	} else {
	if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
	return SDValue();
	LaneSrcs[Lane][1] = M / NumLaneElts;
	}
	}

	// No source was assigned to this lane in either position.
	if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
	return SDValue();
	}

	// Build the first lane permute: result lane L receives the whole source lane
	// LaneSrcs[L][0], or stays undef when that slot is unused.
	SmallVector<int, 16> NewMask(NumElts, -1);
	for (int Lane = 0; Lane != NumLanes; ++Lane) {
	int Src = LaneSrcs[Lane][0];
	for (int i = 0; i != NumLaneElts; ++i) {
	int M = -1;
	if (Src >= 0)
	M = Src * NumLaneElts + i;
	NewMask[Lane * NumLaneElts + i] = M;
	}
	}
	SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
	// Ensure we didn't get back the shuffle we started with.
	// FIXME: This is a hack to make up for some splat handling code in
	// getVectorShuffle.
	if (isa<ShuffleVectorSDNode>(NewV1) &&
	cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
	return SDValue();

	// Build the second lane permute the same way from LaneSrcs[L][1].
	for (int Lane = 0; Lane != NumLanes; ++Lane) {
	int Src = LaneSrcs[Lane][1];
	for (int i = 0; i != NumLaneElts; ++i) {
	int M = -1;
	if (Src >= 0)
	M = Src * NumLaneElts + i;
	NewMask[Lane * NumLaneElts + i] = M;
	}
	}
	SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
	// Ensure we didn't get back the shuffle we started with.
	// FIXME: This is a hack to make up for some splat handling code in
	// getVectorShuffle.
	if (isa<ShuffleVectorSDNode>(NewV2) &&
	cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
	return SDValue();

	// Replicate RepeatMask into every lane. Adding the lane base keeps each
	// element inside its own lane; undefined entries stay undefined.
	for (int i = 0; i != NumElts; ++i) {
	NewMask[i] = RepeatMask[i % NumLaneElts];
	if (NewMask[i] < 0)
	continue;

	NewMask[i] += (i / NumLaneElts) * NumLaneElts;
	}
	return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
	}

	/// Try to narrow a shuffle whose result has exactly one undef half.
	///
	/// On success, \p HalfMask holds a mask of half the width of \p Mask that
	/// indexes into at most two half-vectors of the original operands, and
	/// \p HalfIdx1 / \p HalfIdx2 identify those half-vectors (0 = lower V1,
	/// 1 = upper V1, 2 = lower V2, 3 = upper V2, -1 = unused). HalfMask entries
	/// that refer to the second half-vector are offset by the half width.
	///
	/// \returns false if both or neither half of the result is undef, or if the
	/// defined half references more than two half-vectors. In the latter case the
	/// outputs may already have been partially written.
	static bool
	getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
	                   int &HalfIdx1, int &HalfIdx2) {
	  assert((Mask.size() == HalfMask.size() * 2) &&
	         "Expected input mask to be twice as long as output");

	  // Narrowing only works when one half is undef and the other is not.
	  const bool LowerIsUndef = isUndefLowerHalf(Mask);
	  const bool UpperIsUndef = isUndefUpperHalf(Mask);
	  if (LowerIsUndef == UpperIsUndef)
	    return false;

	  const unsigned NumHalfElts = HalfMask.size();
	  // Index of the first element of the defined half within Mask.
	  const unsigned FirstElt = LowerIsUndef ? NumHalfElts : 0;
	  HalfIdx1 = -1;
	  HalfIdx2 = -1;

	  // A slot can take a half-vector if it is free or already holds that one.
	  auto SlotTakes = [](int Slot, int Half) {
	    if (Slot < 0)
	      return true;
	    return Slot == Half;
	  };

	  for (unsigned Pos = 0; Pos != NumHalfElts; ++Pos) {
	    const int Elt = Mask[FirstElt + Pos];
	    if (Elt < 0) {
	      // Negative (sentinel) entries are copied through unchanged.
	      HalfMask[Pos] = Elt;
	      continue;
	    }

	    // Which of the four half-vectors the element lives in, and where.
	    const int SrcHalf = Elt / NumHalfElts;
	    const int EltInHalf = Elt % NumHalfElts;

	    if (SlotTakes(HalfIdx1, SrcHalf)) {
	      HalfIdx1 = SrcHalf;
	      HalfMask[Pos] = EltInHalf;
	    } else if (SlotTakes(HalfIdx2, SrcHalf)) {
	      HalfIdx2 = SrcHalf;
	      HalfMask[Pos] = EltInHalf + NumHalfElts;
	    } else {
	      // A third distinct half-vector would be required.
	      return false;
	    }
	  }

	  return true;
	}

	/// Build the narrowed shuffle described by getHalfShuffleMask()'s outputs.
	///
	/// Extracts the half-vectors selected by \p HalfIdx1 and \p HalfIdx2 (a
	/// negative index yields an undef half), shuffles them with \p HalfMask at
	/// half width, and widens the result back to the operand type. The shuffled
	/// half lands in the upper half when \p UndefLower is set and in the lower
	/// half otherwise; the remaining half is undef. With \p UseConcat the
	/// widening is a CONCAT_VECTORS, otherwise an INSERT_SUBVECTOR into undef.
	static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
	                                     ArrayRef<int> HalfMask, int HalfIdx1,
	                                     int HalfIdx2, bool UndefLower,
	                                     SelectionDAG &DAG, bool UseConcat = false) {
	  assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
	  assert(V1.getValueType().isSimple() && "Expecting only simple types");

	  MVT WideVT = V1.getSimpleValueType();
	  MVT NarrowVT = WideVT.getHalfNumVectorElementsVT();
	  const unsigned NarrowNumElts = NarrowVT.getVectorNumElements();

	  // Indices 0/1 name the lower/upper half of V1, 2/3 those of V2.
	  auto ExtractHalf = [&](int Which) -> SDValue {
	    if (Which < 0)
	      return DAG.getUNDEF(NarrowVT);
	    SDValue Source = Which < 2 ? V1 : V2;
	    const int FirstElt = (Which % 2) * NarrowNumElts;
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowVT, Source,
	                       DAG.getIntPtrConstant(FirstElt, DL));
	  };

	  // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
	  SDValue FirstHalf = ExtractHalf(HalfIdx1);
	  SDValue SecondHalf = ExtractHalf(HalfIdx2);
	  SDValue Narrow =
	      DAG.getVectorShuffle(NarrowVT, DL, FirstHalf, SecondHalf, HalfMask);

	  if (UseConcat) {
	    SDValue UndefHalf = DAG.getUNDEF(NarrowVT);
	    if (UndefLower)
	      return DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, UndefHalf, Narrow);
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, Narrow, UndefHalf);
	  }

	  const unsigned InsertAt = UndefLower ? NarrowNumElts : 0;
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, DAG.getUNDEF(WideVT),
	                     Narrow, DAG.getIntPtrConstant(InsertAt, DL));
	}

	/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
	/// This allows for fast cases such as subvector extraction/insertion
	/// or shuffling smaller vector types which can lower more efficiently.
	///
	/// \returns the lowered node, or an empty SDValue when neither half of the
	/// result is undef, when getHalfShuffleMask() fails, or when the checks below
	/// decide that splitting into half-width operations is not worthwhile for the
	/// given subtarget and element width.
	static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
	SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert((VT.is256BitVector() \|\| VT.is512BitVector()) &&
	"Expected 256-bit or 512-bit vector");

	bool UndefLower = isUndefLowerHalf(Mask);
	if (!UndefLower && !isUndefUpperHalf(Mask))
	return SDValue();

	assert((!UndefLower \|\| !isUndefUpperHalf(Mask)) &&
	"Completely undef shuffle mask should have been simplified already");

	// Upper half is undef and lower half is whole upper subvector.
	// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
	MVT HalfVT = VT.getHalfNumVectorElementsVT();
	unsigned HalfNumElts = HalfVT.getVectorNumElements();
	if (!UndefLower &&
	isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
	SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
	DAG.getIntPtrConstant(HalfNumElts, DL));
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
	DAG.getIntPtrConstant(0, DL));
	}

	// Lower half is undef and upper half is whole lower subvector.
	// e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
	if (UndefLower &&
	isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
	// NOTE: despite its name, Hi holds the subvector extracted at index 0 (the
	// lower half of V1); it is inserted into the upper half of the result.
	SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
	DAG.getIntPtrConstant(HalfNumElts, DL));
	}

	int HalfIdx1, HalfIdx2;
	SmallVector<int, 8> HalfMask(HalfNumElts);
	if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
	return SDValue();

	assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

	// Only shuffle the halves of the inputs when useful.
	// Half indices 0 and 2 are the lower halves of V1 and V2, 1 and 3 the upper
	// halves; count how many of each kind the narrowed shuffle reads.
	unsigned NumLowerHalves =
	(HalfIdx1 == 0 \|\| HalfIdx1 == 2) + (HalfIdx2 == 0 \|\| HalfIdx2 == 2);
	unsigned NumUpperHalves =
	(HalfIdx1 == 1 \|\| HalfIdx1 == 3) + (HalfIdx2 == 1 \|\| HalfIdx2 == 3);
	assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");

	// Determine the larger pattern of undef/halves, then decide if it's worth
	// splitting the shuffle based on subtarget capabilities and types.
	unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
	if (!UndefLower) {
	// XXXXuuuu: no insert is needed.
	// Always extract lowers when setting lower - these are all free subreg ops.
	if (NumUpperHalves == 0)
	return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
	UndefLower, DAG);

	if (NumUpperHalves == 1) {
	// AVX2 has efficient 32/64-bit element cross-lane shuffles.
	if (Subtarget.hasAVX2()) {
	// extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
	if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
	!is128BitUnpackShuffleMask(HalfMask) &&
	(!isSingleSHUFPSMask(HalfMask) \|\|
	Subtarget.hasFastVariableShuffle()))
	return SDValue();
	// If this is a unary shuffle (assume that the 2nd operand is
	// canonicalized to undef), then we can use vpermpd. Otherwise, we
	// are better off extracting the upper half of 1 operand and using a
	// narrow shuffle.
	if (EltWidth == 64 && V2.isUndef())
	return SDValue();
	}
	// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
	if (Subtarget.hasAVX512() && VT.is512BitVector())
	return SDValue();
	// Extract + narrow shuffle is better than the wide alternative.
	return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
	UndefLower, DAG);
	}

	// Don't extract both uppers, instead shuffle and then extract.
	assert(NumUpperHalves == 2 && "Half vector count went wrong");
	return SDValue();
	}

	// UndefLower - uuuuXXXX: an insert to high half is required if we split this.
	if (NumUpperHalves == 0) {
	// AVX2 has efficient 64-bit element cross-lane shuffles.
	// TODO: Refine to account for unary shuffle, splat, and other masks?
	if (Subtarget.hasAVX2() && EltWidth == 64)
	return SDValue();
	// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
	if (Subtarget.hasAVX512() && VT.is512BitVector())
	return SDValue();
	// Narrow shuffle + insert is better than the wide alternative.
	return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
	UndefLower, DAG);
	}

	// NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
	return SDValue();
	}

	/// Check whether one shuffle input needs no permutation under \p Mask.
	///
	/// \p Input selects the operand (0 or 1). The result is true when every mask
	/// entry that reads from that operand reads the element at its own position,
	/// i.e. the operand's elements are already in the slots the mask wants.
	/// Negative entries and entries that read the other operand are ignored.
	static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
	  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");

	  const int NumElts = Mask.size();
	  for (int Slot = 0; Slot != NumElts; ++Slot) {
	    const int M = Mask[Slot];
	    // Negative entries reference no input element.
	    if (M < 0)
	      continue;
	    // Entries in [0, NumElts) read operand 0, the next NumElts read operand 1.
	    if (M / NumElts != Input)
	      continue;
	    if (M % NumElts != Slot)
	      return false;
	  }

	  return true;
	}

	/// Handle case where shuffle sources are coming from the same 128-bit lane and
	/// every lane can be represented as the same repeating mask - allowing us to
	/// shuffle the sources with the repeating shuffle and then permute the result
	/// to the destination lanes.
	///
	/// On AVX2 a cheaper form is tried first: shuffle the wanted elements into the
	/// lowest positions and broadcast them.
	///
	/// \returns the lowered two-stage shuffle, or an empty SDValue when the mask
	/// does not cross 128-bit lanes, is already lane-repeated, has a destination
	/// sub-lane that reads from more than one source lane, or needs more distinct
	/// sub-lane masks than are available.
	static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	int NumElts = VT.getVectorNumElements();
	int NumLanes = VT.getSizeInBits() / 128;
	int NumLaneElts = NumElts / NumLanes;

	// On AVX2 we may be able to just shuffle the lowest elements and then
	// broadcast the result.
	if (Subtarget.hasAVX2()) {
	for (unsigned BroadcastSize : {16, 32, 64}) {
	// Only broadcast units wider than a single element are considered.
	if (BroadcastSize <= VT.getScalarSizeInBits())
	continue;
	int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

	// Attempt to match a repeating pattern every NumBroadcastElts,
	// accounting for UNDEFs but only references the lowest 128-bit
	// lane of the inputs.
	auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
	for (int i = 0; i != NumElts; i += NumBroadcastElts)
	for (int j = 0; j != NumBroadcastElts; ++j) {
	int M = Mask[i + j];
	if (M < 0)
	continue;
	int &R = RepeatMask[j];
	// Reject elements that are not in lane 0 of their source vector.
	if (0 != ((M % NumElts) / NumLaneElts))
	return false;
	// Reject a conflict with what an earlier group put at position j.
	if (0 <= R && R != M)
	return false;
	R = M;
	}
	return true;
	};

	SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
	if (!FindRepeatingBroadcastMask(RepeatMask))
	continue;

	// Shuffle the (lowest) repeated elements in place for broadcast.
	SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

	// Shuffle the actual broadcast.
	SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
	for (int i = 0; i != NumElts; i += NumBroadcastElts)
	for (int j = 0; j != NumBroadcastElts; ++j)
	BroadcastMask[i + j] = j;
	return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
	BroadcastMask);
	}
	}

	// Bail if the shuffle mask doesn't cross 128-bit lanes.
	if (!is128BitLaneCrossingShuffleMask(VT, Mask))
	return SDValue();

	// Bail if we already have a repeated lane shuffle mask.
	SmallVector<int, 8> RepeatedShuffleMask;
	if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
	return SDValue();

	// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
	// (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
	int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
	int NumSubLanes = NumLanes * SubLaneScale;
	int NumSubLaneElts = NumLaneElts / SubLaneScale;

	// Check that all the sources are coming from the same lane and see if we can
	// form a repeating shuffle mask (local to each sub-lane). At the same time,
	// determine the source sub-lane for each destination sub-lane.
	// Only the first SubLaneScale entries of RepeatedSubLaneMasks are used: one
	// candidate mask per sub-lane position within a 128-bit lane.
	int TopSrcSubLane = -1;
	SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
	SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
	SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
	SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};

	for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
	// Extract the sub-lane mask, check that it all comes from the same lane
	// and normalize the mask entries to come from the first lane.
	int SrcLane = -1;
	SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
	for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
	int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
	if (M < 0)
	continue;
	int Lane = (M % NumElts) / NumLaneElts;
	if ((0 <= SrcLane) && (SrcLane != Lane))
	return SDValue();
	SrcLane = Lane;
	// Keep the V1/V2 distinction (offset NumElts) but drop the lane offset.
	int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
	SubLaneMask[Elt] = LocalM;
	}

	// Whole sub-lane is UNDEF.
	if (SrcLane < 0)
	continue;

	// Attempt to match against the candidate repeated sub-lane masks.
	for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
	// Masks match when every position defined in both holds the same value.
	auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
	for (int i = 0; i != NumSubLaneElts; ++i) {
	if (M1[i] < 0 \|\| M2[i] < 0)
	continue;
	if (M1[i] != M2[i])
	return false;
	}
	return true;
	};

	auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
	if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
	continue;

	// Merge the sub-lane mask into the matching repeated sub-lane mask.
	for (int i = 0; i != NumSubLaneElts; ++i) {
	int M = SubLaneMask[i];
	if (M < 0)
	continue;
	assert((RepeatedSubLaneMask[i] < 0 \|\| RepeatedSubLaneMask[i] == M) &&
	"Unexpected mask element");
	RepeatedSubLaneMask[i] = M;
	}

	// Track the top most source sub-lane - by setting the remaining to UNDEF
	// we can greatly simplify shuffle matching.
	int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
	TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
	Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
	break;
	}

	// Bail if we failed to find a matching repeated sub-lane mask.
	if (Dst2SrcSubLanes[DstSubLane] < 0)
	return SDValue();
	}
	assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
	"Unexpected source lane");

	// Create a repeating shuffle mask for the entire vector.
	// Sub-lanes above TopSrcSubLane are left undef.
	SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
	for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
	int Lane = SubLane / SubLaneScale;
	auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
	for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
	int M = RepeatedSubLaneMask[Elt];
	if (M < 0)
	continue;
	int Idx = (SubLane * NumSubLaneElts) + Elt;
	RepeatedMask[Idx] = M + (Lane * NumLaneElts);
	}
	}
	SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);

	// Shuffle each source sub-lane to its destination.
	SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
	for (int i = 0; i != NumElts; i += NumSubLaneElts) {
	int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
	if (SrcSubLane < 0)
	continue;
	for (int j = 0; j != NumSubLaneElts; ++j)
	SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
	}

	return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
	SubLaneMask);
	}

	/// Try to match \p Mask against the SHUFPD pattern for a 64-bit element type.
	///
	/// Result element i may take either element of the pair starting at (i & ~1)
	/// from operand (i & 1); e.g. for v4f64 the per-position choices are
	/// 0/1, 4/5, 2/3, 6/7. The same pattern with the operands exchanged is also
	/// accepted, in which case \p V1 and \p V2 are swapped.
	///
	/// On success \p ShuffleImm holds one bit per matched element (set when the
	/// odd element of the pair is selected), and \p ForceV1Zero / \p ForceV2Zero
	/// report whether all even / all odd result positions are zeroable.
	/// \p ShuffleImm is written even when the match fails.
	static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
	                                   bool &ForceV1Zero, bool &ForceV2Zero,
	                                   unsigned &ShuffleImm, ArrayRef<int> Mask,
	                                   const APInt &Zeroable) {
	  const int NumElts = VT.getVectorNumElements();
	  assert(VT.getScalarSizeInBits() == 64 &&
	         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
	         "Unexpected data type for VSHUFPD");
	  assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
	         "Illegal shuffle mask");

	  // Index 0 tracks the even result positions, index 1 the odd ones; an entry
	  // stays true only while every position of that parity is zeroable.
	  bool ParityIsZero[2] = {true, true};
	  for (int Elt = 0; Elt != NumElts; ++Elt)
	    if (!Zeroable[Elt])
	      ParityIsZero[Elt & 1] = false;

	  ShuffleImm = 0;
	  bool MatchesDirect = true;
	  bool MatchesCommuted = true;
	  for (int Elt = 0; Elt != NumElts; ++Elt) {
	    const int M = Mask[Elt];
	    const int Parity = Elt & 1;
	    if (M == SM_SentinelUndef || ParityIsZero[Parity])
	      continue;
	    // Any other negative sentinel cannot be produced here.
	    if (M < 0)
	      return false;

	    // First element of the permitted pair, in either operand order.
	    const int DirectBase = (Elt & 6) + NumElts * Parity;
	    const int CommutedBase = (Elt & 0xe) + NumElts * (Parity ^ 1);
	    const bool InDirectPair = DirectBase <= M && M <= DirectBase + 1;
	    const bool InCommutedPair = CommutedBase <= M && M <= CommutedBase + 1;
	    if (!InDirectPair)
	      MatchesDirect = false;
	    if (!InCommutedPair)
	      MatchesCommuted = false;

	    ShuffleImm |= (M % 2) << Elt;
	  }

	  if (!MatchesDirect) {
	    if (!MatchesCommuted)
	      return false;
	    // Only the commuted form fits: exchange the operands.
	    std::swap(V1, V2);
	  }

	  ForceV1Zero = ParityIsZero[0];
	  ForceV2Zero = ParityIsZero[1];
	  return true;
	}

	/// Lower a v2f64/v4f64/v8f64 shuffle to a single X86ISD::SHUFP node when
	/// matchShuffleWithSHUFPD() accepts \p Mask.
	///
	/// Operands that the matcher reports as fully zeroable are replaced by an
	/// explicit zero vector. \returns an empty SDValue when the mask does not
	/// match.
	static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
	                                      SDValue V2, ArrayRef<int> Mask,
	                                      const APInt &Zeroable,
	                                      const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
	         "Unexpected data type for VSHUFPD");

	  bool ZeroFirst = false;
	  bool ZeroSecond = false;
	  unsigned Imm = 0;
	  // The matcher may swap V1 and V2 in place.
	  const bool Matched = matchShuffleWithSHUFPD(VT, V1, V2, ZeroFirst, ZeroSecond,
	                                              Imm, Mask, Zeroable);
	  if (!Matched)
	    return SDValue();

	  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
	  if (ZeroFirst)
	    V1 = getZeroVector(VT, Subtarget, DAG, DL);
	  if (ZeroSecond)
	    V2 = getZeroVector(VT, Subtarget, DAG, DL);

	  SDValue ImmOp = DAG.getTargetConstant(Imm, DL, MVT::i8);
	  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, ImmOp);
	}

	// Match a v32i8 shuffle whose first 8 elements are {0, 8, 16, 24, 32, 40, 48,
	// 56} (undef allowed) and whose remaining 24 elements are all zeroable.
	// Such a shuffle is lowered as two X86ISD::VTRUNC nodes (one per input,
	// viewed as v4i64) whose results are interleaved and then widened into a zero
	// v32i8 vector. Returns an empty SDValue when the mask does not match.
	static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
	                                             SDValue V1, SDValue V2,
	                                             ArrayRef<int> Mask,
	                                             const APInt &Zeroable,
	                                             SelectionDAG &DAG) {
	  assert(VT == MVT::v32i8 && "Unexpected type!");

	  // The first 8 indices should be every 8th element.
	  const bool LowIsEveryEighth = isSequentialOrUndefInRange(Mask, 0, 8, 0, 8);
	  if (!LowIsEveryEighth)
	    return SDValue();

	  // Remaining elements need to be zeroable.
	  const auto NumRemainingElts = Mask.size() - 8;
	  if (Zeroable.countLeadingOnes() < NumRemainingElts)
	    return SDValue();

	  SDValue Wide1 = DAG.getBitcast(MVT::v4i64, V1);
	  SDValue Wide2 = DAG.getBitcast(MVT::v4i64, V2);

	  SDValue Trunc1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, Wide1);
	  SDValue Trunc2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, Wide2);

	  // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
	  // the upper bits of the result using an unpckldq.
	  const int UnpackLoDWords[] = {0, 1, 2, 3, 16, 17, 18, 19,
	                                4, 5, 6, 7, 20, 21, 22, 23};
	  SDValue Interleaved =
	      DAG.getVectorShuffle(MVT::v16i8, DL, Trunc1, Trunc2, UnpackLoDWords);

	  // Insert the unpckldq into a zero vector to widen to v32i8.
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
	                     DAG.getConstant(0, DL, MVT::v32i8), Interleaved,
	                     DAG.getIntPtrConstant(0, DL));
	}


	/// Handle lowering of 4-lane 64-bit floating point shuffles.
	///
	/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
	/// isn't available.
	///
	/// The lowering strategies below are tried in order and the first one that
	/// produces a node wins. Unary shuffles (\p V2 undef) are fully handled by
	/// the first branch; two-input shuffles fall through to the remaining
	/// strategies and end in a generic decomposed blend or split.
	static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
	Subtarget, DAG))
	return V;

	if (V2.isUndef()) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	// Use low duplicate instructions for masks that match their pattern.
	if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
	return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

	if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
	// Non-half-crossing single input shuffles can be lowered with an
	// interleaved permutation.
	// Bit i of the immediate is set when result element i takes the odd
	// element of its own 128-bit lane (index 1 in the low lane, 3 in the
	// high lane).
	unsigned VPERMILPMask = (Mask[0] == 1) \| ((Mask[1] == 1) << 1) \|
	((Mask[2] == 3) << 2) \| ((Mask[3] == 3) << 3);
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
	DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
	}

	// With AVX2 we have direct support for this permutation.
	if (Subtarget.hasAVX2())
	return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Try to permute the lanes and then use a per-lane permute.
	if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
	Mask, DAG, Subtarget))
	return V;

	// Otherwise, fall back.
	return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
	DAG, Subtarget);
	}

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
	return V;

	if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check if the blend happens to exactly fit that of SHUFPD.
	if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Op;

	// If we have lane crossing shuffles AND they don't all come from the lower
	// lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
	// TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
	// canonicalize to a blend of splat which isn't necessary for this combine.
	if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
	!all_of(Mask, [](int M) { return M < 2 \|\| (4 <= M && M < 6); }) &&
	(V1.getOpcode() != ISD::BUILD_VECTOR) &&
	(V2.getOpcode() != ISD::BUILD_VECTOR))
	if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
	Mask, DAG))
	return Op;

	// If we have one input in place, then we can permute the other input and
	// blend the result.
	if (isShuffleMaskInputInPlace(0, Mask) \|\| isShuffleMaskInputInPlace(1, Mask))
	return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
	Subtarget, DAG);

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle. However, if we have AVX2 and either inputs are already in place,
	// we will be able to shuffle even across lanes the other input in a single
	// instruction so skip this pattern.
	if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) \|\|
	isShuffleMaskInputInPlace(1, Mask))))
	if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return V;

	// If we have VLX support, we can use VEXPAND.
	if (Subtarget.hasVLX())
	if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
	DAG, Subtarget))
	return V;

	// If we have AVX2 then we always want to lower with a blend because on v4 we
	// can fully permute the elements.
	if (Subtarget.hasAVX2())
	return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
	Subtarget, DAG);

	// Otherwise fall back on generic lowering.
	return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
	Subtarget, DAG);
	}

	/// Handle lowering of 4-lane 64-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v4i64 shuffling.
	static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
	assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");

	if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
	Subtarget, DAG))
	return V;

	if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
	Subtarget, DAG))
	return Broadcast;

	if (V2.isUndef()) {
	// When the shuffle is mirrored between the 128-bit lanes of the unit, we
	// can use lower latency instructions that will operate on both lanes.
	SmallVector<int, 2> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
	SmallVector<int, 4> PSHUFDMask;
	narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
	return DAG.getBitcast(
	MVT::v4i64,
	DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
	DAG.getBitcast(MVT::v8i32, V1),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
	}

	// AVX2 provides a direct instruction for permuting a single input across
	// lanes.
	return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
	}

	// Try to use shift instructions.
	if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// If we have VLX support, we can use VALIGN or VEXPAND.
	if (Subtarget.hasVLX()) {
	if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
	Subtarget, DAG))
	return Rotate;

	if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
	DAG, Subtarget))
	return V;
	}

	// Try to use PALIGNR.
	if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
	Subtarget, DAG))
	return Rotate;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
	return V;

	// If we have one input in place, then we can permute the other input and
	// blend the result.
	if (isShuffleMaskInputInPlace(0, Mask) \|\| isShuffleMaskInputInPlace(1, Mask))
	return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
	Subtarget, DAG);

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle. However, if we have AVX2 and either inputs are already in place,
	// we will be able to shuffle even across lanes the other input in a single
	// instruction so skip this pattern.
	if (!isShuffleMaskInputInPlace(0, Mask) &&
	!isShuffleMaskInputInPlace(1, Mask))
	if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
	DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// Otherwise fall back on generic blend lowering.
	return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
	Subtarget, DAG);
	}

	/// Lower a 256-bit shuffle of eight 32-bit floating point elements.
	///
	/// 8-lane 32-bit integer shuffles are also funnelled through this routine
	/// when AVX2 isn't available.
	///
	/// Candidate lowerings are tried in a fixed priority order; the first one
	/// that produces a node is returned.
	static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	  // A direct blend has top priority.
	  if (SDValue Blended = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
	                                            Zeroable, Subtarget, DAG)) {
	    return Blended;
	  }

	  // Next, check whether a single element is being broadcast.
	  if (SDValue Splat = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
	                                              Subtarget, DAG)) {
	    return Splat;
	  }

	  // A mask that repeats in each 128-bit lane gives us many more efficient
	  // options; this branch always produces a result.
	  SmallVector<int, 4> LaneMask;
	  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, LaneMask)) {
	    assert(LaneMask.size() == 4 &&
	           "Repeated masks must be half the mask width!");

	    // Even / odd duplicate instructions for the masks that match them.
	    if (isShuffleEquivalent(V1, V2, LaneMask, {0, 0, 2, 2})) {
	      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
	    }
	    if (isShuffleEquivalent(V1, V2, LaneMask, {1, 1, 3, 3})) {
	      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
	    }

	    // Single input: an immediate-controlled VPERMILPI is enough.
	    if (V2.isUndef()) {
	      SDValue Imm = getV4X86ShuffleImm8ForMask(LaneMask, DL, DAG);
	      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, Imm);
	    }

	    // Dedicated unpack instructions for masks that match their pattern.
	    if (SDValue Unpacked =
	            lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) {
	      return Unpacked;
	    }

	    // Last resort within this branch: a SHUFPS sequence. This relies on
	    // direct blends having been handled already.
	    return lowerShuffleWithSHUFPS(DL, MVT::v8f32, LaneMask, V1, V2, DAG);
	  }

	  // Try building an in-lane repeating shuffle and then moving the results
	  // into their target lanes.
	  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) {
	    return Permuted;
	  }

	  // Single-input shuffles whose two 128-bit lanes use different patterns.
	  if (V2.isUndef()) {
	    // Without lane crossing, a variable-mask VPERMILPS does the job.
	    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
	      SDValue InLaneCtl = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
	      return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, InLaneCtl);
	    }
	    // With AVX2 a cross-lane variable permute node is used instead.
	    if (Subtarget.hasAVX2()) {
	      SDValue CrossLaneCtl = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
	      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, CrossLaneCtl, V1);
	    }
	    // Otherwise, fall back.
	    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
	                                               DAG, Subtarget);
	  }

	  // Try merging 128-bit lanes so that a lane-based shuffle becomes possible.
	  if (SDValue Merged = lowerShuffleAsLanePermuteAndRepeatedMask(
	          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) {
	    return Merged;
	  }

	  // With VLX support, VEXPAND is available.
	  if (Subtarget.hasVLX()) {
	    if (SDValue Expanded = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
	                                                V1, V2, DAG, Subtarget)) {
	      return Expanded;
	    }
	  }

	  // For non-AVX512, if the mask is of 16-bit elements in lane then try to
	  // split, since after the split we get more efficient code using vpunpcklwd
	  // and vpunpckhwd instructions than with vblend.
	  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) {
	    if (SDValue SplitV = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
	                                                    Mask, Subtarget, DAG)) {
	      return SplitV;
	    }
	  }

	  // With AVX2 we always want a blend-based lowering because at v8 the
	  // elements can be fully permuted.
	  if (Subtarget.hasAVX2()) {
	    return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask,
	                                                Subtarget, DAG);
	  }

	  // Otherwise fall back on generic lowering.
	  return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
	                                    DAG);
	}

	/// Lower a 256-bit shuffle of eight 32-bit integer elements.
	///
	/// Only called when AVX2 is available (asserted below), which provides a
	/// reasonable instruction set for v8i32 shuffling. Candidate lowerings are
	/// tried in a fixed priority order and the first match is returned.
	static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
	  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

	  // A zero/any-extend is strictly faster than any alternative and often lets
	  // memory operands be folded into the shuffle, so it goes first.
	  if (SDValue Extended = lowerShuffleAsZeroOrAnyExtend(
	          DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	    return Extended;
	  }

	  // For non-AVX512, if the mask is of 16-bit elements in lane then try to
	  // split, since after the split we get more efficient code than vblend by
	  // using vpunpcklwd and vpunpckhwd instructions.
	  if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
	      !Subtarget.hasAVX512()) {
	    if (SDValue SplitV = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2,
	                                                    Mask, Subtarget, DAG)) {
	      return SplitV;
	    }
	  }

	  if (SDValue Blended = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
	                                            Zeroable, Subtarget, DAG)) {
	    return Blended;
	  }

	  // Check whether a single element is being broadcast.
	  if (SDValue Splat = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
	                                              Subtarget, DAG)) {
	    return Splat;
	  }

	  // When the mask repeats in each 128-bit lane we can use cheaper
	  // instructions that mirror the shuffle across both lanes. The result of
	  // the query is reused further down for the SHUFPS check.
	  SmallVector<int, 4> LaneMask;
	  const bool RepeatsPerLane =
	      is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, LaneMask);
	  if (RepeatsPerLane) {
	    assert(LaneMask.size() == 4 && "Unexpected repeated mask size!");
	    if (V2.isUndef()) {
	      SDValue Imm = getV4X86ShuffleImm8ForMask(LaneMask, DL, DAG);
	      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, Imm);
	    }

	    // Dedicated unpack instructions for masks that match their pattern.
	    if (SDValue Unpacked =
	            lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) {
	      return Unpacked;
	    }
	  }

	  // Try shift instructions.
	  if (SDValue Shifted = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
	                                            Zeroable, Subtarget, DAG)) {
	    return Shifted;
	  }

	  // With VLX support, VALIGN or EXPAND can be used.
	  if (Subtarget.hasVLX()) {
	    if (SDValue Aligned = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
	                                               Subtarget, DAG)) {
	      return Aligned;
	    }

	    if (SDValue Expanded = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
	                                                V1, V2, DAG, Subtarget)) {
	      return Expanded;
	    }
	  }

	  // Try byte rotation instructions.
	  if (SDValue ByteRot = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
	                                                 Subtarget, DAG)) {
	    return ByteRot;
	  }

	  // Try building an in-lane repeating shuffle and then moving the results
	  // into their target lanes.
	  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) {
	    return Permuted;
	  }

	  // Remaining single-input shuffles are fully handled here.
	  if (V2.isUndef()) {
	    // A fixed cross-128-bit lane permute followed by an unpack should be
	    // faster than the variable permute alternatives.
	    if (SDValue Unpacked256 =
	            lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG)) {
	      return Unpacked256;
	    }

	    // The pattern isn't repeated but has a single input: emit a cross-lane
	    // VPERMD directly.
	    SDValue PermCtl = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
	    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, PermCtl, V1);
	  }

	  // Assume that a single SHUFPS is faster than an alternative sequence of
	  // multiple instructions (even if the CPU has a domain penalty). If some CPU
	  // is harmed by the domain switch, that can be fixed in a later pass.
	  if (RepeatsPerLane && isSingleSHUFPSMask(LaneMask)) {
	    SDValue FloatV1 = DAG.getBitcast(MVT::v8f32, V1);
	    SDValue FloatV2 = DAG.getBitcast(MVT::v8f32, V2);
	    SDValue FloatShuffle = lowerShuffleWithSHUFPS(DL, MVT::v8f32, LaneMask,
	                                                  FloatV1, FloatV2, DAG);
	    return DAG.getBitcast(MVT::v8i32, FloatShuffle);
	  }

	  // Try merging 128-bit lanes so that a lane-based shuffle becomes possible.
	  if (SDValue Merged = lowerShuffleAsLanePermuteAndRepeatedMask(
	          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) {
	    return Merged;
	  }

	  // Otherwise fall back on generic blend lowering.
	  return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask,
	                                              Subtarget, DAG);
	}

	/// Lower a 256-bit shuffle of sixteen 16-bit integer elements.
	///
	/// Only called when AVX2 is available (asserted below), which provides a
	/// reasonable instruction set for v16i16 shuffling. Candidate lowerings are
	/// tried in a fixed priority order and the first match is returned.
	static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                  const APInt &Zeroable, SDValue V1, SDValue V2,
	                                  const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
	  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
	  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");

	  // A zero/any-extend is strictly faster than any alternative and often lets
	  // memory operands be folded into the shuffle, so it goes first.
	  if (SDValue Extended = lowerShuffleAsZeroOrAnyExtend(
	          DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	    return Extended;
	  }

	  // Check whether a single element is being broadcast.
	  if (SDValue Splat = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
	                                              Subtarget, DAG)) {
	    return Splat;
	  }

	  if (SDValue Blended = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
	                                            Zeroable, Subtarget, DAG)) {
	    return Blended;
	  }

	  // Dedicated unpack instructions for masks that match their pattern.
	  if (SDValue Unpacked =
	          lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) {
	    return Unpacked;
	  }

	  // Dedicated pack instructions for masks that match their pattern.
	  if (SDValue Packed = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
	                                            Subtarget)) {
	    return Packed;
	  }

	  // Try shift instructions.
	  if (SDValue Shifted = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
	                                            Zeroable, Subtarget, DAG)) {
	    return Shifted;
	  }

	  // Try byte rotation instructions.
	  if (SDValue ByteRot = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
	                                                 Subtarget, DAG)) {
	    return ByteRot;
	  }

	  // Try building an in-lane repeating shuffle and then moving the results
	  // into their target lanes.
	  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) {
	    return Permuted;
	  }

	  // Single-input specific strategies. Unlike the other per-type routines,
	  // this block may fall through to the generic code below.
	  if (V2.isUndef()) {
	    // Try bit rotation instructions.
	    if (SDValue BitRot = lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask,
	                                                 Subtarget, DAG)) {
	      return BitRot;
	    }

	    // A fixed cross-128-bit lane permute followed by an unpack should be
	    // faster than the variable permute alternatives.
	    if (SDValue Unpacked256 =
	            lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG)) {
	      return Unpacked256;
	    }

	    // There are no generalized cross-lane shuffle operations available on
	    // i16 element types.
	    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
	      if (SDValue LanePerm = lowerShuffleAsLanePermuteAndPermute(
	              DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) {
	        return LanePerm;
	      }

	      return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
	                                                 DAG, Subtarget);
	    }

	    SmallVector<int, 8> LaneMask;
	    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, LaneMask)) {
	      // Being a single-input shuffle, the repeated mask should be a strictly
	      // valid v8i16 mask, so it can be handed to the v8i16 lowering which
	      // handles even the v16 case.
	      return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v16i16, V1, LaneMask,
	                                                 Subtarget, DAG);
	    }
	  }

	  if (SDValue ByteShuffle = lowerShuffleWithPSHUFB(
	          DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) {
	    return ByteShuffle;
	  }

	  // AVX512BWVL can lower to VPERMW.
	  if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
	    return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
	  }

	  // Try merging 128-bit lanes so that a lane-based shuffle becomes possible.
	  if (SDValue Merged = lowerShuffleAsLanePermuteAndRepeatedMask(
	          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) {
	    return Merged;
	  }

	  // Try to permute the lanes and then use a per-lane permute.
	  if (SDValue LanePerm = lowerShuffleAsLanePermuteAndPermute(
	          DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) {
	    return LanePerm;
	  }

	  // Otherwise fall back on generic lowering.
	  return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Subtarget,
	                                    DAG);
	}

	/// Lower a 256-bit shuffle of thirty-two 8-bit integer elements.
	///
	/// Only called when AVX2 is available (asserted below), which provides a
	/// reasonable instruction set for v32i8 shuffling. Candidate lowerings are
	/// tried in a fixed priority order and the first match is returned.
	static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
	  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
	  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");

	  // A zero/any-extend is strictly faster than any alternative and often lets
	  // memory operands be folded into the shuffle, so it goes first.
	  if (SDValue Extended = lowerShuffleAsZeroOrAnyExtend(
	          DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	    return Extended;
	  }

	  // Check whether a single element is being broadcast.
	  if (SDValue Splat = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
	                                              Subtarget, DAG)) {
	    return Splat;
	  }

	  if (SDValue Blended = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
	                                            Zeroable, Subtarget, DAG)) {
	    return Blended;
	  }

	  // Dedicated unpack instructions for masks that match their pattern.
	  if (SDValue Unpacked =
	          lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) {
	    return Unpacked;
	  }

	  // Dedicated pack instructions for masks that match their pattern.
	  if (SDValue Packed = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
	                                            Subtarget)) {
	    return Packed;
	  }

	  // Try shift instructions.
	  if (SDValue Shifted = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
	                                            Zeroable, Subtarget, DAG)) {
	    return Shifted;
	  }

	  // Try byte rotation instructions.
	  if (SDValue ByteRot = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
	                                                 Subtarget, DAG)) {
	    return ByteRot;
	  }

	  // Bit rotation instructions are only attempted for single-input shuffles.
	  if (V2.isUndef()) {
	    if (SDValue BitRot = lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask,
	                                                 Subtarget, DAG)) {
	      return BitRot;
	    }
	  }

	  // Try building an in-lane repeating shuffle and then moving the results
	  // into their target lanes.
	  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) {
	    return Permuted;
	  }

	  // There are no generalized cross-lane shuffle operations available on i8
	  // element types, so lane-crossing single-input shuffles end here.
	  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
	    // A fixed cross-128-bit lane permute followed by an unpack should be
	    // faster than the variable permute alternatives.
	    if (SDValue Unpacked256 =
	            lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG)) {
	      return Unpacked256;
	    }

	    if (SDValue LanePerm = lowerShuffleAsLanePermuteAndPermute(
	            DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) {
	      return LanePerm;
	    }

	    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
	                                               DAG, Subtarget);
	  }

	  if (SDValue ByteShuffle = lowerShuffleWithPSHUFB(
	          DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) {
	    return ByteShuffle;
	  }

	  // AVX512VBMIVL can lower to VPERMB.
	  if (Subtarget.hasVBMI() && Subtarget.hasVLX()) {
	    return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
	  }

	  // Try merging 128-bit lanes so that a lane-based shuffle becomes possible.
	  if (SDValue Merged = lowerShuffleAsLanePermuteAndRepeatedMask(
	          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) {
	    return Merged;
	  }

	  // Try to permute the lanes and then use a per-lane permute.
	  if (SDValue LanePerm = lowerShuffleAsLanePermuteAndPermute(
	          DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) {
	    return LanePerm;
	  }

	  // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements, followed
	  // by zeroable elements in the remaining 24 elements. Turn this into two
	  // vmovqb instructions shuffled together.
	  if (Subtarget.hasVLX()) {
	    if (SDValue Truncated = lowerShuffleAsVTRUNCAndUnpack(
	            DL, MVT::v32i8, V1, V2, Mask, Zeroable, DAG)) {
	      return Truncated;
	    }
	  }

	  // Otherwise fall back on generic lowering.
	  return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget,
	                                    DAG);
	}

	/// Top-level dispatcher for lowering 256-bit x86 vector shuffles.
	///
	/// A few type-independent lowerings are attempted first (single element
	/// insertion, undef-half handling). Integer shuffles on subtargets without
	/// AVX2 are then either bit-masked / bit-blended / split (elements narrower
	/// than 32 bits) or re-issued as a floating point shuffle of the same shape.
	/// Everything else is forwarded to the routine for the specific vector type.
	static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
	                                  SDValue V1, SDValue V2, const APInt &Zeroable,
	                                  const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  // If exactly one mask entry refers to V2 and it lands in element zero,
	  // try to insert that element into V1 cheaply.
	  const int EltCount = VT.getVectorNumElements();
	  const int V2RefCount =
	      count_if(Mask, [EltCount](int Idx) { return Idx >= EltCount; });

	  if (V2RefCount == 1 && Mask[0] >= EltCount) {
	    if (SDValue Inserted = lowerShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	      return Inserted;
	    }
	  }

	  // Special cases where the lower or upper half is UNDEF.
	  if (SDValue HalfUndef =
	          lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	    return HalfUndef;
	  }

	  // There is a hard cut-over between AVX1 and AVX2, so checking the subtarget
	  // here avoids much of the subtarget querying in the per-type routines. With
	  // AVX1 there is essentially no ability to manipulate a 256-bit vector with
	  // integer types; since floating point types would be used eventually
	  // anyway, cast to the floating point domain right away where possible.
	  if (VT.isInteger() && !Subtarget.hasAVX2()) {
	    const int EltBits = VT.getScalarSizeInBits();
	    if (EltBits >= 32) {
	      MVT FloatVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltBits),
	                                     VT.getVectorNumElements());
	      V1 = DAG.getBitcast(FloatVT, V1);
	      V2 = DAG.getBitcast(FloatVT, V2);
	      SDValue FloatShuffle = DAG.getVectorShuffle(FloatVT, DL, V1, V2, Mask);
	      return DAG.getBitcast(VT, FloatShuffle);
	    }

	    // No floating point type is available for elements this narrow: use bit
	    // operations for masking/blending if possible, otherwise decompose into
	    // 128-bit vectors.
	    if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
	                                               Subtarget, DAG)) {
	      return Masked;
	    }
	    if (SDValue BitBlended =
	            lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) {
	      return BitBlended;
	    }
	    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
	  }

	  // Dispatch on the concrete 256-bit vector type.
	  switch (VT.SimpleTy) {
	  case MVT::v4f64:
	    return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v4i64:
	    return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8f32:
	    return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8i32:
	    return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16i16:
	    return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v32i8:
	    return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  default:
	    llvm_unreachable("Not a valid 256-bit x86 vector type!");
	  }
	}

	/// Try to lower a 512-bit vector shuffle of 64-bit elements as a shuffle of
	/// four 128-bit subvectors.
	///
	/// Returns a null SDValue if the mask cannot be widened to 128-bit granularity
	/// or if either 256-bit half of the result would need 128-bit lanes from both
	/// V1 and V2. Otherwise the result is one of: an insert of the low part of V1
	/// into a zero vector, an insert of a 256-bit subvector, an insert of the low
	/// 128 bits of V2 into V1, or an X86ISD::SHUF128 node.
	static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(VT.getScalarSizeInBits() == 64 &&
	"Unexpected element type size for 128bit shuffle.");

	// Handling 256-bit vectors would require VLX, and lowerV2X128Shuffle() is
	// most probably the better solution for them.
	assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

	// TODO - use Zeroable like we do for lowerV2X128Shuffle?
	// Widen the 8-element mask to a 4-element mask of 128-bit lanes; bail out
	// if that is not possible.
	SmallVector<int, 4> Widened128Mask;
	if (!canWidenShuffleElements(Mask, Widened128Mask))
	return SDValue();
	assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");

	// Try to use an insert into a zero vector. This requires 128-bit lane 0 of
	// V1 to stay in place, bits 4-7 of Zeroable to be set, and either lane 1 of
	// V1 to stay in place as well or bits 2-3 of Zeroable to be set. In the
	// latter case only the low 2 elements of V1 are kept, otherwise the low 4.
	// NOTE(review): presumably bit i of Zeroable corresponds to result element
	// i - confirm against the producer of Zeroable.
	if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
	(Widened128Mask[1] == 1 \|\| (Zeroable & 0x0c) == 0x0c)) {
	unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
	MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
	SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
	getZeroVector(VT, Subtarget, DAG, DL), LoV,
	DAG.getIntPtrConstant(0, DL));
	}

	// Check for patterns which can be matched with a single insert of a 256-bit
	// subvector: the low half of V1 is kept and the upper half is replaced by
	// the low half of either V1 or V2.
	bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3});
	if (OnlyUsesV1 \|\|
	isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) {
	MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
	SDValue SubVec =
	DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
	DAG.getIntPtrConstant(4, DL));
	}

	// See if this is an insertion of the lower 128-bits of V2 into V1.
	bool IsInsert = true;
	int V2Index = -1;
	for (int i = 0; i < 4; ++i) {
	assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
	// Undef lanes are compatible with any insertion.
	if (Widened128Mask[i] < 0)
	continue;

	// Make sure all V1 subvectors are in place.
	if (Widened128Mask[i] < 4) {
	if (Widened128Mask[i] != i) {
	IsInsert = false;
	break;
	}
	} else {
	// Make sure we only have a single V2 index and it's the lowest 128-bits.
	if (V2Index >= 0 \|\| Widened128Mask[i] != 4) {
	IsInsert = false;
	break;
	}
	V2Index = i;
	}
	}
	if (IsInsert && V2Index >= 0) {
	MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
	SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
	DAG.getIntPtrConstant(0, DL));
	// V2Index counts 128-bit lanes; scale by 2 to get a 64-bit element index.
	return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
	}

	// See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
	// UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
	// possible we at least ensure the lanes stay sequential to help later
	// combines.
	SmallVector<int, 2> Widened256Mask;
	if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
	Widened128Mask.clear();
	narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
	}

	// Try to lower to vshuf64x2/vshuf32x4.
	// Ops[0] supplies result lanes 0-1 and Ops[1] supplies result lanes 2-3.
	SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
	unsigned PermMask = 0;
	// Ensure that both lanes of each result half come from the same Op.
	for (int i = 0; i < 4; ++i) {
	assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
	if (Widened128Mask[i] < 0)
	continue;

	SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
	unsigned OpIndex = i / 2;
	if (Ops[OpIndex].isUndef())
	Ops[OpIndex] = Op;
	else if (Ops[OpIndex] != Op)
	return SDValue();

	// Convert the 128-bit shuffle mask selection values into 128-bit selection
	// bits defined by a vshuf64x2 instruction's immediate control byte.
	PermMask \|= (Widened128Mask[i] % 4) << (i * 2);
	}

	return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
	DAG.getTargetConstant(PermMask, DL, MVT::i8));
	}

	/// Handle lowering of 8-lane 64-bit floating point shuffles.
	///
	/// Single-input shuffles get a few dedicated attempts first (MOVDDUP, an
	/// immediate VPERMILPI for non-lane-crossing masks, VPERMI for masks repeated
	/// per 256-bit lane). Everything else goes through the generic chain below,
	/// tried in order, ending with a PERMV-based lowering.
	static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
	assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	if (V2.isUndef()) {
	// Use low duplicate instructions for masks that match their pattern.
	if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
	return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

	if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
	// Non-half-crossing single input shuffles can be lowered with an
	// interleaved permutation.
	// Bit i of the immediate is set when result element i takes the odd
	// (upper) element of its own 128-bit pair, and clear otherwise.
	unsigned VPERMILPMask = (Mask[0] == 1) \| ((Mask[1] == 1) << 1) \|
	((Mask[2] == 3) << 2) \| ((Mask[3] == 3) << 3) \|
	((Mask[4] == 5) << 4) \| ((Mask[5] == 5) << 5) \|
	((Mask[6] == 7) << 6) \| ((Mask[7] == 7) << 7);
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
	DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
	}

	// A mask that repeats in both 256-bit lanes can use an immediate VPERMI.
	SmallVector<int, 4> RepeatedMask;
	if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
	return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
	getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
	// Other single-input masks fall through to the generic attempts below.
	}

	// Try to lower as a shuffle of 128-bit subvectors.
	if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
	V2, Subtarget, DAG))
	return Shuf128;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
	return Unpck;

	// Check if the blend happens to exactly fit that of SHUFPD.
	if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Op;

	// Try the EXPAND-based lowering.
	if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
	DAG, Subtarget))
	return V;

	// Try to lower the shuffle as a blend.
	if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Otherwise fall back on the PERMV-based lowering.
	return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
	}

	/// Lower a 512-bit shuffle of sixteen 32-bit floating point elements.
	///
	/// Candidate lowerings are tried in a fixed priority order; the final
	/// fallback is a PERMV-based lowering.
	static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                  const APInt &Zeroable, SDValue V1, SDValue V2,
	                                  const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
	  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	  // The vector type every node built here operates on.
	  const MVT VT = MVT::v16f32;

	  // A mask that repeats in each 128-bit lane gives us many more efficient
	  // options; this branch always produces a result.
	  SmallVector<int, 4> LaneMask;
	  if (is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask)) {
	    assert(LaneMask.size() == 4 && "Unexpected repeated mask size!");

	    // Even / odd duplicate instructions for the masks that match them.
	    if (isShuffleEquivalent(V1, V2, LaneMask, {0, 0, 2, 2})) {
	      return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1);
	    }
	    if (isShuffleEquivalent(V1, V2, LaneMask, {1, 1, 3, 3})) {
	      return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1);
	    }

	    // Single input: an immediate-controlled VPERMILPI is enough.
	    if (V2.isUndef()) {
	      SDValue Imm = getV4X86ShuffleImm8ForMask(LaneMask, DL, DAG);
	      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1, Imm);
	    }

	    // Dedicated unpack instructions for masks that match their pattern.
	    if (SDValue Unpacked = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG)) {
	      return Unpacked;
	    }

	    if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	                                              Subtarget, DAG)) {
	      return Blended;
	    }

	    // Otherwise, fall back to a SHUFPS sequence.
	    return lowerShuffleWithSHUFPS(DL, VT, LaneMask, V1, V2, DAG);
	  }

	  // Try building an in-lane repeating shuffle and then moving the results
	  // into their target lanes.
	  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	    return Permuted;
	  }

	  // A single-input shuffle with different patterns per 128-bit lane that
	  // does not cross lanes can use a variable-mask VPERMILPS.
	  if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(VT, Mask)) {
	    SDValue InLaneCtl = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
	    return DAG.getNode(X86ISD::VPERMILPV, DL, VT, V1, InLaneCtl);
	  }

	  // If we have AVX512F support, we can use VEXPAND.
	  if (SDValue Expanded = lowerShuffleToEXPAND(DL, VT, Zeroable, Mask, V1, V2,
	                                              DAG, Subtarget)) {
	    return Expanded;
	  }

	  return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// Lower a 512-bit shuffle of eight 64-bit integer elements.
	///
	/// Single-input shuffles get two dedicated attempts first; afterwards the
	/// generic candidates are tried in a fixed priority order, ending with a
	/// PERMV-based lowering.
	static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	  // The vector type every result built here has.
	  const MVT VT = MVT::v8i64;

	  if (V2.isUndef()) {
	    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
	    // can use lower latency instructions that operate on all four 128-bit
	    // lanes: express the i64 lane mask as an i32 mask and emit PSHUFD on the
	    // v16i32 bitcast of the input.
	    SmallVector<int, 2> LaneMask128;
	    if (is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask128)) {
	      SmallVector<int, 4> DwordMask;
	      narrowShuffleMaskElts(2, LaneMask128, DwordMask);
	      SDValue DwordInput = DAG.getBitcast(MVT::v16i32, V1);
	      SDValue Imm = getV4X86ShuffleImm8ForMask(DwordMask, DL, DAG);
	      SDValue DwordShuffle =
	          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, DwordInput, Imm);
	      return DAG.getBitcast(VT, DwordShuffle);
	    }

	    // A mask that repeats in both 256-bit lanes can use an immediate VPERMI.
	    SmallVector<int, 4> LaneMask256;
	    if (is256BitLaneRepeatedShuffleMask(VT, Mask, LaneMask256)) {
	      SDValue Imm = getV4X86ShuffleImm8ForMask(LaneMask256, DL, DAG);
	      return DAG.getNode(X86ISD::VPERMI, DL, VT, V1, Imm);
	    }
	  }

	  // Try to lower as a shuffle of 128-bit subvectors.
	  if (SDValue Lanes128 = lowerV4X128Shuffle(DL, VT, Mask, Zeroable, V1, V2,
	                                            Subtarget, DAG)) {
	    return Lanes128;
	  }

	  // Try shift instructions.
	  if (SDValue Shifted = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG)) {
	    return Shifted;
	  }

	  // Try VALIGN.
	  if (SDValue Aligned =
	          lowerShuffleAsVALIGN(DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	    return Aligned;
	  }

	  // Try PALIGNR.
	  if (SDValue ByteRot =
	          lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	    return ByteRot;
	  }

	  // Dedicated unpack instructions for masks that match their pattern.
	  if (SDValue Unpacked = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG)) {
	    return Unpacked;
	  }

	  // If we have AVX512F support, we can use VEXPAND.
	  if (SDValue Expanded = lowerShuffleToEXPAND(DL, VT, Zeroable, Mask, V1, V2,
	                                              DAG, Subtarget)) {
	    return Expanded;
	  }

	  if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG)) {
	    return Blended;
	  }

	  return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// Handle lowering of 16-lane 32-bit integer shuffles.
	///
	/// Both operands must be v16i32 and \p Mask must have 16 entries (asserted).
	/// The lowering strategies below are tried in order and the first one that
	/// produces a value is returned; lowerShuffleWithPERMV is the final fallback.
	static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
	assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
	DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// If the shuffle mask is repeated in each 128-bit lane we can use more
	// efficient instructions that mirror the shuffles across the four 128-bit
	// lanes.
	SmallVector<int, 4> RepeatedMask;
	bool Is128BitLaneRepeatedShuffle =
	is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
	if (Is128BitLaneRepeatedShuffle) {
	assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
	// Single-input case: one PSHUFD with an immediate built from the
	// repeated 4-element mask.
	if (V2.isUndef())
	return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
	getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
	return V;
	}

	// Try to use shift instructions.
	if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use VALIGN.
	if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
	Subtarget, DAG))
	return Rotate;

	// Try to use byte rotation instructions. This is only attempted when the
	// subtarget reports BWI.
	if (Subtarget.hasBWI())
	if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
	Subtarget, DAG))
	return Rotate;

	// Assume that a single SHUFPS is faster than using a permv shuffle.
	// If some CPU is harmed by the domain switch, we can fix it in a later pass.
	// The operands are bitcast to v16f32 for the SHUFPS and the result is cast
	// back to v16i32.
	if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
	SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
	SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
	SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
	CastV1, CastV2, DAG);
	return DAG.getBitcast(MVT::v16i32, ShufPS);
	}

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
	return V;

	// If we have AVX512F support, we can use VEXPAND.
	if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
	DAG, Subtarget))
	return V;

	// Try a blend of the two inputs.
	if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Nothing cheaper matched: fall back to a variable permute.
	return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
	}

	/// Handle lowering of 32-lane 16-bit integer shuffles.
	///
	/// Both operands must be v32i16, \p Mask must have 32 entries, and the
	/// subtarget must have BWI (all asserted). Strategies are tried in order and
	/// the first one that produces a value is returned; lowerShuffleWithPERMV is
	/// the final fallback.
	static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
	assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
	assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
	DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
	return V;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V =
	lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
	return V;

	// Try to use shift instructions.
	if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
	Subtarget, DAG))
	return Rotate;

	// The remaining single-input strategies only apply when V2 is undef.
	if (V2.isUndef()) {
	// Try to use bit rotation instructions.
	if (SDValue Rotate =
	lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
	return Rotate;

	SmallVector<int, 8> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
	// As this is a single-input shuffle, the repeated mask should be
	// a strictly valid v8i16 mask that we can pass through to the v8i16
	// lowering to handle even the v32 case.
	return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
	RepeatedMask, Subtarget, DAG);
	}
	}

	// Try a blend of the two inputs.
	if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Try a PSHUFB-based lowering.
	if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
	Zeroable, Subtarget, DAG))
	return PSHUFB;

	// Nothing cheaper matched: fall back to a variable permute.
	return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
	}

	/// Handle lowering of 64-lane 8-bit integer shuffles.
	///
	/// Both operands must be v64i8, \p Mask must have 64 entries, and the
	/// subtarget must have BWI (all asserted). Strategies are tried in order and
	/// the first one that produces a value is returned. With VBMI the routine
	/// ends in a variable permute; otherwise it may end by splitting the shuffle.
	static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
	assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
	assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
	DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
	return V;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
	Subtarget))
	return V;

	// Try to use shift instructions.
	if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
	Subtarget, DAG))
	return Rotate;

	// Try to use bit rotation instructions (single-input shuffles only).
	if (V2.isUndef())
	if (SDValue Rotate =
	lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
	return Rotate;

	// Lower as AND if possible.
	if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Masked;

	// Try a PSHUFB-based lowering.
	if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
	Zeroable, Subtarget, DAG))
	return PSHUFB;

	// VBMI can use VPERMV/VPERMV3 byte shuffles. Note this returns
	// unconditionally, so none of the strategies below run on VBMI subtargets.
	if (Subtarget.hasVBMI())
	return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Try a blend of the two inputs.
	if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle. Only attempted for two-input shuffles.
	if (!V2.isUndef())
	if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
	DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// FIXME: Implement direct support for this type!
	return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
	}

	/// High-level routine to lower various 512-bit x86 vector shuffles.
	///
	/// This routine either breaks down the specific type of a 512-bit x86 vector
	/// shuffle or splits it into two 256-bit shuffles and fuses the results back
	/// together based on the available instructions.
	///
	/// A few type-independent strategies (element insertion, undef-half handling,
	/// broadcast) are tried first. v32i16/v64i8 without BWI are handled with bit
	/// ops or by splitting. Everything else is dispatched to the per-type routine.
	/// The subtarget must have AVX-512 (asserted).
	static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                  MVT VT, SDValue V1, SDValue V2,
	                                  const APInt &Zeroable,
	                                  const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX512() &&
	         "Cannot lower 512-bit vectors w/ basic ISA!");

	  // If we have a single input to the zero element, insert that into V1 if we
	  // can do so cheaply.
	  int NumElts = Mask.size();
	  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

	  if (NumV2Elements == 1 && Mask[0] >= NumElts)
	    if (SDValue Insertion = lowerShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Insertion;

	  // Handle special cases where the lower or upper half is UNDEF.
	  if (SDValue V =
	          lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return V;

	  // Check for being able to broadcast a single element.
	  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
	                                                  Subtarget, DAG))
	    return Broadcast;

	  // The v32i16/v64i8 routines below assert BWI, so without it these types
	  // must be handled here and never reach the dispatch switch.
	  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
	    // Try using bit ops for masking and blending before falling back to
	    // splitting.
	    if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
	                                          Subtarget, DAG))
	      return V;
	    if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
	      return V;

	    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
	  }

	  // Dispatch to each element type for lowering. If we don't have support for
	  // specific element type shuffles at 512 bits, immediately split them and
	  // lower them. Each lowering routine of a given type is allowed to assume that
	  // the requisite ISA extensions for that element type are available.
	  switch (VT.SimpleTy) {
	  case MVT::v8f64:
	    return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16f32:
	    return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8i64:
	    return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16i32:
	    return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v32i16:
	    return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v64i8:
	    return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  default:
	    llvm_unreachable("Not a valid 512-bit x86 vector type!");
	  }
	}

	/// Try to lower a unary vXi1 shuffle as a single KSHIFTR.
	///
	/// Matches masks where every defined element i reads source element
	/// i + ShiftAmt for one common ShiftAmt > 0; undef elements are ignored.
	/// Returns an empty SDValue if \p V2 is not undef or the mask does not match.
	/// No zeroing of the shifted-in elements is required here; the caller tries
	/// the zero-filling variants separately.
	static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
	                                         MVT VT, SDValue V1, SDValue V2,
	                                         const X86Subtarget &Subtarget,
	                                         SelectionDAG &DAG) {
	  // Shuffle should be unary.
	  if (!V2.isUndef())
	    return SDValue();

	  int ShiftAmt = -1;
	  int NumElts = Mask.size();
	  for (int i = 0; i != NumElts; ++i) {
	    int M = Mask[i];
	    assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
	           "Unexpected mask index.");
	    if (M < 0)
	      continue;

	    // The first non-undef element determines our shift amount.
	    if (ShiftAmt < 0) {
	      ShiftAmt = M - i;
	      // Need to be shifting right.
	      if (ShiftAmt <= 0)
	        return SDValue();
	    }
	    // All non-undef elements must shift by the same amount.
	    if (ShiftAmt != M - i)
	      return SDValue();
	  }
	  assert(ShiftAmt >= 0 && "All undef?");

	  // Great we found a shift right.
	  // Widen to v8i1 (with DQI) or v16i1 when the mask type has fewer than 8
	  // elements, or exactly 8 without DQI, before emitting the shift.
	  MVT WideVT = VT;
	  if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
	    WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
	  SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
	                            DAG.getUNDEF(WideVT), V1,
	                            DAG.getIntPtrConstant(0, DL));
	  Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
	                    DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
	  // Narrow back to the original mask type.
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
	                     DAG.getIntPtrConstant(0, DL));
	}

	// Check whether a vXi1 shuffle is a whole-vector KSHIFTL/KSHIFTR of one input.
	// On success, stores the X86ISD opcode in Opcode and returns the shift amount;
	// otherwise returns -1 and leaves Opcode untouched. MaskOffset is added to the
	// expected source indices. This is a simplified version of
	// matchShuffleAsShift.
	static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
	                                    int MaskOffset, const APInt &Zeroable) {
	  int NumElts = Mask.size();

	  // True when the Amt elements vacated by the shift (the low ones for a left
	  // shift, the high ones for a right shift) are all zeroable.
	  auto VacatedAreZeroable = [&](int Amt, bool IsLeft) {
	    const int First = IsLeft ? 0 : (NumElts - Amt);
	    for (int Idx = First, End = First + Amt; Idx != End; ++Idx)
	      if (!Zeroable[Idx])
	        return false;
	    return true;
	  };

	  // True when the elements that survive the shift are sequential (or undef)
	  // starting from the expected source index.
	  auto SurvivorsAreSequential = [&](int Amt, bool IsLeft) {
	    unsigned DstStart = IsLeft ? Amt : 0;
	    unsigned SrcStart = IsLeft ? 0 : Amt;
	    unsigned Count = NumElts - Amt;
	    return isSequentialOrUndefInRange(Mask, DstStart, Count,
	                                      SrcStart + MaskOffset);
	  };

	  // Smallest shift amount wins; for a given amount, left is tried first.
	  for (int Amt = 1; Amt != NumElts; ++Amt) {
	    for (bool IsLeft : {true, false}) {
	      if (!VacatedAreZeroable(Amt, IsLeft) ||
	          !SurvivorsAreSequential(Amt, IsLeft))
	        continue;
	      Opcode = IsLeft ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
	      return Amt;
	    }
	  }

	  return -1;
	}


	// Lower vXi1 vector shuffles.
	// There is no dedicated instruction on AVX-512 that shuffles the masks.
	// The only general way to shuffle bits is to sign-extend the mask vector to a
	// SIMD vector, shuffle and then truncate it back.
	//
	// Before resorting to that, this routine tries (in order):
	//  1. a subvector padded with zeros (EXTRACT_SUBVECTOR + INSERT_SUBVECTOR),
	//  2. a unary right shift with undef elements (KSHIFTR),
	//  3. a KSHIFTL/KSHIFTR of either input with zeros shifted in.
	// Returns an empty SDValue for v64i1 when BWI registers are not in use.
	static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                MVT VT, SDValue V1, SDValue V2,
	                                const APInt &Zeroable,
	                                const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX512() &&
	         "Cannot lower 512-bit vectors w/o basic ISA!");

	  int NumElts = Mask.size();

	  // Try to recognize shuffles that are just padding a subvector with zeros.
	  int SubvecElts = 0;
	  int Src = -1;
	  for (int i = 0; i != NumElts; ++i) {
	    if (Mask[i] >= 0) {
	      // Grab the source from the first valid mask. All subsequent elements need
	      // to use this same source.
	      if (Src < 0)
	        Src = Mask[i] / NumElts;
	      if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
	        break;
	    }

	    ++SubvecElts;
	  }
	  assert(SubvecElts != NumElts && "Identity shuffle?");

	  // Clip to a power 2.
	  SubvecElts = PowerOf2Floor(SubvecElts);

	  // Make sure the number of zeroable bits in the top at least covers the bits
	  // not covered by the subvector.
	  if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
	    assert(Src >= 0 && "Expected a source!");
	    MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
	    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
	                                  Src == 0 ? V1 : V2,
	                                  DAG.getIntPtrConstant(0, DL));
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
	                       DAG.getConstant(0, DL, VT),
	                       Extract, DAG.getIntPtrConstant(0, DL));
	  }

	  // Try a simple shift right with undef elements. Later we'll try with zeros.
	  if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
	                                                DAG))
	    return Shift;

	  // Try to match KSHIFTs.
	  // Offset is 0 while matching against V1 and NumElts while matching against
	  // V2, so the expected mask indices refer to the right input.
	  unsigned Offset = 0;
	  for (SDValue V : { V1, V2 }) {
	    unsigned Opcode;
	    int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
	    if (ShiftAmt >= 0) {
	      MVT WideVT = VT;
	      if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
	        WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
	      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
	                                DAG.getUNDEF(WideVT), V,
	                                DAG.getIntPtrConstant(0, DL));
	      // Widened right shifts need two shifts to ensure we shift in zeroes.
	      if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
	        int WideElts = WideVT.getVectorNumElements();
	        // Shift left to put the original vector in the MSBs of the new size.
	        Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
	                          DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
	        // Increase the shift amount to account for the left shift.
	        ShiftAmt += WideElts - NumElts;
	      }

	      Res = DAG.getNode(Opcode, DL, WideVT, Res,
	                        DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
	                         DAG.getIntPtrConstant(0, DL));
	    }
	    Offset += NumElts; // Increment for next iteration.
	  }

	  // General case: pick the integer vector type to extend the mask to.
	  MVT ExtVT;
	  switch (VT.SimpleTy) {
	  default:
	    llvm_unreachable("Expected a vector of i1 elements");
	  case MVT::v2i1:
	    ExtVT = MVT::v2i64;
	    break;
	  case MVT::v4i1:
	    ExtVT = MVT::v4i32;
	    break;
	  case MVT::v8i1:
	    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
	    // shuffle.
	    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
	    break;
	  case MVT::v16i1:
	    // Take 512-bit type, unless we are avoiding 512-bit types and have the
	    // 256-bit operation available.
	    ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
	    break;
	  case MVT::v32i1:
	    // Take 512-bit type, unless we are avoiding 512-bit types and have the
	    // 256-bit operation available.
	    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
	    ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
	    break;
	  case MVT::v64i1:
	    // Fall back to scalarization. FIXME: We can do better if the shuffle
	    // can be partitioned cleanly.
	    if (!Subtarget.useBWIRegs())
	      return SDValue();
	    ExtVT = MVT::v64i8;
	    break;
	  }

	  V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
	  V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);

	  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
	  // i1 was sign extended we can use X86ISD::CVT2MASK.
	  // This is expressed as the compare (0 > Shuffle), i.e. a sign-bit test.
	  int NumElems = VT.getVectorNumElements();
	  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
	      (Subtarget.hasDQI() && (NumElems < 32)))
	    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
	                        Shuffle, ISD::SETGT);

	  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
	}

	/// Decide whether swapping the two shuffle inputs (and commuting the mask)
	/// gives the canonical form.
	///
	/// The canonical form has at least as many elements taken from V1 as from V2.
	/// On a tie the following tie-breakers apply in order: fewer V2 elements in
	/// the low half, then a V1 position sum that is not greater than V2's, then
	/// no more V1 elements at odd positions than V2 elements.
	static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
	  // Per-input statistics about the result positions each input feeds.
	  struct InputStats {
	    int Total = 0;     // Number of result elements taken from this input.
	    int InLowHalf = 0; // ... of which sit in the low half of the result.
	    int PosSum = 0;    // Sum of the result positions.
	    int OddPos = 0;    // Number of odd result positions.
	  };

	  const int NumElements = Mask.size();
	  const int HalfElements = NumElements / 2;

	  InputStats FromV1, FromV2;
	  for (int Pos = 0; Pos != NumElements; ++Pos) {
	    const int M = Mask[Pos];
	    if (M < 0)
	      continue;
	    InputStats &Stats = (M < NumElements) ? FromV1 : FromV2;
	    ++Stats.Total;
	    if (Pos < HalfElements)
	      ++Stats.InLowHalf;
	    Stats.PosSum += Pos;
	    Stats.OddPos += Pos % 2;
	  }

	  // Commute the shuffle as needed such that more elements come from V1 than
	  // V2. This allows us to match the shuffle pattern strictly on how many
	  // elements come from V1 without handling the symmetric cases.
	  if (FromV2.Total > FromV1.Total)
	    return true;

	  assert(FromV1.Total > 0 && "No V1 indices");

	  // Nothing comes from V2, or V1 strictly dominates: already canonical.
	  if (FromV2.Total == 0 || FromV1.Total != FromV2.Total)
	    return false;

	  // Tie-breaker 1: minimize the uses of V2 in the low half of the vector.
	  if (FromV2.InLowHalf != FromV1.InLowHalf)
	    return FromV2.InLowHalf > FromV1.InLowHalf;

	  // Tie-breaker 2: the sum of positions for V1 should not exceed V2's.
	  if (FromV2.PosSum != FromV1.PosSum)
	    return FromV2.PosSum < FromV1.PosSum;

	  // Tie-breaker 3: V1 should not occupy more odd positions than V2.
	  return FromV2.OddPos < FromV1.OddPos;
	}

	/// Top-level lowering for x86 vector shuffles.
	///
	/// This handles decomposition, canonicalization, and lowering of all x86
	/// vector shuffles. Most of the specific lowering strategies are encapsulated
	/// above in helper routines. The canonicalization attempts to widen shuffles
	/// to involve fewer lanes of wider elements, consolidate symmetric patterns
	/// s.t. only one of the two inputs needs to be tested, etc.
	///
	/// Several early paths return a new, simpler shuffle node (commuted, with
	/// cleaned-up mask, or widened) rather than a final lowering.
	static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
	  ArrayRef<int> OrigMask = SVOp->getMask();
	  SDValue V1 = Op.getOperand(0);
	  SDValue V2 = Op.getOperand(1);
	  MVT VT = Op.getSimpleValueType();
	  int NumElements = VT.getVectorNumElements();
	  SDLoc DL(Op);
	  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);

	  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
	         "Can't lower MMX shuffles");

	  bool V1IsUndef = V1.isUndef();
	  bool V2IsUndef = V2.isUndef();
	  if (V1IsUndef && V2IsUndef)
	    return DAG.getUNDEF(VT);

	  // When we create a shuffle node we put the UNDEF node to second operand,
	  // but in some cases the first operand may be transformed to UNDEF.
	  // In this case we should just commute the node.
	  if (V1IsUndef)
	    return DAG.getCommutedVectorShuffle(*SVOp);

	  // Check for non-undef masks pointing at an undef vector and make the masks
	  // undef as well. This makes it easier to match the shuffle based solely on
	  // the mask.
	  if (V2IsUndef &&
	      any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
	    SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
	    for (int &M : NewMask)
	      if (M >= NumElements)
	        M = -1;
	    return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
	  }

	  // Check for illegal shuffle mask element index values.
	  int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
	  (void)MaskUpperLimit; // Only used by the assert below.
	  assert(llvm::all_of(OrigMask,
	                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
	         "Out of bounds shuffle index");

	  // We actually see shuffles that are entirely re-arrangements of a set of
	  // zero inputs. This mostly happens while decomposing complex shuffles into
	  // simple ones. Directly lower these as a buildvector of zeros.
	  APInt KnownUndef, KnownZero;
	  computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);

	  APInt Zeroable = KnownUndef | KnownZero;
	  if (Zeroable.isAllOnesValue())
	    return getZeroVector(VT, Subtarget, DAG, DL);

	  bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());

	  // Try to collapse shuffles into using a vector type with fewer elements but
	  // wider element types. We cap this to not form integers or floating point
	  // elements wider than 64 bits, but it might be interesting to form i128
	  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
	  SmallVector<int, 16> WidenedMask;
	  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
	      canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
	    // Shuffle mask widening should not interfere with a broadcast opportunity
	    // by obfuscating the operands with bitcasts.
	    // TODO: Avoid lowering directly from this top-level function: make this
	    // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
	    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
	                                                    Subtarget, DAG))
	      return Broadcast;

	    MVT NewEltVT = VT.isFloatingPoint()
	                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
	                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
	    int NewNumElts = NumElements / 2;
	    MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
	    // Make sure that the new vector type is legal. For example, v2f64 isn't
	    // legal on SSE1.
	    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
	      if (V2IsZero) {
	        // Modify the new Mask to take all zeros from the all-zero vector.
	        // Choose indices that are blend-friendly.
	        bool UsedZeroVector = false;
	        assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
	               "V2's non-undef elements are used?!");
	        for (int i = 0; i != NewNumElts; ++i)
	          if (WidenedMask[i] == SM_SentinelZero) {
	            WidenedMask[i] = i + NewNumElts;
	            UsedZeroVector = true;
	          }
	        // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
	        // some elements to be undef.
	        if (UsedZeroVector)
	          V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
	      }
	      V1 = DAG.getBitcast(NewVT, V1);
	      V2 = DAG.getBitcast(NewVT, V2);
	      return DAG.getBitcast(
	          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
	    }
	  }

	  // Commute the shuffle if it will improve canonicalization.
	  SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
	  if (canonicalizeShuffleMaskWithCommute(Mask)) {
	    ShuffleVectorSDNode::commuteMask(Mask);
	    std::swap(V1, V2);
	  }

	  if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
	    return V;

	  // For each vector width, delegate to a specialized lowering routine.
	  if (VT.is128BitVector())
	    return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

	  if (VT.is256BitVector())
	    return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

	  if (VT.is512BitVector())
	    return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

	  if (Is1BitVector)
	    return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

	  llvm_unreachable("Unimplemented!");
	}

	/// Attempt to turn a VSELECT into an equivalent blend-style vector shuffle.
	///
	/// Returns the shuffle of the two selected operands when a shuffle mask can be
	/// derived from the condition operand, and an empty SDValue otherwise.
	static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  // Only non-legal VSELECTs reach this lowering, convert those into generic
	  // shuffles and re-use the shuffle lowering path for blends.
	  SmallVector<int, 32> BlendMask;
	  if (!createShuffleMaskFromVSELECT(BlendMask, Op.getOperand(0)))
	    return SDValue();

	  SDValue TrueVal = Op.getOperand(1);
	  SDValue FalseVal = Op.getOperand(2);
	  return DAG.getVectorShuffle(VT, SDLoc(Op), TrueVal, FalseVal, BlendMask);
	}

	/// Custom lowering for ISD::VSELECT.
	///
	/// Returns \p Op itself when the node can be matched as-is, a replacement node
	/// when it is rewritten (shuffle, mask-based select, or vXi8 select), and an
	/// empty SDValue when the generic expansion should be used instead.
	SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Cond = Op.getOperand(0);
	  SDValue LHS = Op.getOperand(1);
	  SDValue RHS = Op.getOperand(2);

	  // A vselect where all conditions and data are constants can be optimized into
	  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
	  if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
	      ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
	      ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
	    return SDValue();

	  // Try to lower this to a blend-style vector shuffle. This can handle all
	  // constant condition cases.
	  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
	    return BlendOp;

	  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
	  // with patterns on the mask registers on AVX-512.
	  MVT CondVT = Cond.getSimpleValueType();
	  unsigned CondEltSize = Cond.getScalarValueSizeInBits();
	  if (CondEltSize == 1)
	    return Op;

	  // Variable blends are only legal from SSE4.1 onward.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  unsigned EltSize = VT.getScalarSizeInBits();
	  unsigned NumElts = VT.getVectorNumElements();

	  // Expand v32i16/v64i8 without BWI.
	  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
	    return SDValue();

	  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
	  // into an i1 condition so that we can use the mask-based 512-bit blend
	  // instructions.
	  if (VT.getSizeInBits() == 512) {
	    // Build a mask by testing the condition against zero.
	    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
	    SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
	                                DAG.getConstant(0, dl, CondVT),
	                                ISD::SETNE);
	    // Now return a new VSELECT using the mask.
	    return DAG.getSelect(dl, VT, Mask, LHS, RHS);
	  }

	  // SEXT/TRUNC cases where the mask doesn't match the destination size.
	  if (CondEltSize != EltSize) {
	    // If we don't have a sign splat, rely on the expansion.
	    if (CondEltSize != DAG.ComputeNumSignBits(Cond))
	      return SDValue();

	    MVT NewCondSVT = MVT::getIntegerVT(EltSize);
	    MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
	    Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
	    return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
	  }

	  // Only some types will be legal on some subtargets. If we can emit a legal
	  // VSELECT-matching blend, return Op, but if we need to expand, return
	  // a null value.
	  switch (VT.SimpleTy) {
	  default:
	    // Most of the vector types have blends past SSE4.1.
	    return Op;

	  case MVT::v32i8:
	    // The byte blends for AVX vectors were introduced only in AVX2.
	    if (Subtarget.hasAVX2())
	      return Op;

	    return SDValue();

	  case MVT::v8i16:
	  case MVT::v16i16: {
	    // Bitcast everything to the vXi8 type and use a vXi8 vselect.
	    MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
	    Cond = DAG.getBitcast(CastVT, Cond);
	    LHS = DAG.getBitcast(CastVT, LHS);
	    RHS = DAG.getBitcast(CastVT, RHS);
	    SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
	    return DAG.getBitcast(VT, Select);
	  }
	  }
	}

	/// Lower EXTRACT_VECTOR_ELT with a constant index from a 128-bit vector.
	///
	/// Handles 8-bit results (PEXTRB, or a v4i32 extract + truncate for index 0),
	/// f32 results feeding a single store or bitcast-to-i32 use, and i32/i64
	/// results (returned unchanged). Returns an empty SDValue for anything else,
	/// including source vectors that are not 128 bits wide.
	static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDValue Vec = Op.getOperand(0);
	  SDValue Idx = Op.getOperand(1);
	  assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
	  SDLoc dl(Op);

	  if (!Vec.getSimpleValueType().is128BitVector())
	    return SDValue();

	  if (VT.getSizeInBits() == 8) {
	    // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
	    // we're going to zero extend the register or fold the store.
	    if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
	        !MayFoldIntoStore(Op))
	      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
	                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

	    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, Idx);
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
	  }

	  if (VT == MVT::f32) {
	    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
	    // the result back to FR32 register. It's only worth matching if the
	    // result has a single use which is a store or a bitcast to i32. And in
	    // the case of a store, it's not worth it if the index is a constant 0,
	    // because a MOVSSmr can be used instead, which is smaller and faster.
	    if (!Op.hasOneUse())
	      return SDValue();
	    // Exactly one use exists here, so dereferencing use_begin() yields it.
	    SDNode *User = *Op.getNode()->use_begin();
	    if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
	        (User->getOpcode() != ISD::BITCAST ||
	         User->getValueType(0) != MVT::i32))
	      return SDValue();
	    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                  DAG.getBitcast(MVT::v4i32, Vec), Idx);
	    return DAG.getBitcast(MVT::f32, Extract);
	  }

	  if (VT == MVT::i32 || VT == MVT::i64)
	    return Op;

	  return SDValue();
	}

	/// Extract one bit from mask vector, like v16i1 or v8i1.
	/// AVX-512 feature.
	///
	/// A variable index is handled by sign-extending the mask to an integer
	/// vector, extracting there and truncating. Constant index 0 is returned
	/// unchanged. Any other constant index is moved to element 0 with KSHIFTR
	/// (after widening narrow mask types) and then extracted.
	static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  SDValue Vec = Op.getOperand(0);
	  SDLoc dl(Vec);
	  MVT VecVT = Vec.getSimpleValueType();
	  SDValue Idx = Op.getOperand(1);
	  auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
	  MVT EltVT = Op.getSimpleValueType();

	  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
	         "Unexpected vector type in ExtractBitFromMaskVector");

	  // variable index can't be handled in mask registers,
	  // extend vector to VR512/128
	  if (!IdxC) {
	    unsigned NumElts = VecVT.getVectorNumElements();
	    // Extending v8i1/v16i1 to 512-bit get better performance on KNL
	    // than extending to 128/256bit.
	    // Up to 8 elements fill a 128-bit vector; wider masks use i8 elements.
	    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
	    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
	    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
	    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
	    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
	  }

	  unsigned IdxVal = IdxC->getZExtValue();
	  if (IdxVal == 0) // the operation is legal
	    return Op;

	  // Extend to natively supported kshift.
	  unsigned NumElems = VecVT.getVectorNumElements();
	  MVT WideVecVT = VecVT;
	  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
	    WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
	                      DAG.getUNDEF(WideVecVT), Vec,
	                      DAG.getIntPtrConstant(0, dl));
	  }

	  // Use kshiftr instruction to move to the lower element.
	  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
	                    DAG.getTargetConstant(IdxVal, dl, MVT::i8));

	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Custom-lower an EXTRACT_VECTOR_ELT node.
	///
	/// vXi1 sources are delegated to ExtractBitFromMaskVector. A non-constant
	/// index yields an empty SDValue. With a constant index, 256-bit and 512-bit
	/// sources are narrowed to the 128-bit chunk holding the element and the
	/// extract is re-emitted on that chunk. 128-bit sources are handled per
	/// element size: 16-bit first, then the SSE4.1 helper when available, then
	/// 8-, 32- and 64-bit fallbacks. Returns an empty SDValue when none applies.
	SDValue
	X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  MVT VecVT = Vec.getSimpleValueType();
	  SDValue Idx = Op.getOperand(1);
	  auto* IdxC = dyn_cast<ConstantSDNode>(Idx);

	  if (VecVT.getVectorElementType() == MVT::i1)
	    return ExtractBitFromMaskVector(Op, DAG, Subtarget);

	  if (!IdxC) {
	    // It's more profitable to go through memory (1 cycle throughput)
	    // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
	    // The IACA tool was used to get the performance estimation
	    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
	    //
	    // example : extractelement <16 x i8> %a, i32 %i
	    //
	    // Block Throughput: 3.00 Cycles
	    // Throughput Bottleneck: Port5
	    //
	    // | Num Of |   Ports pressure in cycles  |    |
	    // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
	    // ---------------------------------------------
	    // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
	    // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
	    // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
	    // Total Num Of Uops: 4
	    //
	    //
	    // Block Throughput: 1.00 Cycles
	    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
	    //
	    // |    |  Ports pressure in cycles   |  |
	    // |Uops| 1 | 2 - D  |3 - D   | 4 | 5 |  |
	    // ---------------------------------------------------------
	    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
	    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
	    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
	    // Total Num Of Uops: 4

	    return SDValue();
	  }

	  unsigned IdxVal = IdxC->getZExtValue();

	  // If the source is a 256-bit or 512-bit vector, first extract the 128-bit
	  // chunk and then extract the element from that 128-bit vector.
	  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
	    // Get the 128-bit vector.
	    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
	    MVT EltVT = VecVT.getVectorElementType();

	    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
	    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

	    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
	    // this can be done with a mask.
	    IdxVal &= ElemsPerChunk - 1;
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
	                       DAG.getIntPtrConstant(IdxVal, dl));
	  }

	  assert(VecVT.is128BitVector() && "Unexpected vector length");

	  MVT VT = Op.getSimpleValueType();

	  if (VT.getSizeInBits() == 16) {
	    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
	    // we're going to zero extend the register or fold the store (SSE41 only).
	    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
	        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
	      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
	                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

	    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, Idx);
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
	  }

	  if (Subtarget.hasSSE41())
	    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
	      return Res;

	  // TODO: We only extract a single element from v16i8, we can probably afford
	  // to be more aggressive here before using the default approach of spilling to
	  // stack.
	  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
	    // Extract either the lowest i32 or any i16, and extract the sub-byte.
	    int DWordIdx = IdxVal / 4;
	    if (DWordIdx == 0) {
	      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                DAG.getBitcast(MVT::v4i32, Vec),
	                                DAG.getIntPtrConstant(DWordIdx, dl));
	      int ShiftVal = (IdxVal % 4) * 8;
	      if (ShiftVal != 0)
	        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
	                          DAG.getConstant(ShiftVal, dl, MVT::i8));
	      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	    }

	    int WordIdx = IdxVal / 2;
	    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
	                              DAG.getBitcast(MVT::v8i16, Vec),
	                              DAG.getIntPtrConstant(WordIdx, dl));
	    int ShiftVal = (IdxVal % 2) * 8;
	    if (ShiftVal != 0)
	      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
	                        DAG.getConstant(ShiftVal, dl, MVT::i8));
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	  }

	  if (VT.getSizeInBits() == 32) {
	    if (IdxVal == 0)
	      return Op;

	    // SHUFPS the element to the lowest double word, then movss.
	    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
	    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  if (VT.getSizeInBits() == 64) {
	    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
	    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
	    // to match extract_elt for f64.
	    if (IdxVal == 0)
	      return Op;

	    // UNPCKHPD the element to the lowest double word, then movsd.
	    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
	    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
	    int Mask[2] = { 1, -1 };
	    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  return SDValue();
	}

	/// Insert a single bit into a mask vector (e.g. v16i1 or v8i1); AVX-512 only.
	///
	/// \p Op carries the destination vector (operand 0), the i1 element to insert
	/// (operand 1) and the insertion index (operand 2). A constant index becomes
	/// a v1i1 INSERT_SUBVECTOR; a variable index is handled by widening both the
	/// vector and the element, inserting, and truncating back.
	static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  SDLoc Loc(Op);
	  SDValue MaskVec = Op.getOperand(0);
	  SDValue Bit = Op.getOperand(1);
	  SDValue Pos = Op.getOperand(2);
	  MVT MaskVT = MaskVec.getSimpleValueType();

	  if (isa<ConstantSDNode>(Pos)) {
	    // Copy into a k-register, extract to v1i1 and insert_subvector.
	    SDValue BitAsVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, Loc, MVT::v1i1, Bit);
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, Loc, MaskVT, MaskVec, BitAsVec,
	                       Pos);
	  }

	  // Variable index: sign-extend source vector and element to a wider integer
	  // type, do an ordinary element insert there, then truncate back to the
	  // mask type.
	  unsigned Count = MaskVT.getVectorNumElements();
	  MVT WideEltVT = MVT::i8;
	  if (Count <= 8)
	    WideEltVT = MVT::getIntegerVT(128 / Count);
	  MVT WideVecVT = MVT::getVectorVT(WideEltVT, Count);

	  SDValue Widened = DAG.getNode(
	      ISD::INSERT_VECTOR_ELT, Loc, WideVecVT,
	      DAG.getNode(ISD::SIGN_EXTEND, Loc, WideVecVT, MaskVec),
	      DAG.getNode(ISD::SIGN_EXTEND, Loc, WideEltVT, Bit), Pos);
	  return DAG.getNode(ISD::TRUNCATE, Loc, MaskVT, Widened);
	}

	/// Custom-lower an INSERT_VECTOR_ELT node.
	///
	/// Operand 0 is the destination vector, operand 1 the scalar to insert and
	/// operand 2 the index. vXi1 vectors are delegated to InsertBitToMaskVector.
	/// A non-constant or out-of-range index yields an empty SDValue. Otherwise
	/// the function tries, in order: a blend with a zero/all-ones vector, chunked
	/// insertion for 256/512-bit vectors, a zero-vector move, PINSRW/PINSRB, and
	/// the SSE4.1 forms. Returns an empty SDValue when none applies.
	SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned NumElts = VT.getVectorNumElements();

	  if (EltVT == MVT::i1)
	    return InsertBitToMaskVector(Op, DAG, Subtarget);

	  SDLoc dl(Op);
	  SDValue N0 = Op.getOperand(0);
	  SDValue N1 = Op.getOperand(1);
	  SDValue N2 = Op.getOperand(2);

	  auto *N2C = dyn_cast<ConstantSDNode>(N2);
	  if (!N2C || N2C->getAPIntValue().uge(NumElts))
	    return SDValue();
	  uint64_t IdxVal = N2C->getZExtValue();

	  bool IsZeroElt = X86::isZeroNode(N1);
	  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

	  // If we are inserting a zero or all-ones element, see if we can do this more
	  // efficiently with a blend shuffle with a rematerializable vector than a
	  // costly integer insertion.
	  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
	      16 <= EltVT.getSizeInBits()) {
	    SmallVector<int, 8> BlendMask;
	    for (unsigned i = 0; i != NumElts; ++i)
	      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
	    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
	                                  : getOnesVector(VT, DAG, dl);
	    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
	  }

	  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
	  // into that, and then insert the subvector back into the result.
	  if (VT.is256BitVector() || VT.is512BitVector()) {
	    // With a 256-bit vector, we can insert into the zero element efficiently
	    // using a blend if we have AVX or AVX2 and the right data type.
	    if (VT.is256BitVector() && IdxVal == 0) {
	      // TODO: It is worthwhile to cast integer to floating point and back
	      // and incur a domain crossing penalty if that's what we'll end up
	      // doing anyway after extracting to a 128-bit vector.
	      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
	          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
	        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
	        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
	                           DAG.getTargetConstant(1, dl, MVT::i8));
	      }
	    }

	    // Get the desired 128-bit vector chunk.
	    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

	    // Insert the element into the desired chunk.
	    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
	    assert(isPowerOf2_32(NumEltsIn128));
	    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
	    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

	    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
	                    DAG.getIntPtrConstant(IdxIn128, dl));

	    // Insert the changed part back into the bigger vector
	    return insert128BitVector(N0, V, IdxVal, DAG, dl);
	  }
	  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

	  // This will be just movd/movq/movss/movsd.
	  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
	    if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
	        EltVT == MVT::i64) {
	      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
	      return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
	    }

	    // We can't directly insert an i8 or i16 into a vector, so zero extend
	    // it to i32 first.
	    if (EltVT == MVT::i16 || EltVT == MVT::i8) {
	      N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
	      MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
	      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
	      N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
	      return DAG.getBitcast(VT, N1);
	    }
	  }

	  // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
	  // argument. SSE41 required for pinsrb.
	  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
	    unsigned Opc;
	    if (VT == MVT::v8i16) {
	      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
	      Opc = X86ISD::PINSRW;
	    } else {
	      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
	      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
	      Opc = X86ISD::PINSRB;
	    }

	    if (N1.getValueType() != MVT::i32)
	      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
	    if (N2.getValueType() != MVT::i32)
	      N2 = DAG.getIntPtrConstant(IdxVal, dl);
	    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
	  }

	  if (Subtarget.hasSSE41()) {
	    if (EltVT == MVT::f32) {
	      // Bits [7:6] of the constant are the source select. This will always be
	      // zero here. The DAG Combiner may combine an extract_elt index into
	      // these bits. For example (insert (extract, 3), 2) could be matched by
	      // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
	      // Bits [5:4] of the constant are the destination select. This is the
	      // value of the incoming immediate.
	      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
	      // combine either bitwise AND or insert of float 0.0 to set these bits.

	      bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
	      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
	        // If this is an insertion of 32-bits into the low 32-bits of
	        // a vector, we prefer to generate a blend with immediate rather
	        // than an insertps. Blends are simpler operations in hardware and so
	        // will always have equal or better performance than insertps.
	        // But if optimizing for size and there's a load folding opportunity,
	        // generate insertps because blendps does not have a 32-bit memory
	        // operand form.
	        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
	        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
	                           DAG.getTargetConstant(1, dl, MVT::i8));
	      }
	      // Create this as a scalar to vector..
	      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
	      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
	                         DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
	    }

	    // PINSR* works with constant index.
	    if (EltVT == MVT::i32 || EltVT == MVT::i64)
	      return Op;
	  }

	  return SDValue();
	}

	/// Custom-lower a SCALAR_TO_VECTOR node.
	///
	/// A zero scalar becomes a zero vector. Results that are not 128 bits wide
	/// are built as a 128-bit SCALAR_TO_VECTOR inserted at position 0 of an undef
	/// vector of the full type. v4i32 is returned untouched; the remaining
	/// 128-bit integer types go through an any-extended i32 v4i32
	/// SCALAR_TO_VECTOR followed by a bitcast.
	static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc Loc(Op);
	  MVT ResVT = Op.getSimpleValueType();

	  // It's always cheaper to replace a xor+movd with xorps, and it simplifies
	  // further combines.
	  if (X86::isZeroNode(Op.getOperand(0)))
	    return getZeroVector(ResVT, Subtarget, DAG, Loc);

	  if (ResVT.is128BitVector()) {
	    assert(ResVT.is128BitVector() && ResVT.isInteger() && ResVT != MVT::v2i64 &&
	           "Expected an SSE type!");

	    // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
	    if (ResVT == MVT::v4i32)
	      return Op;

	    SDValue Wide =
	        DAG.getNode(ISD::ANY_EXTEND, Loc, MVT::i32, Op.getOperand(0));
	    SDValue AsV4I32 = DAG.getNode(ISD::SCALAR_TO_VECTOR, Loc, MVT::v4i32, Wide);
	    return DAG.getBitcast(ResVT, AsV4I32);
	  }

	  // Wider result: build the low 128-bit piece first, then place it into an
	  // undef vector of the full result type.
	  unsigned NumChunks = ResVT.getSizeInBits() / 128;
	  MVT ChunkVT = MVT::getVectorVT(ResVT.getVectorElementType(),
	                                 ResVT.getVectorNumElements() / NumChunks);
	  SDValue LowChunk =
	      DAG.getNode(ISD::SCALAR_TO_VECTOR, Loc, ChunkVT, Op.getOperand(0));
	  return insert128BitVector(DAG.getUNDEF(ResVT), LowChunk, 0, DAG, Loc);
	}

	// Custom lowering for INSERT_SUBVECTOR nodes. The result may be a simple
	// superregister reference or explicit instructions that insert the upper
	// bits of a vector. Only vXi1 result types are expected here (asserted); the
	// work is forwarded to insert1BitVector.
	static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  MVT ResultVT = Op.getSimpleValueType();
	  (void)ResultVT; // Only read by the assert below.
	  assert(ResultVT.getVectorElementType() == MVT::i1);

	  SDValue Lowered = insert1BitVector(Op, DAG, Subtarget);
	  return Lowered;
	}

	/// Custom-lower an EXTRACT_SUBVECTOR node whose result is a vXi1 mask vector.
	///
	/// Operand 0 is the source mask vector and operand 1 the constant start
	/// index. Index 0 is returned unchanged. Any other index is handled by
	/// widening the source to a type with a native kshift where needed, shifting
	/// right by the index with KSHIFTR, and extracting the subvector at index 0.
	static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
	         "Only vXi1 extract_subvectors need custom lowering");

	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  uint64_t IdxVal = Op.getConstantOperandVal(1);

	  if (IdxVal == 0) // the operation is legal
	    return Op;

	  MVT VecVT = Vec.getSimpleValueType();
	  unsigned NumElems = VecVT.getVectorNumElements();

	  // Extend to natively supported kshift.
	  MVT WideVecVT = VecVT;
	  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
	    WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
	                      DAG.getUNDEF(WideVecVT), Vec,
	                      DAG.getIntPtrConstant(0, dl));
	  }

	  // Shift to the LSB.
	  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
	                    DAG.getTargetConstant(IdxVal, dl, MVT::i8));

	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
	                     DAG.getIntPtrConstant(0, dl));
	}

	// Returns the appropriate wrapper opcode for a global reference.
	//
	// \p GV may be null (callers in this file invoke it with no arguments).
	// \p OpFlags are the operand target flags of the reference. The result is
	// either X86ISD::Wrapper or X86ISD::WrapperRIP.
	unsigned X86TargetLowering::getGlobalWrapperKind(
	    const GlobalValue *GV, const unsigned char OpFlags) const {
	  // References to absolute symbols are never PC-relative.
	  if (GV && GV->isAbsoluteSymbolRef())
	    return X86ISD::Wrapper;

	  // RIP-relative PIC style with the small or kernel code model uses the
	  // RIP-relative wrapper.
	  CodeModel::Model M = getTargetMachine().getCodeModel();
	  if (Subtarget.isPICStyleRIPRel() &&
	      (M == CodeModel::Small || M == CodeModel::Kernel))
	    return X86ISD::WrapperRIP;

	  // GOTPCREL references must always use RIP.
	  if (OpFlags == X86II::MO_GOTPCREL)
	    return X86ISD::WrapperRIP;

	  return X86ISD::Wrapper;
	}

	// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol nodes are each
	// lowered to their target counterpart wrapped in an X86ISD::Wrapper node.
	// Let N be one of those nodes: the wrapping is needed because Select(N)
	// would otherwise just return N, so the raw TargetGlobalAddress (etc.) nodes
	// can only be used to form an addressing mode. The wrapped nodes will be
	// selected into MOV32ri.
	SDValue
	X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
	  ConstantPoolSDNode *PoolNode = cast<ConstantPoolSDNode>(Op);
	  SDLoc Loc(PoolNode);

	  // In PIC mode (unless we're in RIPRel PIC mode) an offset to the global
	  // base register is added below.
	  unsigned char RefFlag = Subtarget.classifyLocalReference(nullptr);

	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue Target =
	      DAG.getTargetConstantPool(PoolNode->getConstVal(), PtrVT,
	                                PoolNode->getAlign(), PoolNode->getOffset(),
	                                RefFlag);
	  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), Loc, PtrVT, Target);

	  if (!RefFlag)
	    return Wrapped;

	  // With PIC, the address is actually $g + Offset.
	  return DAG.getNode(ISD::ADD, Loc, PtrVT,
	                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	                     Wrapped);
	}

	/// Lower a JumpTable node to a wrapped TargetJumpTable, adding the global
	/// base register when the local-reference classification yields a non-zero
	/// operand flag.
	SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
	  JumpTableSDNode *TableNode = cast<JumpTableSDNode>(Op);
	  SDLoc Loc(TableNode);

	  // In PIC mode (unless we're in RIPRel PIC mode) an offset to the global
	  // base register is added below.
	  unsigned char RefFlag = Subtarget.classifyLocalReference(nullptr);

	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue Target =
	      DAG.getTargetJumpTable(TableNode->getIndex(), PtrVT, RefFlag);
	  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), Loc, PtrVT, Target);

	  if (!RefFlag)
	    return Wrapped;

	  // With PIC, the address is actually $g + Offset.
	  return DAG.getNode(ISD::ADD, Loc, PtrVT,
	                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	                     Wrapped);
	}

	/// Lower an ExternalSymbol node by forwarding to LowerGlobalOrExternal as a
	/// non-call reference.
	SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
	}

	/// Lower a BlockAddress node to a wrapped TargetBlockAddress, adding the
	/// global base register when the reference is relative to the PIC base.
	SDValue
	X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  BlockAddressSDNode *BANode = cast<BlockAddressSDNode>(Op);
	  const BlockAddress *Block = BANode->getBlockAddress();
	  int64_t Off = BANode->getOffset();

	  // Build the TargetBlockAddress node and wrap it.
	  unsigned char RefFlags = Subtarget.classifyBlockAddressReference();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue Target = DAG.getTargetBlockAddress(Block, PtrVT, Off, RefFlags);
	  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), Loc, PtrVT, Target);

	  if (!isGlobalRelativeToPICBase(RefFlags))
	    return Wrapped;

	  // With PIC, the address is actually $g + Offset.
	  return DAG.getNode(ISD::ADD, Loc, PtrVT,
	                     DAG.getNode(X86ISD::GlobalBaseReg, Loc, PtrVT), Wrapped);
	}

	/// Creates target global address or external symbol nodes for calls or
	/// other uses.
	///
	/// \p Op must be a GlobalAddressSDNode or an ExternalSymbolSDNode.
	/// \p ForCall selects the function-reference classification and allows the
	/// unwrapped target node to be returned for direct calls.
	/// The returned value is the target node, optionally wrapped, offset by the
	/// PIC base, loaded from a stub, and adjusted by any offset that was not
	/// folded into the reference.
	SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
	bool ForCall) const {
	// Unpack the global address or external symbol.
	const SDLoc &dl = SDLoc(Op);
	const GlobalValue *GV = nullptr;
	int64_t Offset = 0;
	const char *ExternalSym = nullptr;
	if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
	GV = G->getGlobal();
	Offset = G->getOffset();
	} else {
	const auto *ES = cast<ExternalSymbolSDNode>(Op);
	ExternalSym = ES->getSymbol();
	}

	// Calculate some flags for address lowering.
	// GV is null here when Op is an external symbol.
	const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
	unsigned char OpFlags;
	if (ForCall)
	OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
	else
	OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
	bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
	bool NeedsLoad = isGlobalStubReference(OpFlags);

	CodeModel::Model M = DAG.getTarget().getCodeModel();
	auto PtrVT = getPointerTy(DAG.getDataLayout());
	SDValue Result;

	if (GV) {
	// Create a target global address if this is a global. If possible, fold the
	// offset into the global address reference. Otherwise, ADD it on later.
	int64_t GlobalOffset = 0;
	if (OpFlags == X86II::MO_NO_FLAG &&
	X86::isOffsetSuitableForCodeModel(Offset, M)) {
	// After the swap GlobalOffset holds the folded offset and Offset is 0, so
	// the explicit ADD at the end of the function is skipped.
	std::swap(GlobalOffset, Offset);
	}
	Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
	} else {
	// If this is not a global address, this must be an external symbol.
	Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
	}

	// If this is a direct call, avoid the wrapper if we don't need to do any
	// loads or adds. This allows SDAG ISel to match direct calls.
	if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
	return Result;

	Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);

	// With PIC, the address is actually $g + Offset.
	if (HasPICReg) {
	Result = DAG.getNode(ISD::ADD, dl, PtrVT,
	DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
	}

	// For globals that require a load from a stub to get the address, emit the
	// load.
	if (NeedsLoad)
	Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
	MachinePointerInfo::getGOT(DAG.getMachineFunction()));

	// If there was a non-zero offset that we didn't fold, create an explicit
	// addition for it.
	if (Offset != 0)
	Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
	DAG.getConstant(Offset, dl, PtrVT));

	return Result;
	}

	/// Lower a GlobalAddress node by forwarding to LowerGlobalOrExternal as a
	/// non-call reference.
	SDValue
	X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
	  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
	}

	/// Emit a TLSADDR (or, when \p LocalDynamic is set, TLSBASEADDR) node for
	/// \p GA and return a copy of \p ReturnReg taken after it.
	///
	/// \p Chain is the incoming chain. \p InFlag, when non-null, points at a glue
	/// value appended to the node's operands. \p OperandFlags are the target
	/// flags placed on the TargetGlobalAddress operand. The function also marks
	/// the frame as adjusting the stack and having calls.
	static SDValue
	GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
	           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
	           unsigned char OperandFlags, bool LocalDynamic = false) {
	  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
	  SDVTList ResultTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDLoc Loc(GA);
	  SDValue Addr = DAG.getTargetGlobalAddress(GA->getGlobal(), Loc,
	                                            GA->getValueType(0),
	                                            GA->getOffset(), OperandFlags);

	  unsigned CallOpc = X86ISD::TLSADDR;
	  if (LocalDynamic)
	    CallOpc = X86ISD::TLSBASEADDR;

	  // Operands are the chain, the address, and the incoming glue if supplied.
	  SmallVector<SDValue, 3> CallOps;
	  CallOps.push_back(Chain);
	  CallOps.push_back(Addr);
	  if (InFlag)
	    CallOps.push_back(*InFlag);
	  Chain = DAG.getNode(CallOpc, Loc, ResultTys, CallOps);

	  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
	  FrameInfo.setAdjustsStack(true);
	  FrameInfo.setHasCalls(true);

	  // Result 1 of the node is its glue output.
	  return DAG.getCopyFromReg(Chain, Loc, ReturnReg, PtrVT, Chain.getValue(1));
	}

	// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit.
	// The global base register is copied into EBX, then GetTLSADDR is invoked
	// with MO_TLSGD and the result is read back from EAX.
	static SDValue
	LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                const EVT PtrVT) {
	  SDLoc Loc(GA); // ? function entry point might be better
	  SDValue CopyToEBX =
	      DAG.getCopyToReg(DAG.getEntryNode(), Loc, X86::EBX,
	                       DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	                       SDValue());
	  SDValue Glue = CopyToEBX.getValue(1);

	  return GetTLSADDR(DAG, CopyToEBX, GA, &Glue, PtrVT, X86::EAX,
	                    X86II::MO_TLSGD);
	}

	// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit.
	// GetTLSADDR is invoked from the entry node with no incoming glue, with
	// MO_TLSGD, and the result is read back from RAX.
	static SDValue
	LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                const EVT PtrVT) {
	  SDValue EntryChain = DAG.getEntryNode();
	  SDValue *NoGlue = nullptr;
	  return GetTLSADDR(DAG, EntryChain, GA, NoGlue, PtrVT, X86::RAX,
	                    X86II::MO_TLSGD);
	}

	/// Lower ISD::GlobalTLSAddress using the "local dynamic" model.
	///
	/// Obtains the start address of the module's TLS block through GetTLSADDR
	/// with LocalDynamic set (via RAX with MO_TLSLD on 64-bit; via EBX/EAX with
	/// MO_TLSLDM on 32-bit), then adds the variable's x@dtpoff offset to it. Also
	/// bumps the function's local-dynamic TLS access counter.
	static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
	                                           SelectionDAG &DAG,
	                                           const EVT PtrVT,
	                                           bool is64Bit) {
	  SDLoc dl(GA);

	  // Get the start address of the TLS block for this module.
	  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
	      .getInfo<X86MachineFunctionInfo>();
	  MFI->incNumLocalDynamicTLSAccesses();

	  SDValue Base;
	  if (is64Bit) {
	    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
	                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
	  } else {
	    // 32-bit: the global base register is passed in EBX, glued to the call.
	    SDValue InFlag;
	    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
	        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
	    InFlag = Chain.getValue(1);
	    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
	                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
	  }

	  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
	  // of Base.

	  // Build x@dtpoff.
	  unsigned char OperandFlags = X86II::MO_DTPOFF;
	  unsigned WrapperKind = X86ISD::Wrapper;
	  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
	                                           GA->getValueType(0),
	                                           GA->getOffset(), OperandFlags);
	  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

	  // Add x@dtpoff with the base.
	  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
	}

	// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
	//
	// Loads the thread pointer from address 0 in address space 257 (64-bit) or
	// 256 (32-bit), builds the wrapped TLS offset of GA with the operand flags
	// chosen for the model, loads that offset from the GOT for initial exec, and
	// returns thread pointer + offset. Any other model is unreachable.
	static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                   const EVT PtrVT, TLSModel::Model model,
	                                   bool is64Bit, bool isPIC) {
	  SDLoc dl(GA);

	  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
	  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
	                                                         is64Bit ? 257 : 256));

	  SDValue ThreadPointer =
	      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
	                  MachinePointerInfo(Ptr));

	  unsigned char OperandFlags = 0;
	  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
	  // initialexec.
	  unsigned WrapperKind = X86ISD::Wrapper;
	  if (model == TLSModel::LocalExec) {
	    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
	  } else if (model == TLSModel::InitialExec) {
	    if (is64Bit) {
	      OperandFlags = X86II::MO_GOTTPOFF;
	      WrapperKind = X86ISD::WrapperRIP;
	    } else {
	      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
	    }
	  } else {
	    llvm_unreachable("Unexpected model");
	  }

	  // emit "addl x@ntpoff,%eax" (local exec)
	  // or "addl x@indntpoff,%eax" (initial exec)
	  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
	  SDValue TGA =
	      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
	                                 GA->getOffset(), OperandFlags);
	  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

	  if (model == TLSModel::InitialExec) {
	    // 32-bit PIC addresses the GOT slot relative to the global base register.
	    if (isPIC && !is64Bit) {
	      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
	                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	                           Offset);
	    }

	    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
	                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	  }

	  // The address of the thread local variable is the add of the thread
	  // pointer with the offset of the variable.
	  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
	}

	/// Lower a GlobalTLSAddress node to the address of the thread-local variable.
	///
	/// Emulated TLS is delegated to LowerToTLSEmulatedModel. Otherwise the
	/// lowering is chosen by target: ELF dispatches on the TLS model, Darwin
	/// emits a TLSCALL sequence, and Windows uses the implicit TLS array lookup.
	/// Any other target is unreachable.
	SDValue
	X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

	GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

	if (DAG.getTarget().useEmulatedTLS())
	return LowerToTLSEmulatedModel(GA, DAG);

	const GlobalValue *GV = GA->getGlobal();
	auto PtrVT = getPointerTy(DAG.getDataLayout());
	bool PositionIndependent = isPositionIndependent();

	if (Subtarget.isTargetELF()) {
	// ELF: pick the helper matching the TLS model reported for this global.
	TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
	switch (model) {
	case TLSModel::GeneralDynamic:
	if (Subtarget.is64Bit())
	return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
	return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
	case TLSModel::LocalDynamic:
	return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
	Subtarget.is64Bit());
	case TLSModel::InitialExec:
	case TLSModel::LocalExec:
	return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
	PositionIndependent);
	}
	llvm_unreachable("Unknown TLS model.");
	}

	if (Subtarget.isTargetDarwin()) {
	// Darwin only has one model of TLS. Lower to that.
	unsigned char OpFlag = 0;
	unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
	X86ISD::WrapperRIP : X86ISD::Wrapper;

	// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
	// global base reg.
	bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
	if (PIC32)
	OpFlag = X86II::MO_TLVP_PIC_BASE;
	else
	OpFlag = X86II::MO_TLVP;
	SDLoc DL(Op);
	SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
	GA->getValueType(0),
	GA->getOffset(), OpFlag);
	SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

	// With PIC32, the address is actually $g + Offset.
	if (PIC32)
	Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
	DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	Offset);

	// Lowering the machine isd will make sure everything is in the right
	// location.
	// The TLSCALL node is bracketed by a zero-sized CALLSEQ_START/CALLSEQ_END
	// pair.
	SDValue Chain = DAG.getEntryNode();
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
	SDValue Args[] = { Chain, Offset };
	Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
	Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
	DAG.getIntPtrConstant(0, DL, true),
	Chain.getValue(1), DL);

	// TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
	MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	MFI.setAdjustsStack(true);

	// And our return value (tls address) is in the standard call return value
	// location.
	unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
	}

	if (Subtarget.isOSWindows()) {
	// Just use the implicit TLS architecture
	// Need to generate something similar to:
	// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
	// ; from TEB
	// mov ecx, dword [rel _tls_index]: Load index (from C runtime)
	// mov rcx, qword [rdx+rcx*8]
	// mov eax, .tls$:tlsvar
	// [rax+rcx] contains the address
	// Windows 64bit: gs:0x58
	// Windows 32bit: fs:__tls_array

	SDLoc dl(GA);
	SDValue Chain = DAG.getEntryNode();

	// Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
	// %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
	// use its literal value of 0x2C.
	// The null pointer built here only carries the address space (256 for
	// 64-bit, 257 for 32-bit) used for the load's MachinePointerInfo.
	Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
	? Type::getInt8PtrTy(*DAG.getContext(),
	256)
	: Type::getInt32PtrTy(*DAG.getContext(),
	257));

	SDValue TlsArray = Subtarget.is64Bit()
	? DAG.getIntPtrConstant(0x58, dl)
	: (Subtarget.isTargetWindowsGNU()
	? DAG.getIntPtrConstant(0x2C, dl)
	: DAG.getExternalSymbol("_tls_array", PtrVT));

	SDValue ThreadPointer =
	DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

	SDValue res;
	if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
	// Local exec: use the first TLS array slot, no _tls_index lookup.
	res = ThreadPointer;
	} else {
	// Load the _tls_index variable
	SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
	if (Subtarget.is64Bit())
	IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
	MachinePointerInfo(), MVT::i32);
	else
	IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

	// Scale the index by the pointer size (as a left shift) to get the byte
	// offset of this module's slot in the TLS array.
	auto &DL = DAG.getDataLayout();
	SDValue Scale =
	DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
	IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

	res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
	}

	// Load the pointer stored in the selected TLS array slot.
	res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

	// Get the offset of start of .tls section
	SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
	GA->getValueType(0),
	GA->getOffset(), X86II::MO_SECREL);
	SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

	// The address of the thread local variable is the add of the thread
	// pointer with the offset of the variable.
	return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
	}

	llvm_unreachable("TLS not implemented for this target.");
	}

	/// Lower SHL_PARTS / SRA_PARTS / SRL_PARTS. These nodes take a value split
	/// into two parts (low, high) plus a shift amount, and produce the two parts
	/// of the shifted value, merged as { Lo, Hi }.
	/// TODO: Can this be moved to general expansion code?
	static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
	  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	  MVT PartVT = Op.getSimpleValueType();
	  unsigned PartBits = PartVT.getSizeInBits();
	  SDLoc Loc(Op);
	  const bool IsSHL = Op.getOpcode() == ISD::SHL_PARTS;
	  const bool IsSRA = Op.getOpcode() == ISD::SRA_PARTS;
	  SDValue LoIn = Op.getOperand(0);
	  SDValue HiIn = Op.getOperand(1);
	  SDValue Amt = Op.getOperand(2);

	  // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and
	  // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away
	  // during isel.
	  SDValue MaskedAmt = DAG.getNode(ISD::AND, Loc, MVT::i8, Amt,
	                                  DAG.getConstant(PartBits - 1, Loc, MVT::i8));

	  // Value used for the part that is entirely shifted out: the sign fill for
	  // SRA, zero otherwise.
	  SDValue Fill;
	  if (IsSRA)
	    Fill = DAG.getNode(ISD::SRA, Loc, PartVT, HiIn,
	                       DAG.getConstant(PartBits - 1, Loc, MVT::i8));
	  else
	    Fill = DAG.getConstant(0, Loc, PartVT);

	  // Funnel shift across both parts, then the plain shift of a single part.
	  SDValue Funnel = DAG.getNode(IsSHL ? ISD::FSHL : ISD::FSHR, Loc, PartVT,
	                               HiIn, LoIn, Amt);
	  unsigned PlainOpc = IsSHL ? ISD::SHL : (IsSRA ? ISD::SRA : ISD::SRL);
	  SDValue Plain =
	      DAG.getNode(PlainOpc, Loc, PartVT, IsSHL ? LoIn : HiIn, MaskedAmt);

	  // If the shift amount is larger or equal than the width of a part we can't
	  // rely on the results of shld/shrd. Insert a test and select the appropriate
	  // values for large shift amounts.
	  SDValue WidthBit = DAG.getNode(ISD::AND, Loc, MVT::i8, Amt,
	                                 DAG.getConstant(PartBits, Loc, MVT::i8));
	  SDValue IsLarge = DAG.getSetCC(Loc, MVT::i8, WidthBit,
	                                 DAG.getConstant(0, Loc, MVT::i8), ISD::SETNE);

	  // For SHL the first select is the high part; for the right shifts it is
	  // the low part. The second select is the other part.
	  SDValue First = DAG.getNode(ISD::SELECT, Loc, PartVT, IsLarge, Plain, Funnel);
	  SDValue Second = DAG.getNode(ISD::SELECT, Loc, PartVT, IsLarge, Fill, Plain);
	  SDValue Lo = IsSHL ? Second : First;
	  SDValue Hi = IsSHL ? First : Second;

	  return DAG.getMergeValues({ Lo, Hi }, Loc);
	}

	/// Custom lowering for ISD::FSHL and ISD::FSHR.
	///
	/// Vector types (VBMI2 is asserted) become X86ISD::VSHLD/VSHRD when the
	/// amount is a constant splat and X86ISD::VSHLDV/VSHRDV otherwise. Scalar i8,
	/// and i16 when SHLD is slow and we are not optimizing for size, are expanded
	/// through a 32-bit shift sequence when the amount is not a constant. The
	/// remaining i16 case is emitted as X86ISD::FSHL/FSHR with the amount masked
	/// to 15.
	///
	/// \returns the lowered value, \p Op itself when the node is kept unchanged,
	/// or an empty SDValue() when this routine does not produce a lowering.
	static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
	         "Unexpected funnel shift opcode!");

	  SDLoc DL(Op);
	  SDValue Op0 = Op.getOperand(0);
	  SDValue Op1 = Op.getOperand(1);
	  SDValue Amt = Op.getOperand(2);

	  bool IsFSHR = Op.getOpcode() == ISD::FSHR;

	  if (VT.isVector()) {
	    assert(Subtarget.hasVBMI2() && "Expected VBMI2");

	    // The right-shift target nodes take their data operands in the opposite
	    // order.
	    if (IsFSHR)
	      std::swap(Op0, Op1);

	    APInt APIntShiftAmt;
	    if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
	      // Reduce the splat amount modulo the element width for the immediate
	      // form.
	      uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
	      return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0,
	                         Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
	    }

	    return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
	                       Op0, Op1, Amt);
	  }
	  assert(
	      (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
	      "Unexpected funnel shift type!");

	  // Expand slow SHLD/SHRD cases if we are not optimizing for size.
	  bool OptForSize = DAG.shouldOptForSize();
	  bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();

	  // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
	  // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
	  if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
	      !isa<ConstantSDNode>(Amt)) {
	    unsigned EltSizeInBits = VT.getScalarSizeInBits();
	    SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
	    SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
	    Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
	    Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
	    Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
	    // Concatenate the two inputs into a single i32: Op0 above Op1.
	    SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
	    Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
	    if (IsFSHR) {
	      Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
	    } else {
	      Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
	      Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
	    }
	    return DAG.getZExtOrTrunc(Res, DL, VT);
	  }

	  if (VT == MVT::i8 || ExpandFunnel)
	    return SDValue();

	  // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
	  if (VT == MVT::i16) {
	    Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
	                      DAG.getConstant(15, DL, Amt.getValueType()));
	    unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
	    return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
	  }

	  return Op;
	}

	// Try to use a packed vector operation to handle i64 on 32-bit targets when
	// AVX512DQ is enabled.
	//
	// Op must be a (STRICT_)SINT_TO_FP or (STRICT_)UINT_TO_FP node. An empty
	// SDValue is returned unless the subtarget has DQI, is not 64-bit, the source
	// is i64 and the result is f32 or f64. For strict nodes the result is a merge
	// of the converted value and the output chain.
	static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
	          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
	          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
	          Op.getOpcode() == ISD::UINT_TO_FP) &&
	         "Unexpected opcode!");
	  bool IsStrict = Op->isStrictFPOpcode();
	  // Strict nodes carry the chain as operand 0, so the source is operand 1.
	  unsigned OpNo = IsStrict ? 1 : 0;
	  SDValue Src = Op.getOperand(OpNo);
	  MVT SrcVT = Src.getSimpleValueType();
	  MVT VT = Op.getSimpleValueType();

	  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
	      (VT != MVT::f32 && VT != MVT::f64))
	    return SDValue();

	  // Pack the i64 into a vector, do the operation and extract.

	  // Using 256-bit to ensure result is 128-bits for f32 case.
	  unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
	  MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
	  MVT VecVT = MVT::getVectorVT(VT, NumElts);

	  SDLoc dl(Op);
	  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
	  if (IsStrict) {
	    SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
	                                 {Op.getOperand(0), InVec});
	    SDValue Chain = CvtVec.getValue(1);
	    SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
	                                DAG.getIntPtrConstant(0, dl));
	    return DAG.getMergeValues({Value, Chain}, dl);
	  }

	  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);

	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Return true if the int-to-FP cast \p Opcode from the vector type \p FromVT
	/// to the vector type \p ToVT is one of the combinations accepted here for
	/// \p Subtarget. Only v4i32 sources are accepted, and any opcode other than
	/// SINT_TO_FP / UINT_TO_FP yields false.
	static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
	                          const X86Subtarget &Subtarget) {
	  switch (Opcode) {
	  case ISD::SINT_TO_FP:
	    // TODO: Handle wider types with AVX/AVX512.
	    if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
	      return false;
	    // CVTDQ2PS or (V)CVTDQ2PD
	    return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);

	  case ISD::UINT_TO_FP:
	    // TODO: Handle wider types and i64 elements.
	    if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
	      return false;
	    // VCVTUDQ2PS or VCVTUDQ2PD
	    return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;

	  default:
	    return false;
	  }
	}

	/// Given a scalar cast operation that is extracted from a vector, try to
	/// vectorize the cast op followed by extraction. This will avoid an expensive
	/// round-trip between XMM and GPR.
	///
	/// \returns the rewritten value, or an empty SDValue when operand 0 of
	/// \p Cast is not an EXTRACT_VECTOR_ELT with a constant index or when
	/// useVectorCast() rejects the 128-bit form of the cast.
	static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  // TODO: This could be enhanced to handle smaller integer types by peeking
	  // through an extend.
	  SDValue Extract = Cast.getOperand(0);
	  MVT DestVT = Cast.getSimpleValueType();
	  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	      !isa<ConstantSDNode>(Extract.getOperand(1)))
	    return SDValue();

	  // See if we have a 128-bit vector cast op for this type of cast.
	  SDValue VecOp = Extract.getOperand(0);
	  MVT FromVT = VecOp.getSimpleValueType();
	  unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
	  MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
	  MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
	  if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
	    return SDValue();

	  // If we are extracting from a non-zero element, first shuffle the source
	  // vector to allow extracting from element zero.
	  SDLoc DL(Cast);
	  if (!isNullConstant(Extract.getOperand(1))) {
	    // Only lane 0 of the shuffle result is defined; the rest stay undef (-1).
	    SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
	    Mask[0] = Extract.getConstantOperandVal(1);
	    VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
	  }
	  // If the source vector is wider than 128-bits, extract the low part. Do not
	  // create an unnecessarily wide vector cast op.
	  if (FromVT != Vec128VT)
	    VecOp = extract128BitVector(VecOp, 0, DAG, DL);

	  // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
	  // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
	  SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
	                     DAG.getIntPtrConstant(0, DL));
	}

	/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
	/// try to vectorize the cast ops. This will avoid an expensive round-trip
	/// between XMM and GPR.
	///
	/// \returns the rewritten value, or an empty SDValue unless the pattern is
	/// a scalar f32/f64 result of FP_TO_SINT to i32 from an f32/f64 source on a
	/// subtarget with SSE2.
	static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
	                                const X86Subtarget &Subtarget) {
	  // TODO: Allow FP_TO_UINT.
	  SDValue CastToInt = CastToFP.getOperand(0);
	  MVT VT = CastToFP.getSimpleValueType();
	  if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
	    return SDValue();

	  MVT IntVT = CastToInt.getSimpleValueType();
	  SDValue X = CastToInt.getOperand(0);
	  MVT SrcVT = X.getSimpleValueType();
	  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
	    return SDValue();

	  // See if we have 128-bit vector cast instructions for this type of cast.
	  // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
	  if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
	      IntVT != MVT::i32)
	    return SDValue();

	  unsigned SrcSize = SrcVT.getSizeInBits();
	  unsigned IntSize = IntVT.getSizeInBits();
	  unsigned VTSize = VT.getSizeInBits();
	  MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
	  MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
	  MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);

	  // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
	  unsigned ToIntOpcode =
	      SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
	  unsigned ToFPOpcode =
	      IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;

	  // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
	  //
	  // We are not defining the high elements (for example, zero them) because
	  // that could nullify any performance advantage that we hoped to gain from
	  // this vector op hack. We do not expect any adverse effects (like denorm
	  // penalties) with cast ops.
	  SDLoc DL(CastToFP);
	  SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
	  SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
	  SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
	  SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
	}

	/// Custom lowering for (STRICT_)SINT_TO_FP / (STRICT_)UINT_TO_FP whose
	/// source is a vector of i64.
	///
	/// With AVX512DQ (VLX is asserted to be absent) the v2i64/v4i64 source is
	/// inserted into a v8i64 vector, converted at 512-bit width, and the low
	/// subvector of the result is extracted.
	///
	/// Without DQI only the unsigned conversion to v4f32 is handled: elements
	/// that are negative when read as signed are halved (keeping the low bit
	/// OR-ed in), every element is converted with a scalar signed conversion,
	/// and the halved elements are doubled again with an FADD.
	///
	/// \returns the lowered value (merged with the chain for strict nodes) or an
	/// empty SDValue when neither path applies.
	static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  SDLoc DL(Op);
	  bool IsStrict = Op->isStrictFPOpcode();
	  MVT VT = Op->getSimpleValueType(0);
	  // Strict nodes carry the chain as operand 0, so the source is operand 1.
	  SDValue Src = Op->getOperand(IsStrict ? 1 : 0);

	  if (Subtarget.hasDQI()) {
	    assert(!Subtarget.hasVLX() && "Unexpected features");

	    assert((Src.getSimpleValueType() == MVT::v2i64 ||
	            Src.getSimpleValueType() == MVT::v4i64) &&
	           "Unsupported custom type");

	    // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
	    assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
	           "Unexpected VT!");
	    MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;

	    // Need to concat with zero vector for strict fp to avoid spurious
	    // exceptions.
	    SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
	                           : DAG.getUNDEF(MVT::v8i64);
	    Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
	                      DAG.getIntPtrConstant(0, DL));
	    SDValue Res, Chain;
	    if (IsStrict) {
	      Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
	                        {Op->getOperand(0), Src});
	      Chain = Res.getValue(1);
	    } else {
	      Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
	    }

	    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
	                      DAG.getIntPtrConstant(0, DL));

	    if (IsStrict)
	      return DAG.getMergeValues({Res, Chain}, DL);
	    return Res;
	  }

	  bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
	                  Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
	  if (VT != MVT::v4f32 || IsSigned)
	    return SDValue();

	  SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
	  SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
	  // (Src >> 1) | (Src & 1): half the value with the dropped bit kept in the
	  // low bit.
	  SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
	                             DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
	                             DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
	  SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
	  SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
	  SmallVector<SDValue, 4> SignCvts(4);
	  SmallVector<SDValue, 4> Chains(4);
	  // Convert each element with a scalar signed conversion.
	  for (int i = 0; i != 4; ++i) {
	    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
	                              DAG.getIntPtrConstant(i, DL));
	    if (IsStrict) {
	      SignCvts[i] =
	          DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
	                      {Op.getOperand(0), Elt});
	      Chains[i] = SignCvts[i].getValue(1);
	    } else {
	      SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
	    }
	  }
	  SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);

	  // Slow = SignCvt + SignCvt, i.e. the doubled value used for the elements
	  // that were halved above.
	  SDValue Slow, Chain;
	  if (IsStrict) {
	    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
	    Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
	                       {Chain, SignCvt, SignCvt});
	    Chain = Slow.getValue(1);
	  } else {
	    Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
	  }

	  IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
	  SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);

	  if (IsStrict)
	    return DAG.getMergeValues({Cvt, Chain}, DL);

	  return Cvt;
	}

	/// Custom lowering for SINT_TO_FP and STRICT_SINT_TO_FP.
	///
	/// The vectorized rewrites (vectorizeExtractedCast, lowerFPToIntToFP) are
	/// tried first, then the vector source cases, then the scalar cases. A scalar
	/// that is not handled by any earlier case is stored to a fresh stack slot
	/// and converted from memory through BuildFILD.
	///
	/// \returns the lowered value (merged with the chain for strict nodes on the
	/// FILD path), \p Op itself when the node is accepted as is, or an empty
	/// SDValue for unhandled vector sources.
	SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  bool IsStrict = Op->isStrictFPOpcode();
	  // Strict nodes carry the chain as operand 0, so the source is operand 1.
	  unsigned OpNo = IsStrict ? 1 : 0;
	  SDValue Src = Op.getOperand(OpNo);
	  SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
	  MVT SrcVT = Src.getSimpleValueType();
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
	    return Extract;

	  if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
	    return R;

	  if (SrcVT.isVector()) {
	    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
	      // Note: Since v2f64 is a legal type. We don't need to zero extend the
	      // source for strict FP.
	      if (IsStrict)
	        return DAG.getNode(
	            X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
	            {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
	                                DAG.getUNDEF(SrcVT))});
	      return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
	                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
	                                     DAG.getUNDEF(SrcVT)));
	    }
	    if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
	      return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);

	    return SDValue();
	  }

	  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
	         "Unknown SINT_TO_FP to lower!");

	  bool UseSSEReg = isScalarFPTypeInSSEReg(VT);

	  // These are really Legal; return the operand so the caller accepts it as
	  // Legal.
	  if (SrcVT == MVT::i32 && UseSSEReg)
	    return Op;
	  if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
	    return Op;

	  if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
	    return V;

	  // SSE doesn't have an i16 conversion so we need to promote.
	  if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
	    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
	    if (IsStrict)
	      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
	                         {Chain, Ext});

	    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
	  }

	  if (VT == MVT::f128)
	    return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));

	  SDValue ValueToStore = Src;
	  if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
	    // Bitcasting to f64 here allows us to do a single 64-bit store from
	    // an SSE register, avoiding the store forwarding penalty that would come
	    // with two 32-bit stores.
	    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);

	  // Spill the integer to a stack slot sized and aligned to its store size,
	  // then convert it from memory.
	  unsigned Size = SrcVT.getStoreSize();
	  Align Alignment(Size);
	  MachineFunction &MF = DAG.getMachineFunction();
	  auto PtrVT = getPointerTy(MF.getDataLayout());
	  int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
	  MachinePointerInfo MPI =
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
	  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
	  Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
	  std::pair<SDValue, SDValue> Tmp =
	      BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);

	  if (IsStrict)
	    return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);

	  return Tmp.first;
	}

	/// Emit an X86ISD::FILD memory node that reads a \p SrcVT integer from
	/// \p Pointer and yields it as \p DstVT.
	///
	/// When \p DstVT is a scalar FP type kept in an SSE register, the FILD is
	/// built with an f80 result, written out as \p DstVT to a new stack slot
	/// with X86ISD::FST, and loaded back from that slot.
	///
	/// \returns the pair {converted value, output chain}.
	std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
	    EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
	    MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
	  const bool ResultInSSEReg = isScalarFPTypeInSSEReg(DstVT);

	  // The FILD result type is f80 when the value has to be moved to an SSE
	  // register afterwards, and DstVT otherwise.
	  SDVTList FILDTys =
	      DAG.getVTList(ResultInSSEReg ? EVT(MVT::f80) : DstVT, MVT::Other);
	  SDValue LoadOps[] = {Chain, Pointer};
	  SDValue Value =
	      DAG.getMemIntrinsicNode(X86ISD::FILD, DL, FILDTys, LoadOps, SrcVT,
	                              PtrInfo, Alignment, MachineMemOperand::MOLoad);
	  SDValue OutChain = Value.getValue(1);

	  if (!ResultInSSEReg)
	    return { Value, OutChain };

	  // Round-trip through a stack slot sized and aligned to DstVT's store size.
	  MachineFunction &MF = DAG.getMachineFunction();
	  unsigned SlotBytes = DstVT.getStoreSize();
	  int SlotFI =
	      MF.getFrameInfo().CreateStackObject(SlotBytes, Align(SlotBytes), false);
	  auto PtrVT = getPointerTy(MF.getDataLayout());
	  SDValue Slot = DAG.getFrameIndex(SlotFI, PtrVT);

	  SDValue StoreOps[] = {OutChain, Value, Slot};
	  MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
	      MachinePointerInfo::getFixedStack(MF, SlotFI),
	      MachineMemOperand::MOStore, SlotBytes, Align(SlotBytes));
	  OutChain = DAG.getMemIntrinsicNode(X86ISD::FST, DL,
	                                     DAG.getVTList(MVT::Other), StoreOps,
	                                     DstVT, StoreMMO);

	  Value = DAG.getLoad(DstVT, DL, OutChain, Slot,
	                      MachinePointerInfo::getFixedStack(MF, SlotFI));
	  OutChain = Value.getValue(1);

	  return { Value, OutChain };
	}

	/// Horizontal vector math instructions may be slower than normal math with
	/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
	/// implementation, and likely shuffle complexity of the alternate sequence.
	///
	/// \returns true when the op has more than one source, when optimizing for
	/// size, or when the subtarget reports fast horizontal ops.
	static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  bool IsOptimizingSize = DAG.shouldOptForSize();
	  bool HasFastHOps = Subtarget.hasFastHorizontalOps();
	  return !IsSingleSource || IsOptimizingSize || HasFastHOps;
	}

	/// 64-bit unsigned integer to double expansion.
	///
	/// Handles both UINT_TO_FP and STRICT_UINT_TO_FP. For strict nodes the
	/// result is a merge of the f64 value and the output chain.
	static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  // This algorithm is not obvious. Here it is what we're trying to output:
	  /*
	     movq       %rax,  %xmm0
	     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
	     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
	     #ifdef __SSE3__
	       haddpd   %xmm0, %xmm0
	     #else
	       pshufd   $0x4e, %xmm0, %xmm1
	       addpd    %xmm1, %xmm0
	     #endif
	  */

	  bool IsStrict = Op->isStrictFPOpcode();
	  // Strict nodes carry the chain as operand 0, so the source is operand 1.
	  unsigned OpNo = IsStrict ? 1 : 0;
	  SDLoc dl(Op);
	  LLVMContext *Context = DAG.getContext();

	  // Build some magic constants.
	  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
	  // Constants are handled by pointer and the context is passed by
	  // reference, as with the ConstantFP::get(*Context, ...) calls below.
	  Constant *C0 = ConstantDataVector::get(*Context, CV0);
	  auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
	  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));

	  SmallVector<Constant*,2> CV1;
	  CV1.push_back(
	    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
	                                      APInt(64, 0x4330000000000000ULL))));
	  CV1.push_back(
	    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
	                                      APInt(64, 0x4530000000000000ULL))));
	  Constant *C1 = ConstantVector::get(CV1);
	  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));

	  // Load the 64-bit value into an XMM register.
	  SDValue XR1 =
	      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo));
	  SDValue CLod0 =
	      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
	                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	                  /* Alignment = */ 16);
	  SDValue Unpck1 =
	      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);

	  SDValue CLod1 =
	      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
	                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	                  /* Alignment = */ 16);
	  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
	  SDValue Sub;
	  SDValue Chain;
	  // TODO: Are there any fast-math-flags to propagate here?
	  if (IsStrict) {
	    Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
	                      {Op.getOperand(0), XR2F, CLod1});
	    Chain = Sub.getValue(1);
	  } else
	    Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
	  SDValue Result;

	  // Add the two halves together, either with a horizontal add or with a
	  // shuffle followed by a vector add.
	  if (!IsStrict && Subtarget.hasSSE3() &&
	      shouldUseHorizontalOp(true, DAG, Subtarget)) {
	    // FIXME: Do we need a STRICT version of FHADD?
	    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
	  } else {
	    SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
	    if (IsStrict) {
	      Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other},
	                           {Chain, Shuffle, Sub});
	      Chain = Result.getValue(1);
	    } else
	      Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
	  }
	  Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
	                       DAG.getIntPtrConstant(0, dl));
	  if (IsStrict)
	    return DAG.getMergeValues({Result, Chain}, dl);

	  return Result;
	}

	/// Expansion of a 32-bit unsigned integer to floating point conversion.
	///
	/// The integer is OR-ed into the bit pattern of the f64 constant
	/// 0x4330000000000000, that constant is subtracted again in f64, and the
	/// difference is extended or rounded to the result type. Strict nodes thread
	/// their chain through the subtraction and the final rounding.
	static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  const bool IsStrict = Op.getNode()->isStrictFPOpcode();
	  SDLoc DL(Op);

	  // FP constant used to bias-correct the final result.
	  SDValue Magic = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL,
	                                    MVT::f64);

	  // Move the 32-bit input (operand 1 for strict nodes, else operand 0) into
	  // an XMM register and zero the remaining lanes.
	  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
	                              Op.getOperand(IsStrict ? 1 : 0));
	  InVec = getShuffleVectorZeroOrUndef(InVec, 0, true, Subtarget, DAG);

	  // OR the integer bits with the bias bits and read lane 0 back as an f64.
	  SDValue InBits = DAG.getBitcast(MVT::v2i64, InVec);
	  SDValue MagicBits = DAG.getBitcast(
	      MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, Magic));
	  SDValue Combined = DAG.getNode(ISD::OR, DL, MVT::v2i64, InBits, MagicBits);
	  SDValue Biased =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
	                  DAG.getBitcast(MVT::v2f64, Combined),
	                  DAG.getIntPtrConstant(0, DL));

	  if (!IsStrict) {
	    // Subtract the bias.
	    // TODO: Are there any fast-math-flags to propagate here?
	    SDValue Diff = DAG.getNode(ISD::FSUB, DL, MVT::f64, Biased, Magic);

	    // Handle final rounding.
	    return DAG.getFPExtendOrRound(Diff, DL, Op.getSimpleValueType());
	  }

	  // Subtract the bias.
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue InChain = Op.getOperand(0);
	  SDValue Diff = DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::f64, MVT::Other},
	                             {InChain, Biased, Magic});

	  // No rounding is needed when the result type is already f64.
	  if (Op.getValueType() == Diff.getValueType())
	    return Diff;

	  // Handle final rounding.
	  std::pair<SDValue, SDValue> Rounded = DAG.getStrictFPExtendOrRound(
	      Diff, Diff.getValue(1), DL, Op.getSimpleValueType());

	  return DAG.getMergeValues({Rounded.first, Rounded.second}, DL);
	}

	/// Custom lowering for (STRICT_)UINT_TO_FP from v2i32 to v2f64.
	///
	/// With AVX512 the source is padded to v4i32 and converted with either the
	/// X86ISD CVTUI2P nodes (VLX) or a widened strict conversion (no VLX). Without
	/// AVX512 the value is zero-extended, OR-ed with the bits of 2^52 and the
	/// bias is subtracted again in floating point.
	///
	/// \returns the lowered value, or an empty SDValue when the result type is
	/// not v2f64 or when a non-strict node is left to generic widening.
	static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget,
	                                     const SDLoc &DL) {
	  if (Op.getSimpleValueType() != MVT::v2f64)
	    return SDValue();

	  const bool Strict = Op->isStrictFPOpcode();

	  SDValue In = Op.getOperand(Strict ? 1 : 0);
	  assert(In.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

	  if (Subtarget.hasAVX512()) {
	    if (Subtarget.hasVLX()) {
	      // Legalize to v4i32 type.
	      SDValue Padded = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, In,
	                                   DAG.getUNDEF(MVT::v2i32));
	      if (!Strict)
	        return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, Padded);
	      return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL,
	                         {MVT::v2f64, MVT::Other},
	                         {Op.getOperand(0), Padded});
	    }

	    // Let generic type legalization widen this.
	    if (!Strict)
	      return SDValue();

	    // Otherwise pad the integer input with 0s and widen the operation.
	    SDValue Padded = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, In,
	                                 DAG.getConstant(0, DL, MVT::v2i32));
	    SDValue Wide = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
	                               {Op.getOperand(0), Padded});
	    SDValue OutChain = Wide.getValue(1);
	    SDValue LowHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Wide,
	                                  DAG.getIntPtrConstant(0, DL));
	    return DAG.getMergeValues({LowHalf, OutChain}, DL);
	  }

	  // Zero extend to 2i64, OR with the floating point representation of 2^52.
	  // This gives us the floating point equivalent of 2^52 + the i32 integer
	  // since double has 52-bits of mantissa. Then subtract 2^52 in floating
	  // point leaving just our i32 integers in double format.
	  SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, In);
	  SDValue BiasVec =
	      DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
	  SDValue Biased = DAG.getNode(ISD::OR, DL, MVT::v2i64, Widened,
	                               DAG.getBitcast(MVT::v2i64, BiasVec));
	  Biased = DAG.getBitcast(MVT::v2f64, Biased);

	  if (!Strict)
	    return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Biased, BiasVec);
	  return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
	                     {Op.getOperand(0), Biased, BiasVec});
	}

	/// Custom lowering for (STRICT_)UINT_TO_FP from v4i32 / v8i32.
	///
	/// With AVX512 (VLX is asserted to be absent) the operation is widened to a
	/// 512-bit conversion and the low subvector is extracted; v8f64 results are
	/// returned unchanged. With AVX, v4i32 -> v4f64 is done by OR-ing the
	/// zero-extended input with a broadcast bias constant and subtracting the
	/// bias. Otherwise the conversion to v4f32/v8f32 is built from the low and
	/// high 16-bit halves as described in the algorithm comment below.
	///
	/// \returns the lowered value, \p Op itself for the legal v8f64 case, or an
	/// empty SDValue when the result type is not handled.
	static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  SDLoc DL(Op);
	  bool IsStrict = Op->isStrictFPOpcode();
	  // Strict nodes carry the chain as operand 0, so the source is operand 1.
	  SDValue V = Op->getOperand(IsStrict ? 1 : 0);
	  MVT VecIntVT = V.getSimpleValueType();
	  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
	         "Unsupported custom type");

	  if (Subtarget.hasAVX512()) {
	    // With AVX512, but not VLX we need to widen to get a 512-bit result type.
	    assert(!Subtarget.hasVLX() && "Unexpected features");
	    MVT VT = Op->getSimpleValueType(0);

	    // v8i32->v8f64 is legal with AVX512 so just return it.
	    if (VT == MVT::v8f64)
	      return Op;

	    assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
	           "Unexpected VT!");
	    MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
	    MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
	    // Need to concat with zero vector for strict fp to avoid spurious
	    // exceptions.
	    SDValue Tmp =
	        IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
	    V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
	                    DAG.getIntPtrConstant(0, DL));
	    SDValue Res, Chain;
	    if (IsStrict) {
	      Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
	                        {Op->getOperand(0), V});
	      Chain = Res.getValue(1);
	    } else {
	      Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
	    }

	    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
	                      DAG.getIntPtrConstant(0, DL));

	    if (IsStrict)
	      return DAG.getMergeValues({Res, Chain}, DL);
	    return Res;
	  }

	  if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
	      Op->getSimpleValueType(0) == MVT::v4f64) {
	    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
	    Constant *Bias = ConstantFP::get(
	        *DAG.getContext(),
	        APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
	    auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
	    SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
	    SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
	    SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
	    // Broadcast the scalar bias from the constant pool to all four lanes.
	    SDValue VBias = DAG.getMemIntrinsicNode(
	        X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
	        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
	        MachineMemOperand::MOLoad);

	    SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
	                             DAG.getBitcast(MVT::v4i64, VBias));
	    Or = DAG.getBitcast(MVT::v4f64, Or);

	    if (IsStrict)
	      return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
	                         {Op.getOperand(0), Or, VBias});
	    return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
	  }

	  // The algorithm is the following:
	  // #ifdef __SSE4_1__
	  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
	  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
	  //                                 (uint4) 0x53000000, 0xaa);
	  // #else
	  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
	  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
	  // #endif
	  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
	  //     return (float4) lo + fhi;

	  bool Is128 = VecIntVT == MVT::v4i32;
	  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
	  // If we convert to something else than the supported type, e.g., to v4f64,
	  // abort early.
	  if (VecFloatVT != Op->getSimpleValueType(0))
	    return SDValue();

	  // In the #ifdef/#else code, we have in common:
	  // - The vector of constants:
	  // -- 0x4b000000
	  // -- 0x53000000
	  // - A shift:
	  // -- v >> 16

	  // Create the splat vector for 0x4b000000.
	  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
	  // Create the splat vector for 0x53000000.
	  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

	  // Create the right shift.
	  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
	  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

	  SDValue Low, High;
	  if (Subtarget.hasSSE41()) {
	    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
	    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
	    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
	    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
	    // Low will be bitcasted right away, so do not bother bitcasting back to its
	    // original type.
	    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
	                      VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
	    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
	    //                                 (uint4) 0x53000000, 0xaa);
	    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
	    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
	    // High will be bitcasted right away, so do not bother bitcasting back to
	    // its original type.
	    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
	                       VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
	  } else {
	    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
	    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
	    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
	    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

	    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
	    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
	  }

	  // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
	  SDValue VecCstFSub = DAG.getConstantFP(
	      APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);

	  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
	  // NOTE: By using fsub of a positive constant instead of fadd of a negative
	  // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
	  // enabled. See PR24512.
	  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
	  // TODO: Are there any fast-math-flags to propagate here?
	  //     (float4) lo;
	  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
	  //     return (float4) lo + fhi;
	  if (IsStrict) {
	    SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
	                                {Op.getOperand(0), HighBitcast, VecCstFSub});
	    return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
	                       {FHigh.getValue(1), LowBitcast, FHigh});
	  }

	  SDValue FHigh =
	      DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
	  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
	}

	/// Dispatch a vector (STRICT_)UINT_TO_FP to the helper that matches the type
	/// of its integer source. Any source type other than v2i32, v4i32, v8i32,
	/// v2i64 or v4i64 is unreachable.
	static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  // Strict nodes carry the chain as operand 0, so the source is operand 1.
	  const bool IsStrict = Op.getNode()->isStrictFPOpcode();
	  MVT InVT = Op.getOperand(IsStrict ? 1 : 0).getSimpleValueType();

	  if (InVT == MVT::v2i32)
	    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, SDLoc(Op));

	  if (InVT == MVT::v4i32 || InVT == MVT::v8i32)
	    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);

	  if (InVT == MVT::v2i64 || InVT == MVT::v4i64)
	    return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);

	  llvm_unreachable("Custom UINT_TO_FP is not supported!");
	}

	/// Custom lowering for UINT_TO_FP and STRICT_UINT_TO_FP.
	///
	/// For strict nodes operand 0 is the chain and operand 1 is the integer
	/// source; otherwise the source is operand 0. f128 results become a libcall
	/// and vector results are handed to lowerUINT_TO_FP_vec. With AVX-512 the
	/// scalar i32 (and, in 64-bit mode, i64) conversions to an SSE register type
	/// are returned unchanged. The i64 -> f32 case on 64-bit targets returns an
	/// empty SDValue. Whatever is left is lowered through a 64-bit stack slot
	/// and an x87 FILD.
	SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  bool IsStrict = Op->isStrictFPOpcode();
	  unsigned OpNo = IsStrict ? 1 : 0;
	  SDValue Src = Op.getOperand(OpNo);
	  SDLoc dl(Op);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  MVT SrcVT = Src.getSimpleValueType();
	  MVT DstVT = Op->getSimpleValueType(0);
	  SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

	  if (DstVT == MVT::f128)
	    return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));

	  if (DstVT.isVector())
	    return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);

	  if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
	    return Extract;

	  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
	      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
	    // Conversions from unsigned i32 to f32/f64 are legal,
	    // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
	    return Op;
	  }

	  // Promote i32 to i64 and use a signed conversion on 64-bit targets.
	  if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
	    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
	    if (IsStrict)
	      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
	                         {Chain, Src});
	    return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
	  }

	  if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
	    return V;

	  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
	    return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
	  if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
	    return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
	  if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
	    return SDValue();

	  // Make a 64-bit buffer, and use it to build an FILD.
	  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
	  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
	  MachinePointerInfo MPI =
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
	  if (SrcVT == MVT::i32) {
	    // Store the i32 in the low half of the slot and zero in the high half,
	    // so that the 64-bit signed FILD sees a non-negative value.
	    SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
	    SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, Align(8));
	    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
	                                  OffsetSlot, MPI.getWithOffset(4), Align(4));
	    std::pair<SDValue, SDValue> Tmp =
	        BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, Align(8), DAG);
	    if (IsStrict)
	      return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);

	    return Tmp.first;
	  }

	  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
	  SDValue ValueToStore = Src;
	  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
	    // Bitcasting to f64 here allows us to do a single 64-bit store from
	    // an SSE register, avoiding the store forwarding penalty that would come
	    // with two 32-bit stores.
	    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
	  }
	  SDValue Store =
	      DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Align(8));
	  // For i64 source, we need to add the appropriate power of 2 if the input
	  // was negative. This is the same as the optimization in
	  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
	  // we must be careful to do the computation in x87 extended precision, not
	  // in SSE. (The generic code can't know it's OK to do this, or how to.)
	  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
	  SDValue Ops[] = { Store, StackSlot };
	  SDValue Fild =
	      DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
	                              Align(8), MachineMemOperand::MOLoad);
	  Chain = Fild.getValue(1);

	  // Check whether the sign bit is set.
	  SDValue SignSet = DAG.getSetCC(
	      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
	      Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);

	  // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
	  // The high 32 bits, 0x5F800000, are the f32 encoding of 2^64.
	  APInt FF(64, 0x5F80000000000000ULL);
	  SDValue FudgePtr = DAG.getConstantPool(
	      ConstantInt::get(*DAG.getContext(), FF), PtrVT);
	  Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();

	  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
	  SDValue Zero = DAG.getIntPtrConstant(0, dl);
	  SDValue Four = DAG.getIntPtrConstant(4, dl);
	  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
	  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);

	  // Load the value out, extending it from f32 to f80.
	  SDValue Fudge = DAG.getExtLoad(
	      ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
	      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
	      CPAlignment);
	  Chain = Fudge.getValue(1);
	  // Extend everything to 80 bits to force it to be done on x87.
	  // TODO: Are there any fast-math-flags to propagate here?
	  if (IsStrict) {
	    SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
	                              {Chain, Fild, Fudge});
	    // STRICT_FP_ROUND can't handle equal types.
	    if (DstVT == MVT::f80)
	      return Add;
	    return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
	                       {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
	  }
	  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
	  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
	                     DAG.getIntPtrConstant(0, dl));
	}

	// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
	// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
	// just return an SDValue().
	// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
	// to i16, i32 or i64, and we lower it to a legal sequence and return the
	// result.
	//
	// The lowering stores through a stack temporary: the value is converted with
	// an X86ISD::FP_TO_INT_IN_MEM node and the integer is loaded back. \p Chain
	// is an output: on return it holds the chain result of that final load.
	SDValue
	X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
	                                   bool IsSigned, SDValue &Chain) const {
	  bool IsStrict = Op->isStrictFPOpcode();
	  SDLoc DL(Op);

	  EVT DstTy = Op.getValueType();
	  SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
	  EVT TheVT = Value.getValueType();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
	    // f16 must be promoted before using the lowering in this routine.
	    // fp128 does not use this lowering.
	    return SDValue();
	  }

	  // If using FIST to compute an unsigned i64, we'll need some fixup
	  // to handle values above the maximum signed i64. A FIST is always
	  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
	  bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;

	  // FIXME: This does not generate an invalid exception if the input does not
	  // fit in i32. PR44019
	  if (!IsSigned && DstTy != MVT::i64) {
	    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
	    // The low 32 bits of the fist result will have the correct uint32 result.
	    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
	    DstTy = MVT::i64;
	  }

	  assert(DstTy.getSimpleVT() <= MVT::i64 &&
	         DstTy.getSimpleVT() >= MVT::i16 &&
	         "Unknown FP_TO_INT to lower!");

	  // We lower FP->int64 into FISTP64 followed by a load from a temporary
	  // stack slot.
	  MachineFunction &MF = DAG.getMachineFunction();
	  unsigned MemSize = DstTy.getStoreSize();
	  int SSFI =
	      MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
	  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

	  Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

	  // 0 or the i64 sign mask (0x8000000000000000), XOR'ed into the result to
	  // fix up the sign bit. Only set when UnsignedFixup is true.
	  SDValue Adjust;

	  if (UnsignedFixup) {
	    //
	    // Conversion to unsigned i64 is implemented with a select,
	    // depending on whether the source value fits in the range
	    // of a signed i64. Let Thresh be the FP equivalent of
	    // 0x8000000000000000ULL.
	    //
	    //  Adjust  = (Value < Thresh) ? 0   : 0x8000000000000000;
	    //  FltOfs  = (Value < Thresh) ? 0.0 : Thresh;
	    //  FistSrc = (Value - FltOfs);
	    //  Fist-to-mem64 FistSrc
	    //  XOR the 64-bit result with Adjust, which is equivalent to adding
	    //  0 or 0x800...0ULL to it.
	    //
	    // Being a power of 2, Thresh is exactly representable in all FP formats.
	    // For X87 we'd like to use the smallest FP type for this constant, but
	    // for DAG type consistency we have to match the FP operand type.

	    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
	    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
	    bool LosesInfo = false;
	    // The rounding mode is irrelevant as the conversion should be exact.
	    if (TheVT == MVT::f64)
	      Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
	                              &LosesInfo);
	    else if (TheVT == MVT::f80)
	      Status = Thresh.convert(APFloat::x87DoubleExtended(),
	                              APFloat::rmNearestTiesToEven, &LosesInfo);

	    assert(Status == APFloat::opOK && !LosesInfo &&
	           "FP conversion should have been exact");

	    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

	    EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
	                                   *DAG.getContext(), TheVT);
	    SDValue Cmp;
	    if (IsStrict) {
	      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT,
	                         Chain, /*IsSignaling*/ true);
	      Chain = Cmp.getValue(1);
	    } else {
	      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT);
	    }

	    Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
	                           DAG.getConstant(0, DL, MVT::i64),
	                           DAG.getConstant(APInt::getSignMask(64),
	                                           DL, MVT::i64));
	    SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp,
	                                   DAG.getConstantFP(0.0, DL, TheVT),
	                                   ThreshVal);

	    if (IsStrict) {
	      Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
	                          { Chain, Value, FltOfs });
	      Chain = Value.getValue(1);
	    } else
	      Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
	  }

	  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);

	  // FIXME This causes a redundant load/store if the SSE-class value is already
	  // in memory, such as if it is on the callstack.
	  if (isScalarFPTypeInSSEReg(TheVT)) {
	    // Spill the SSE value to the stack slot and reload it with an x87 FLD so
	    // that the FIST below operates on an f80 value.
	    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
	    Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
	    SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
	    SDValue Ops[] = { Chain, StackSlot };

	    unsigned FLDSize = TheVT.getStoreSize();
	    assert(FLDSize <= MemSize && "Stack slot not big enough");
	    MachineMemOperand *MMO = MF.getMachineMemOperand(
	        MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
	    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
	    Chain = Value.getValue(1);
	  }

	  // Build the FP_TO_INT*_IN_MEM
	  MachineMemOperand *MMO = MF.getMachineMemOperand(
	      MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
	  SDValue Ops[] = { Chain, Value, StackSlot };
	  SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
	                                         DAG.getVTList(MVT::Other),
	                                         Ops, DstTy, MMO);

	  // Load back using the node's own result type, which may be narrower than
	  // the DstTy that was stored (the fp-to-uint32 case above).
	  SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
	  Chain = Res.getValue(1);

	  // If we need an unsigned fixup, XOR the result with adjust.
	  if (UnsignedFixup)
	    Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);

	  return Res;
	}

	/// Lower a vector ANY_EXTEND / ZERO_EXTEND with integer elements.
	///
	/// v32i8 -> v32i16 without BWI is split into halves. With Int256 (AVX2) the
	/// node is returned unchanged. Otherwise the result is built from an
	/// extend-in-register of the low half and an unpack-high of the high half,
	/// concatenated together.
	static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
	                              const X86Subtarget &Subtarget) {
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  SDLoc dl(Op);
	  unsigned Opc = Op.getOpcode();

	  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
	  assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
	         "Unexpected extension opcode");
	  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
	         "Expected same number of elements");
	  assert((VT.getVectorElementType() == MVT::i16 ||
	          VT.getVectorElementType() == MVT::i32 ||
	          VT.getVectorElementType() == MVT::i64) &&
	         "Unexpected element type");
	  assert((InVT.getVectorElementType() == MVT::i8 ||
	          InVT.getVectorElementType() == MVT::i16 ||
	          InVT.getVectorElementType() == MVT::i32) &&
	         "Unexpected element type");

	  unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);

	  if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
	    assert(InVT == MVT::v32i8 && "Unexpected VT!");
	    return splitVectorIntUnary(Op, DAG);
	  }

	  if (Subtarget.hasInt256())
	    return Op;

	  // Optimize vectors in AVX mode:
	  //
	  //   v8i16 -> v8i32
	  //   Use vpmovzwd for 4 lower elements  v8i16 -> v4i32.
	  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
	  //   Concat upper and lower parts.
	  //
	  //   v4i32 -> v4i64
	  //   Use vpmovzdq for 2 lower elements  v4i32 -> v2i64.
	  //   Use vpunpckhdq for 2 upper elements  v4i32 -> v2i64.
	  //   Concat upper and lower parts.
	  //
	  MVT HalfVT = VT.getHalfNumVectorElementsVT();
	  SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);

	  // Short-circuit if we can determine that each 128-bit half is the same value.
	  // Otherwise, this is difficult to match and optimize.
	  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
	    if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
	      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);

	  // Interleave the high elements with zero (zero-extend) or undef
	  // (any-extend), then reinterpret the pairs as the wider element type.
	  SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
	  SDValue Undef = DAG.getUNDEF(InVT);
	  bool NeedZero = Opc == ISD::ZERO_EXTEND;
	  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
	  OpHi = DAG.getBitcast(HalfVT, OpHi);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
	}

	// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
	// Each v8i1 half is extended to v8i16 with ExtOpc, the halves are
	// concatenated into v16i16, and the result is truncated to VT.
	static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
	                                   const SDLoc &dl, SelectionDAG &DAG) {
	  assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
	  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
	                           DAG.getIntPtrConstant(0, dl));
	  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
	                           DAG.getIntPtrConstant(8, dl));
	  Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
	  Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
	  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
	  return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	}

	/// Lower a ZERO_EXTEND whose source is a vXi1 mask vector.
	///
	/// Non-byte results use sign_extend followed by a logical right shift. Byte
	/// results select between splat 1 and splat 0, going through a wider element
	/// type and/or a 512-bit vector when BWI or VLX is missing.
	static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
	                                     const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  const MVT ResVT = Op->getSimpleValueType(0);
	  SDValue Mask = Op->getOperand(0);
	  MVT MaskVT = Mask.getSimpleValueType();
	  assert(MaskVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
	  unsigned EltCount = ResVT.getVectorNumElements();

	  // Everything except vXi8 is a sign_extend plus a shift that moves the
	  // sign bit down to bit 0, which avoids a constant pool load.
	  if (ResVT.getVectorElementType() != MVT::i8) {
	    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ResVT, Mask);
	    SDValue ShAmt =
	        DAG.getConstant(ResVT.getScalarSizeInBits() - 1, DL, ResVT);
	    return DAG.getNode(ISD::SRL, DL, ResVT, SExt, ShAmt);
	  }

	  // Without BWI the select is done on i32 elements instead of i8.
	  const bool HasBWI = Subtarget.hasBWI();
	  // A 16-element result would need v16i32; if that is to be avoided, split
	  // the mask and concatenate instead.
	  if (!HasBWI && EltCount == 16 && !Subtarget.canExtendTo512DQ())
	    return SplitAndExtendv16i1(ISD::ZERO_EXTEND, ResVT, Mask, DL, DAG);
	  const MVT SelEltVT = HasBWI ? ResVT : MVT::getVectorVT(MVT::i32, EltCount);

	  // Without VLX the select has to be performed on a 512-bit vector, so pad
	  // the mask with undef elements.
	  MVT CurVT = SelEltVT;
	  const bool MustWiden = !SelEltVT.is512BitVector() && !Subtarget.hasVLX();
	  if (MustWiden) {
	    EltCount *= 512 / SelEltVT.getSizeInBits();
	    MaskVT = MVT::getVectorVT(MVT::i1, EltCount);
	    Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MaskVT,
	                       DAG.getUNDEF(MaskVT), Mask,
	                       DAG.getIntPtrConstant(0, DL));
	    CurVT = MVT::getVectorVT(SelEltVT.getVectorElementType(), EltCount);
	  }

	  SDValue Ones = DAG.getConstant(1, DL, CurVT);
	  SDValue Zeros = DAG.getConstant(0, DL, CurVT);
	  SDValue Result = DAG.getSelect(DL, CurVT, Mask, Ones, Zeros);

	  // Narrow the elements back to i8 if they were extended above.
	  if (ResVT != SelEltVT) {
	    CurVT = MVT::getVectorVT(MVT::i8, EltCount);
	    Result = DAG.getNode(ISD::TRUNCATE, DL, CurVT, Result);
	  }

	  // Drop the padding elements if the vector was widened.
	  if (CurVT == ResVT)
	    return Result;
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Result,
	                     DAG.getIntPtrConstant(0, DL));
	}

	/// Lower a vector ZERO_EXTEND: vXi1 mask sources go to
	/// LowerZERO_EXTEND_Mask, every other source goes to LowerAVXExtend (which
	/// is asserted to require AVX).
	static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  const MVT SrcVT = Op.getOperand(0).getSimpleValueType();
	  const bool IsMaskSource = SrcVT.getVectorElementType() == MVT::i1;

	  if (!IsMaskSource) {
	    assert(Subtarget.hasAVX() && "Expected AVX support");
	    return LowerAVXExtend(Op, DAG, Subtarget);
	  }

	  return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
	}

	/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
	/// It makes use of the fact that vectors with enough leading sign/zero bits
	/// prevent the PACKSS/PACKUS from saturating the results.
	/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
	/// within each 128-bit lane.
	///
	/// \p Opcode must be X86ISD::PACKSS or X86ISD::PACKUS. Returns \p In itself
	/// when it already has type \p DstVT, and an empty SDValue when the
	/// truncation is not handled here: no SSE2, a destination size that is not
	/// a multiple of 64 bits, a source size that is not a multiple of 128 bits,
	/// or a non-power-of-2 element count.
	static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
	                                      const SDLoc &DL, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
	         "Unexpected PACK opcode");
	  assert(DstVT.isVector() && "VT not a vector?");

	  // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  EVT SrcVT = In.getValueType();

	  // No truncation required, we might get here due to recursive calls.
	  if (SrcVT == DstVT)
	    return In;

	  // We only support vector truncation to 64bits or greater from a
	  // 128bits or greater source.
	  unsigned DstSizeInBits = DstVT.getSizeInBits();
	  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
	  if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
	    return SDValue();

	  unsigned NumElems = SrcVT.getVectorNumElements();
	  if (!isPowerOf2_32(NumElems))
	    return SDValue();

	  LLVMContext &Ctx = *DAG.getContext();
	  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
	  assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");

	  // Element type after one halving step.
	  EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);

	  // Pack to the largest type possible:
	  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
	  EVT InVT = MVT::i16, OutVT = MVT::i8;
	  if (SrcVT.getScalarSizeInBits() > 16 &&
	      (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
	    InVT = MVT::i32;
	    OutVT = MVT::i16;
	  }

	  // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
	  if (SrcVT.is128BitVector()) {
	    InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
	    OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
	    In = DAG.getBitcast(InVT, In);
	    SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
	    Res = extractSubVector(Res, 0, DAG, DL, 64);
	    return DAG.getBitcast(DstVT, Res);
	  }

	  // Split lower/upper subvectors.
	  SDValue Lo, Hi;
	  std::tie(Lo, Hi) = splitVector(In, DAG, DL);

	  unsigned SubSizeInBits = SrcSizeInBits / 2;
	  InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
	  OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());

	  // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
	  if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
	    Lo = DAG.getBitcast(InVT, Lo);
	    Hi = DAG.getBitcast(InVT, Hi);
	    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
	    return DAG.getBitcast(DstVT, Res);
	  }

	  // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
	  // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
	  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
	    Lo = DAG.getBitcast(InVT, Lo);
	    Hi = DAG.getBitcast(InVT, Hi);
	    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

	    // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
	    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
	    // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
	    SmallVector<int, 64> Mask;
	    int Scale = 64 / OutVT.getScalarSizeInBits();
	    narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
	    Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);

	    if (DstVT.is256BitVector())
	      return DAG.getBitcast(DstVT, Res);

	    // If 512bit -> 128bit truncate another stage.
	    EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
	    Res = DAG.getBitcast(PackedVT, Res);
	    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
	  }

	  // Recursively pack lower/upper subvectors, concat result and pack again.
	  assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
	  EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
	  Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
	  Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);

	  PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
	  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
	  return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
	}

	/// Lower a vector TRUNCATE whose result element type is i1.
	///
	/// The LSB of every source element is moved into the sign position (unless
	/// the element is already known to be all sign bits) and the mask is then
	/// produced by a compare against zero. Sources with i8/i16 elements use the
	/// byte/word form directly under BWI; otherwise they are first sign-extended
	/// to wider elements, or split into two 8-element halves when 512-bit
	/// vectors are to be avoided.
	static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {

	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  MVT InVT = In.getSimpleValueType();

	  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

	  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
	  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
	  if (InVT.getScalarSizeInBits() <= 16) {
	    if (Subtarget.hasBWI()) {
	      // legal, will go to VPMOVB2M, VPMOVW2M
	      if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
	        // We need to shift to get the lsb into sign position.
	        // Shift packed bytes not supported natively, bitcast to word
	        MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
	        In = DAG.getNode(ISD::SHL, DL, ExtVT,
	                         DAG.getBitcast(ExtVT, In),
	                         DAG.getConstant(ShiftInx, DL, ExtVT));
	        In = DAG.getBitcast(InVT, In);
	      }
	      // 0 > In is true exactly for elements whose sign bit is set.
	      return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
	                          In, ISD::SETGT);
	    }
	    // Use TESTD/Q, extended vector to packed dword/qword.
	    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
	           "Unexpected vector type.");
	    unsigned NumElts = InVT.getVectorNumElements();
	    assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
	    // We need to change to a wider element type that we have support for.
	    // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
	    // For 16 element vectors we extend to v16i32 unless we are explicitly
	    // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
	    // we need to split into two 8 element vectors which we can extend to v8i32,
	    // truncate and concat the results. There's an additional complication if
	    // the original type is v16i8. In that case we can't split the v16i8
	    // directly, so we need to shuffle high elements to low and use
	    // sign_extend_vector_inreg.
	    if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
	      SDValue Lo, Hi;
	      if (InVT == MVT::v16i8) {
	        Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
	        Hi = DAG.getVectorShuffle(
	            InVT, DL, In, In,
	            {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
	        Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
	      } else {
	        assert(InVT == MVT::v16i16 && "Unexpected VT!");
	        Lo = extract128BitVector(In, 0, DAG, DL);
	        Hi = extract128BitVector(In, 8, DAG, DL);
	      }
	      // We're split now, just emit two truncates and a concat. The two
	      // truncates will trigger legalization to come back to this function.
	      Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
	      Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
	      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
	    }
	    // We either have 8 elements or we're allowed to use 512-bit vectors.
	    // If we have VLX, we want to use the narrowest vector that can get the
	    // job done so we use vXi32.
	    MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
	    MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
	    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
	    InVT = ExtVT;
	    ShiftInx = InVT.getScalarSizeInBits() - 1;
	  }

	  if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
	    // We need to shift to get the lsb into sign position.
	    In = DAG.getNode(ISD::SHL, DL, InVT, In,
	                     DAG.getConstant(ShiftInx, DL, InVT));
	  }
	  // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
	  if (Subtarget.hasDQI())
	    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
	  return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
	}

	/// Custom lowering for vector ISD::TRUNCATE.
	///
	/// Handles, in order: a few illegal-input-type cases when called by the type
	/// legalizer (everything else there returns an empty SDValue), truncation to
	/// vXi1 masks, AVX-512 truncates that are returned unchanged, PACKUS/PACKSS
	/// based truncation when enough leading zero/sign bits are known, and
	/// finally shuffle-based 256-bit -> 128-bit truncation.
	SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  unsigned InNumEltBits = InVT.getScalarSizeInBits();

	  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
	         "Invalid TRUNCATE operation");

	  // If we're called by the type legalizer, handle a few cases.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isTypeLegal(InVT)) {
	    if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
	        VT.is128BitVector()) {
	      assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
	             "Unexpected subtarget!");
	      // The default behavior is to truncate one step, concatenate, and then
	      // truncate the remainder. We'd rather produce two 64-bit results and
	      // concatenate those.
	      SDValue Lo, Hi;
	      std::tie(Lo, Hi) = DAG.SplitVector(In, DL);

	      EVT LoVT, HiVT;
	      std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);

	      Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
	      Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
	      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
	    }

	    // Otherwise let default legalization handle it.
	    return SDValue();
	  }

	  if (VT.getVectorElementType() == MVT::i1)
	    return LowerTruncateVecI1(Op, DAG, Subtarget);

	  // vpmovqb/w/d, vpmovdb/w, vpmovwb
	  if (Subtarget.hasAVX512()) {
	    if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
	      assert(VT == MVT::v32i8 && "Unexpected VT!");
	      return splitVectorIntUnary(Op, DAG);
	    }

	    // Word to byte is only legal under BWI. Otherwise we have to promote to
	    // v16i32 and then truncate that. But we should only do that if we haven't
	    // been asked to avoid 512-bit vectors. The actual promotion to v16i32
	    // will be handled by isel patterns.
	    if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
	        Subtarget.canExtendTo512DQ())
	      return Op;
	  }

	  unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
	  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;

	  // Truncate with PACKUS if we are truncating a vector with leading zero bits
	  // that extend all the way to the packed/truncated value.
	  // Pre-SSE41 we can only use PACKUSWB.
	  KnownBits Known = DAG.computeKnownBits(In);
	  if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
	    if (SDValue V =
	            truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
	      return V;

	  // Truncate with PACKSS if we are truncating a vector with sign-bits that
	  // extend all the way to the packed/truncated value.
	  if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
	    if (SDValue V =
	            truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
	      return V;

	  // Handle truncation of V256 to V128 using shuffles.
	  assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");

	  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
	    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
	    if (Subtarget.hasInt256()) {
	      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
	      In = DAG.getBitcast(MVT::v8i32, In);
	      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
	                         DAG.getIntPtrConstant(0, DL));
	    }

	    // Without AVX2: take the even i32 lanes (the low half of each i64) from
	    // the two 128-bit halves with a single two-input shuffle.
	    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	                               DAG.getIntPtrConstant(0, DL));
	    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	                               DAG.getIntPtrConstant(2, DL));
	    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
	    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
	    static const int ShufMask[] = {0, 2, 4, 6};
	    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
	  }

	  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
	    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
	    if (Subtarget.hasInt256()) {
	      In = DAG.getBitcast(MVT::v32i8, In);

	      // The PSHUFB mask:
	      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
	                                      -1, -1, -1, -1, -1, -1, -1, -1,
	                                      16, 17, 20, 21, 24, 25, 28, 29,
	                                      -1, -1, -1, -1, -1, -1, -1, -1 };
	      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
	      In = DAG.getBitcast(MVT::v4i64, In);

	      // Gather the low 64 bits of each 128-bit lane into the low 128 bits.
	      static const int ShufMask2[] = {0, 2, -1, -1};
	      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
	      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	                       DAG.getIntPtrConstant(0, DL));
	      return DAG.getBitcast(VT, In);
	    }

	    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
	                               DAG.getIntPtrConstant(0, DL));

	    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
	                               DAG.getIntPtrConstant(4, DL));

	    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
	    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);

	    // The PSHUFB mask:
	    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9,  12, 13,
	                                    -1, -1, -1, -1, -1, -1, -1, -1};

	    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
	    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);

	    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
	    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);

	    // The MOVLHPS Mask:
	    static const int ShufMask2[] = {0, 1, 4, 5};
	    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
	    return DAG.getBitcast(MVT::v8i16, res);
	  }

	  if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
	    // Use an AND to zero upper bits for PACKUS.
	    In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));

	    SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
	                               DAG.getIntPtrConstant(0, DL));
	    SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
	                               DAG.getIntPtrConstant(8, DL));
	    return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
	  }

	  llvm_unreachable("All 256->128 cases should have been handled above!");
	}

	SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
	bool IsStrict = Op->isStrictFPOpcode();
	bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT \|\|
	Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
	MVT VT = Op->getSimpleValueType(0);
	SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
	MVT SrcVT = Src.getSimpleValueType();
	SDLoc dl(Op);

	if (VT.isVector()) {
	if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
	MVT ResVT = MVT::v4i32;
	MVT TruncVT = MVT::v4i1;
	unsigned Opc;
	if (IsStrict)
	Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
	else
	Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;

	if (!IsSigned && !Subtarget.hasVLX()) {
	assert(Subtarget.useAVX512Regs() && "Unexpected features!");
	// Widen to 512-bits.
	ResVT = MVT::v8i32;
	TruncVT = MVT::v8i1;
	Opc = Op.getOpcode();
	// Need to concat with zero vector for strict fp to avoid spurious
	// exceptions.
	// TODO: Should we just do this for non-strict as well?
	SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
	: DAG.getUNDEF(MVT::v8f64);
	Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
	DAG.getIntPtrConstant(0, dl));
	}
	SDValue Res, Chain;
	if (IsStrict) {
	Res =
	DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
	Chain = Res.getValue(1);
	} else {
	Res = DAG.getNode(Opc, dl, ResVT, Src);
	}

	Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
	DAG.getIntPtrConstant(0, dl));
	if (IsStrict)
	return DAG.getMergeValues({Res, Chain}, dl);
	return Res;
	}

	// v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
	if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
	assert(!IsSigned && "Expected unsigned conversion!");
	assert(Subtarget.useAVX512Regs() && "Requires avx512f");
	return Op;
	}

	// Widen vXi32 fp_to_uint with avx512f to 512-bit source.
	if ((VT == MVT::v4i32 \|\| VT == MVT::v8i32) &&
	(SrcVT == MVT::v4f64 \|\| SrcVT == MVT::v4f32 \|\| SrcVT == MVT::v8f32)) {
	assert(!IsSigned && "Expected unsigned conversion!");
	assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
	"Unexpected features!");
	MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
	MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
	// Need to concat with zero vector for strict fp to avoid spurious
	// exceptions.
	// TODO: Should we just do this for non-strict as well?
	SDValue Tmp =
	IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
	Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
	DAG.getIntPtrConstant(0, dl));

	SDValue Res, Chain;
	if (IsStrict) {
	Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
	{Op->getOperand(0), Src});
	Chain = Res.getValue(1);
	} else {
	Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
	}

	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
	DAG.getIntPtrConstant(0, dl));

	if (IsStrict)
	return DAG.getMergeValues({Res, Chain}, dl);
	return Res;
	}

	// Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
	if ((VT == MVT::v2i64 \|\| VT == MVT::v4i64) &&
	(SrcVT == MVT::v2f64 \|\| SrcVT == MVT::v4f64 \|\| SrcVT == MVT::v4f32)) {
	assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
	!Subtarget.hasVLX() && "Unexpected features!");
	MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
	// Need to concat with zero vector for strict fp to avoid spurious
	// exceptions.
	// TODO: Should we just do this for non-strict as well?
	SDValue Tmp =
	IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
	Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
	DAG.getIntPtrConstant(0, dl));

	SDValue Res, Chain;
	if (IsStrict) {
	Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
	{Op->getOperand(0), Src});
	Chain = Res.getValue(1);
	} else {
	Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
	}

	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
	DAG.getIntPtrConstant(0, dl));

	if (IsStrict)
	return DAG.getMergeValues({Res, Chain}, dl);
	return Res;
	}

	if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
	if (!Subtarget.hasVLX()) {
	// Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
	// legalizer and then widened again by vector op legalization.
	if (!IsStrict)
	return SDValue();

	SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
	SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
	{Src, Zero, Zero, Zero});
	Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
	{Op->getOperand(0), Tmp});
	SDValue Chain = Tmp.getValue(1);
	Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
	DAG.getIntPtrConstant(0, dl));
	if (IsStrict)
	return DAG.getMergeValues({Tmp, Chain}, dl);
	return Tmp;
	}

	assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
	SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
	DAG.getUNDEF(MVT::v2f32));
	if (IsStrict) {
	unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
	: X86ISD::STRICT_CVTTP2UI;
	return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
	}
	unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
	return DAG.getNode(Opc, dl, VT, Tmp);
	}

	return SDValue();
	}

	assert(!VT.isVector());

	bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);

	if (!IsSigned && UseSSEReg) {
	// Conversions from f32/f64 with AVX512 should be legal.
	if (Subtarget.hasAVX512())
	return Op;

	// Use default expansion for i64.
	if (VT == MVT::i64)
	return SDValue();

	assert(VT == MVT::i32 && "Unexpected VT!");

	// Promote i32 to i64 and use a signed operation on 64-bit targets.
	// FIXME: This does not generate an invalid exception if the input does not
	// fit in i32. PR44019
	if (Subtarget.is64Bit()) {
	SDValue Res, Chain;
	if (IsStrict) {
	Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
	{ Op.getOperand(0), Src });
	Chain = Res.getValue(1);
	} else
	Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);

	Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	if (IsStrict)
	return DAG.getMergeValues({ Res, Chain }, dl);
	return Res;
	}

	// Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
	// use fisttp which will be handled later.
	if (!Subtarget.hasSSE3())
	return SDValue();
	}

	// Promote i16 to i32 if we can use a SSE operation or the type is f128.
	// FIXME: This does not generate an invalid exception if the input does not
	// fit in i16. PR44019
	if (VT == MVT::i16 && (UseSSEReg \|\| SrcVT == MVT::f128)) {
	assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
	SDValue Res, Chain;
	if (IsStrict) {
	Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
	{ Op.getOperand(0), Src });
	Chain = Res.getValue(1);
	} else
	Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);

	Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	if (IsStrict)
	return DAG.getMergeValues({ Res, Chain }, dl);
	return Res;
	}

	// If this is a FP_TO_SINT using SSEReg we're done.
	if (UseSSEReg && IsSigned)
	return Op;

	// fp128 needs to use a libcall.
	if (SrcVT == MVT::f128) {
	RTLIB::Libcall LC;
	if (IsSigned)
	LC = RTLIB::getFPTOSINT(SrcVT, VT);
	else
	LC = RTLIB::getFPTOUINT(SrcVT, VT);

	SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
	MakeLibCallOptions CallOptions;
	std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
	SDLoc(Op), Chain);

	if (IsStrict)
	return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);

	return Tmp.first;
	}

	// Fall back to X87.
	SDValue Chain;
	if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
	if (IsStrict)
	return DAG.getMergeValues({V, Chain}, dl);
	return V;
	}

	llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
	}

	/// Custom lowering entry point for LRINT/LLRINT.
	///
	/// Nodes whose source type is held in an SSE register are returned unchanged;
	/// every other source type is handed to LRINT_LLRINTHelper.
	SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  const MVT OperandVT = Op.getOperand(0).getSimpleValueType();

	  // An SSE-register source means the node is already Legal as written.
	  return isScalarFPTypeInSSEReg(OperandVT)
	             ? Op
	             : LRINT_LLRINTHelper(Op.getNode(), DAG);
	}

	/// Lower an LRINT/LLRINT node whose source is f32, f64 or f80 by going
	/// through a stack temporary.
	///
	/// When the source type lives in an SSE register, the value is first stored
	/// to the slot and reloaded as f80 with X86ISD::FLD. The value is then written
	/// to the slot with X86ISD::FIST using the destination type, and the integer
	/// result is loaded back from the slot.
	///
	/// \returns the load of the converted integer, or an empty SDValue when the
	/// source type is not one of f32/f64/f80.
	SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
	                                              SelectionDAG &DAG) const {
	  EVT DstVT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);
	  EVT SrcVT = Src.getValueType();

	  if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
	    // f16 must be promoted before using the lowering in this routine.
	    // fp128 does not use this lowering.
	    return SDValue();
	  }

	  SDLoc DL(N);
	  SDValue Chain = DAG.getEntryNode();

	  bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);

	  // If we're converting from SSE, the stack slot needs to hold both types.
	  // Otherwise it only needs to hold the DstVT.
	  EVT OtherVT = UseSSE ? SrcVT : DstVT;
	  SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
	  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
	  MachinePointerInfo MPI =
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);

	  if (UseSSE) {
	    assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
	    // Spill the SSE value and reload it as an f80 value via FLD.
	    Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
	    SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
	    SDValue Ops[] = { Chain, StackPtr };

	    Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
	                                  /*Align*/ None, MachineMemOperand::MOLoad);
	    Chain = Src.getValue(1);
	  }

	  // Store the value to the slot as an integer of type DstVT via FIST.
	  SDValue StoreOps[] = { Chain, Src, StackPtr };
	  Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
	                                  StoreOps, DstVT, MPI, /*Align*/ None,
	                                  MachineMemOperand::MOStore);

	  return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
	}

	/// Custom lowering for (STRICT_)FP_EXTEND.
	///
	/// Extensions producing f128 are turned into a libcall. The only other case
	/// handled is a v2f32 source, which is widened to v4f32 (upper half undef) and
	/// converted with X86ISD::VFPEXT, or X86ISD::STRICT_VFPEXT for strict nodes.
	SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
	  const bool Strict = Op->isStrictFPOpcode();
	  const MVT ResultVT = Op.getSimpleValueType();
	  // Strict nodes carry the chain as operand 0, so the value is operand 1.
	  SDValue Input = Op.getOperand(Strict ? 1 : 0);
	  const MVT InputVT = Input.getSimpleValueType();

	  if (ResultVT == MVT::f128)
	    return LowerF128Call(Op, DAG, RTLIB::getFPEXT(InputVT, ResultVT));

	  assert(InputVT == MVT::v2f32 &&
	         "Only customize MVT::v2f32 type legalization!");

	  SDLoc DL(Op);
	  SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Input,
	                                DAG.getUNDEF(InputVT));
	  if (!Strict)
	    return DAG.getNode(X86ISD::VFPEXT, DL, ResultVT, Widened);

	  return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {ResultVT, MVT::Other},
	                     {Op->getOperand(0), Widened});
	}

	/// Custom lowering for (STRICT_)FP_ROUND.
	///
	/// The node is returned unchanged unless the source is f128, in which case the
	/// rounding is performed by a libcall. Strict nodes return the call result
	/// merged with the call's output chain.
	SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
	  const bool Strict = Op->isStrictFPOpcode();
	  const MVT ResultVT = Op.getSimpleValueType();
	  SDValue Input = Op.getOperand(Strict ? 1 : 0);
	  const MVT InputVT = Input.getSimpleValueType();

	  // Only an f128 source needs any work here.
	  if (InputVT != MVT::f128)
	    return Op;

	  const RTLIB::Libcall RoundLC = RTLIB::getFPROUND(InputVT, ResultVT);

	  // The FP_ROUND node also carries an operand saying whether the rounding is
	  // known to be precise. The libcall has no such argument, so the call is
	  // built here rather than through LowerF128Call.
	  SDLoc DL(Op);
	  SDValue InChain = Strict ? Op.getOperand(0) : SDValue();
	  MakeLibCallOptions Options;
	  std::pair<SDValue, SDValue> Call =
	      makeLibCall(DAG, RoundLC, ResultVT, Input, Options, DL, InChain);

	  if (!Strict)
	    return Call.first;

	  return DAG.getMergeValues({ Call.first, Call.second }, DL);
	}

	/// Lower (STRICT_)FP16_TO_FP from an i16 source to an f32 result.
	///
	/// The i16 is inserted into element 0 of a zero v8i16, converted with
	/// X86ISD::CVTPH2PS (or its strict form) to v4f32, and element 0 of the
	/// result is extracted. Strict nodes also return the conversion's chain.
	static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
	  const bool Strict = Op->isStrictFPOpcode();
	  SDValue Half = Op.getOperand(Strict ? 1 : 0);
	  assert(Half.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
	         "Unexpected VT!");

	  SDLoc DL(Op);
	  SDValue Packed = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
	                               DAG.getConstant(0, DL, MVT::v8i16), Half,
	                               DAG.getIntPtrConstant(0, DL));

	  if (!Strict) {
	    SDValue Wide = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, Packed);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Wide,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  SDValue Wide =
	      DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
	                  {Op.getOperand(0), Packed});
	  SDValue OutChain = Wide.getValue(1);
	  SDValue Scalar = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Wide,
	                               DAG.getIntPtrConstant(0, DL));
	  return DAG.getMergeValues({Scalar, OutChain}, DL);
	}

	/// Lower (STRICT_)FP_TO_FP16 from an f32 source to an i16 result.
	///
	/// The scalar is placed in element 0 of a v4f32, converted to v8i16 with
	/// X86ISD::CVTPS2PH (or its strict form) using immediate operand 4, and
	/// element 0 of the result is extracted. Strict nodes also return the
	/// conversion's chain.
	static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
	  const bool Strict = Op->isStrictFPOpcode();
	  SDValue Single = Op.getOperand(Strict ? 1 : 0);
	  assert(Single.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
	         "Unexpected VT!");

	  SDLoc DL(Op);

	  if (!Strict) {
	    // FIXME: Should we use zeros for upper elements for non-strict?
	    SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Single);
	    SDValue Narrow = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Vec,
	                                 DAG.getTargetConstant(4, DL, MVT::i32));
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Narrow,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  // The strict form inserts into an all-zero vector instead of using
	  // SCALAR_TO_VECTOR.
	  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
	                            DAG.getConstantFP(0, DL, MVT::v4f32), Single,
	                            DAG.getIntPtrConstant(0, DL));
	  SDValue Narrow = DAG.getNode(
	      X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
	      {Op.getOperand(0), Vec, DAG.getTargetConstant(4, DL, MVT::i32)});
	  SDValue OutChain = Narrow.getValue(1);
	  SDValue Scalar = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Narrow,
	                               DAG.getIntPtrConstant(0, DL));
	  return DAG.getMergeValues({Scalar, OutChain}, DL);
	}

	/// Depending on uarch and/or optimizing for size, we might prefer to use a
	/// vector operation in place of the typical scalar operation.
	///
	/// Matches a scalar ADD/SUB/FADD/FSUB whose two operands are constant-index
	/// EXTRACT_VECTOR_ELTs of elements (2k, 2k+1) of the same vector, and rewrites
	/// it as an extract from a 128-bit horizontal add/sub of that vector with
	/// itself. For the add forms the two extracts may appear in either order.
	///
	/// \returns the replacement extract, or \p Op unchanged when the pattern does
	/// not match or the subtarget lacks the required instructions.
	static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  // If both operands have other uses, this is probably not profitable.
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  if (!LHS.hasOneUse() && !RHS.hasOneUse())
	    return Op;

	  // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
	  bool IsFP = Op.getSimpleValueType().isFloatingPoint();
	  if (IsFP && !Subtarget.hasSSE3())
	    return Op;
	  if (!IsFP && !Subtarget.hasSSSE3())
	    return Op;

	  // Extract from a common vector.
	  if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	      RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	      LHS.getOperand(0) != RHS.getOperand(0) ||
	      !isa<ConstantSDNode>(LHS.getOperand(1)) ||
	      !isa<ConstantSDNode>(RHS.getOperand(1)) ||
	      !shouldUseHorizontalOp(true, DAG, Subtarget))
	    return Op;

	  // Allow commuted 'hadd' ops.
	  // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
	  unsigned HOpcode;
	  switch (Op.getOpcode()) {
	  case ISD::ADD: HOpcode = X86ISD::HADD; break;
	  case ISD::SUB: HOpcode = X86ISD::HSUB; break;
	  case ISD::FADD: HOpcode = X86ISD::FHADD; break;
	  case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
	  default:
	    llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
	  }
	  unsigned LExtIndex = LHS.getConstantOperandVal(1);
	  unsigned RExtIndex = RHS.getConstantOperandVal(1);
	  if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
	      (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
	    std::swap(LExtIndex, RExtIndex);

	  // The (possibly swapped) indices must form an even/odd adjacent pair.
	  if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
	    return Op;

	  SDValue X = LHS.getOperand(0);
	  EVT VecVT = X.getValueType();
	  unsigned BitWidth = VecVT.getSizeInBits();
	  // Check the width before it is used as a divisor below, so an unexpected
	  // sub-128-bit vector trips the assertion instead of dividing by zero.
	  assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
	         "Not expecting illegal vector widths here");
	  unsigned NumLanes = BitWidth / 128;
	  unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;

	  // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
	  // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
	  SDLoc DL(Op);
	  if (BitWidth == 256 || BitWidth == 512) {
	    unsigned LaneIdx = LExtIndex / NumEltsPerLane;
	    X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
	    LExtIndex %= NumEltsPerLane;
	  }

	  // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
	  // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
	  // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
	  // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
	  SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
	                     DAG.getIntPtrConstant(LExtIndex / 2, DL));
	}

	/// Depending on uarch and/or optimizing for size, we might prefer to use a
	/// vector operation in place of the typical scalar operation.
	///
	/// Thin wrapper for scalar f32/f64 FADD/FSUB: checks the type and forwards to
	/// lowerAddSubToHorizontalOp with this lowering's subtarget.
	SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
	  assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
	         "Only expecting float/double");
	  return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
	}

	/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
	/// This mode isn't supported in hardware on X86. But as long as we aren't
	/// compiling with trapping math, we can emulate this with
	/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
	static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
	  SDValue N0 = Op.getOperand(0);
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  // N0 += copysign(nextafter(0.5, 0.0), N0)
	  // Build the largest value below 0.5 in the semantics of the result type.
	  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
	  bool Ignored;
	  APFloat Point5Pred = APFloat(0.5f);
	  Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
	  Point5Pred.next(/*nextDown*/true);

	  SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
	                              DAG.getConstantFP(Point5Pred, dl, VT), N0);
	  N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);

	  // Truncate the result to remove fraction.
	  return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
	}

	/// The only differences between FABS and FNEG are the mask and the logic op.
	/// FNEG also has a folding opportunity for FNEG(FABS(x)).
	///
	/// FABS is lowered to an FAND with a mask of all bits but the sign bit, FNEG
	/// to an FXOR with the sign-bit mask, and FNEG(FABS(x)) to an FOR of x with
	/// the sign-bit mask. Scalar f32/f64 are processed as 128-bit vectors and the
	/// result is extracted from element 0.
	static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
	  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
	         "Wrong opcode for lowering FABS or FNEG.");

	  bool IsFABS = (Op.getOpcode() == ISD::FABS);

	  // If this is a FABS and it has an FNEG user, bail out to fold the combination
	  // into an FNABS. We'll lower the FABS after that if it is still in use.
	  if (IsFABS)
	    for (SDNode *User : Op->uses())
	      if (User->getOpcode() == ISD::FNEG)
	        return Op;

	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  bool IsF128 = (VT == MVT::f128);
	  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
	          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
	          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
	         "Unexpected type in LowerFABSorFNEG");

	  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
	  // decide if we should generate a 16-byte constant mask when we only need 4 or
	  // 8 bytes for the scalar case.

	  // There are no scalar bitwise logical SSE/AVX instructions, so we
	  // generate a 16-byte vector constant and logic op even for the scalar case.
	  // Using a 16-byte mask allows folding the load of the mask with
	  // the logic op, so it can save (~4 bytes) on code size.
	  bool IsFakeVector = !VT.isVector() && !IsF128;
	  MVT LogicVT = VT;
	  if (IsFakeVector)
	    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

	  unsigned EltBits = VT.getScalarSizeInBits();
	  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
	  APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
	                           APInt::getSignMask(EltBits);
	  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
	  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

	  SDValue Op0 = Op.getOperand(0);
	  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
	  unsigned LogicOp = IsFABS  ? X86ISD::FAND :
	                     IsFNABS ? X86ISD::FOR  :
	                               X86ISD::FXOR;
	  // For FNABS, operate directly on the FABS input and skip the inner FABS.
	  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;

	  // Real vectors and f128 need no wrapping: apply the logic op directly.
	  if (!IsFakeVector)
	    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

	  // For the scalar case extend to a 128-bit vector, perform the logic op,
	  // and extract the scalar result back out.
	  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
	  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Lower FCOPYSIGN(Mag, Sign) to FP logic ops:
	/// (Mag & ~SignMask) | (Sign & SignMask).
	///
	/// The sign operand is first extended or rounded to the result type. Scalar
	/// f32/f64 are processed as 128-bit vectors and the result is extracted from
	/// element 0. A constant (or constant-splat) magnitude has its sign cleared
	/// at compile time instead of with an FAND.
	static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
	  SDValue Mag = Op.getOperand(0);
	  SDValue Sign = Op.getOperand(1);
	  SDLoc dl(Op);

	  // If the sign operand is smaller, extend it first.
	  MVT VT = Op.getSimpleValueType();
	  if (Sign.getSimpleValueType().bitsLT(VT))
	    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);

	  // And if it is bigger, shrink it first.
	  if (Sign.getSimpleValueType().bitsGT(VT))
	    Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));

	  // At this point the operands and the result should have the same
	  // type, and that won't be f80 since that is not custom lowered.
	  bool IsF128 = (VT == MVT::f128);
	  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
	          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
	          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
	         "Unexpected type in LowerFCOPYSIGN");

	  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);

	  // Perform all scalar logic operations as 16-byte vectors because there are no
	  // scalar FP logic instructions in SSE.
	  // TODO: This isn't necessary. If we used scalar types, we might avoid some
	  // unnecessary splats, but we might miss load folding opportunities. Should
	  // this decision be based on OptimizeForSize?
	  bool IsFakeVector = !VT.isVector() && !IsF128;
	  MVT LogicVT = VT;
	  if (IsFakeVector)
	    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

	  // The mask constants are automatically splatted for vector types.
	  unsigned EltSizeInBits = VT.getScalarSizeInBits();
	  SDValue SignMask = DAG.getConstantFP(
	      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
	  SDValue MagMask = DAG.getConstantFP(
	      APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);

	  // First, clear all bits but the sign bit from the second operand (sign).
	  if (IsFakeVector)
	    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
	  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

	  // Next, clear the sign bit from the first operand (magnitude).
	  // TODO: If we had general constant folding for FP logic ops, this check
	  // wouldn't be necessary.
	  SDValue MagBits;
	  if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
	    APFloat APF = Op0CN->getValueAPF();
	    APF.clearSign();
	    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
	  } else {
	    // If the magnitude operand wasn't a constant, we need to AND out the sign.
	    if (IsFakeVector)
	      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
	    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
	  }

	  // OR the magnitude value with the sign bit.
	  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
	  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
	                                          DAG.getIntPtrConstant(0, dl));
	}

	/// Lower ISD::FGETSIGN of an f32/f64 operand.
	///
	/// The scalar is moved into a 128-bit vector, X86ISD::MOVMSK produces an i32
	/// from it, and bit 0 of that value (after zero-extending or truncating to
	/// the result type) is returned.
	static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
	  SDValue N0 = Op.getOperand(0);
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  MVT OpVT = N0.getSimpleValueType();
	  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
	         "Unexpected type for FGETSIGN");

	  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
	  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
	  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
	  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
	  Res = DAG.getZExtOrTrunc(Res, dl, VT);
	  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
	  return Res;
	}

	/// Build an i8 X86ISD::SETCC node that tests \p Cond against \p EFLAGS.
	/// The condition code is passed as an i8 target constant operand.
	static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
	                        SelectionDAG &DAG) {
	  SDValue CondOperand = DAG.getTargetConstant(Cond, dl, MVT::i8);
	  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CondOperand, EFLAGS);
	}

	/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
	/// style scalarized (associative) reduction patterns. Partial reductions
	/// are supported when the pointer SrcMask is non-null.
	///
	/// \param Op      Root of the reduction tree; its opcode must be \p BinOp.
	/// \param BinOp   The binary opcode forming the tree.
	/// \param SrcOps  Receives each distinct source vector, in discovery order.
	/// \param SrcMask If non-null, receives for each entry of \p SrcOps the set
	///                of element indices that were extracted from it. If null,
	///                every element of every source must be used.
	/// \returns true if the tree consists only of \p BinOp nodes whose leaves are
	/// constant-index extracts, each element is used at most once, and all source
	/// vectors have the same type.
	/// TODO - move this to SelectionDAG?
	static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
	                                 SmallVectorImpl<SDValue> &SrcOps,
	                                 SmallVectorImpl<APInt> *SrcMask = nullptr) {
	  SmallVector<SDValue, 8> Opnds;
	  DenseMap<SDValue, APInt> SrcOpMap;
	  EVT VT = MVT::Other;

	  // Recognize a special case where a vector is casted into wide integer to
	  // test all 0s.
	  assert(Op.getOpcode() == unsigned(BinOp) &&
	         "Unexpected bit reduction opcode");
	  Opnds.push_back(Op.getOperand(0));
	  Opnds.push_back(Op.getOperand(1));

	  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
	    // Take the operand by value: the push_backs below may reallocate Opnds,
	    // which would invalidate an iterator or reference into it.
	    SDValue Cur = Opnds[Slot];
	    // BFS traverse all BinOp operands.
	    if (Cur.getOpcode() == unsigned(BinOp)) {
	      Opnds.push_back(Cur.getOperand(0));
	      Opnds.push_back(Cur.getOperand(1));
	      // Re-evaluate the number of nodes to be traversed.
	      e += 2; // 2 more nodes (LHS and RHS) are pushed.
	      continue;
	    }

	    // Quit if a non-EXTRACT_VECTOR_ELT
	    if (Cur.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return false;

	    // Quit if without a constant index.
	    auto *Idx = dyn_cast<ConstantSDNode>(Cur.getOperand(1));
	    if (!Idx)
	      return false;

	    SDValue Src = Cur.getOperand(0);
	    DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
	    if (M == SrcOpMap.end()) {
	      VT = Src.getValueType();
	      // Quit if not the same type.
	      if (SrcOpMap.begin() != SrcOpMap.end() &&
	          VT != SrcOpMap.begin()->first.getValueType())
	        return false;
	      // Start a fresh, empty used-element bitmap for this source vector.
	      unsigned NumElts = VT.getVectorNumElements();
	      APInt EltCount = APInt::getNullValue(NumElts);
	      M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
	      SrcOps.push_back(Src);
	    }

	    // Quit if element already used.
	    unsigned CIdx = Idx->getZExtValue();
	    if (M->second[CIdx])
	      return false;
	    M->second.setBit(CIdx);
	  }

	  if (SrcMask) {
	    // Collect the source partial masks.
	    for (SDValue &SrcOp : SrcOps)
	      SrcMask->push_back(SrcOpMap[SrcOp]);
	  } else {
	    // Quit if not all elements are used.
	    for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
	                                                  E = SrcOpMap.end();
	         I != E; ++I) {
	      if (!I->second.isAllOnesValue())
	        return false;
	    }
	  }

	  return true;
	}

	// Helper function for comparing all bits of a vector against zero.
	//
	// V is the vector to test, CC must be SETEQ or SETNE, and Mask (one element
	// wide) selects which bits of each element take part in the test. X86CC is
	// always set to COND_E for SETEQ and COND_NE for SETNE.
	//
	// Returns the flag-producing node (CMP or PTEST) to test with X86CC, or an
	// empty SDValue when no lowering is produced.
	static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
	                                  const APInt &Mask,
	                                  const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG, X86::CondCode &X86CC) {
	  EVT VT = V.getValueType();
	  assert(Mask.getBitWidth() == VT.getScalarSizeInBits() &&
	         "Element Mask vs Vector bitwidth mismatch");

	  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
	  X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);

	  // AND Src with the splatted element mask, unless the mask keeps every bit.
	  auto MaskBits = [&](SDValue Src) {
	    if (Mask.isAllOnesValue())
	      return Src;
	    EVT SrcVT = Src.getValueType();
	    SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
	    return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
	  };

	  // For sub-128-bit vector, cast to (legal) integer and compare with zero.
	  if (VT.getSizeInBits() < 128) {
	    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
	    if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
	      return SDValue();
	    return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
	                       DAG.getBitcast(IntVT, MaskBits(V)),
	                       DAG.getConstant(0, DL, IntVT));
	  }

	  // Quit if not splittable to 128/256-bit vector.
	  if (!isPowerOf2_32(VT.getSizeInBits()))
	    return SDValue();

	  // Split down to 128/256-bit vector.
	  unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
	  while (VT.getSizeInBits() > TestSize) {
	    // OR the two halves together; the result is zero iff both halves are.
	    auto Split = DAG.SplitVector(V, DL);
	    VT = Split.first.getValueType();
	    V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
	  }

	  bool UsePTEST = Subtarget.hasSSE41();
	  if (UsePTEST) {
	    MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
	    V = DAG.getBitcast(TestVT, MaskBits(V));
	    return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
	  }

	  // Without PTEST, a masked v2i64 or-reduction is not faster than
	  // scalarization.
	  if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
	    return SDValue();

	  // Compare every byte with zero and check that all 16 mask bits are set.
	  V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
	  V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
	                  getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
	  V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
	  return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
	                     DAG.getConstant(0xFFFF, DL, MVT::i32));
	}

	// Check whether an OR'd reduction tree is PTEST-able, or if we can fallback to
	// CMP(MOVMSK(PCMPEQB(X,0))).
	//
	// Op is the value being compared against zero with CC (SETEQ or SETNE). A
	// TRUNCATE or an AND with a constant on top of the reduction is looked
	// through and turned into a bit mask. On success the flag-producing node is
	// returned and X86CC is set to the condition to test as an i8 target
	// constant; otherwise an empty SDValue is returned.
	static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
	                                      const SDLoc &DL,
	                                      const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG, SDValue &X86CC) {
	  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");

	  if (!Subtarget.hasSSE2() || !Op->hasOneUse())
	    return SDValue();

	  // Check whether we're masking/truncating an OR-reduction result, in which
	  // case track the masked bits.
	  APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
	  switch (Op.getOpcode()) {
	  case ISD::TRUNCATE: {
	    // Only the low bits that survive the truncate are tested.
	    SDValue Src = Op.getOperand(0);
	    Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
	                                Op.getScalarValueSizeInBits());
	    Op = Src;
	    break;
	  }
	  case ISD::AND: {
	    if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	      Mask = Cst->getAPIntValue();
	      Op = Op.getOperand(0);
	    }
	    break;
	  }
	  }

	  // Scalarized form: OR tree over extracts of every element.
	  SmallVector<SDValue, 8> VecIns;
	  if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
	    EVT VT = VecIns[0].getValueType();
	    assert(llvm::all_of(VecIns,
	                        [VT](SDValue V) { return VT == V.getValueType(); }) &&
	           "Reduction source vector mismatch");

	    // Quit if less than 128-bits or not splittable to 128/256-bit vector.
	    if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
	      return SDValue();

	    // If more than one full vector is evaluated, OR them first before PTEST.
	    for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
	         Slot += 2, e += 1) {
	      // Each iteration will OR 2 nodes and append the result until there is
	      // only 1 node left, i.e. the final OR'd value of all vectors.
	      SDValue LHS = VecIns[Slot];
	      SDValue RHS = VecIns[Slot + 1];
	      VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
	    }

	    X86::CondCode CCode;
	    if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
	                                       DAG, CCode)) {
	      X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
	      return V;
	    }
	  }

	  // Vector form: an element extracted from an OR reduction of a vector.
	  if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
	    ISD::NodeType BinOp;
	    if (SDValue Match =
	            DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
	      X86::CondCode CCode;
	      if (SDValue V =
	              LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
	        X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
	        return V;
	      }
	    }
	  }

	  return SDValue();
	}

	/// Return true if \c Op has a use that doesn't just read flags.
	///
	/// A use counts as flags-only when the user is a BRCOND, a SETCC, or a SELECT
	/// using the value as operand 0. A single-use TRUNCATE is looked through to
	/// its own user.
	static bool hasNonFlagsUse(SDValue Op) {
	  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
	       ++UI) {
	    SDNode *User = *UI;
	    unsigned UOpNo = UI.getOperandNo();
	    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
	      // Look past the truncate to the node that consumes it.
	      UOpNo = User->use_begin().getOperandNo();
	      User = *User->use_begin();
	    }

	    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
	        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
	      return true;
	  }
	  return false;
	}

	// Decide whether an arithmetic node should become an x86-specific ALU node
	// that also produces flags: that is worthwhile if there is a chance of using
	// an RMW op or only the flags are used. Otherwise the node is left alone and
	// a 'cmp' or 'test' instruction is emitted.
	//
	// Returns true only when every user of Op is a CopyToReg, SETCC or STORE.
	static bool isProfitableToUseFlagOp(SDValue Op) {
	  for (SDNode *Consumer : Op->uses()) {
	    switch (Consumer->getOpcode()) {
	    case ISD::CopyToReg:
	    case ISD::SETCC:
	    case ISD::STORE:
	      // Acceptable user; keep scanning.
	      continue;
	    default:
	      return false;
	    }
	  }

	  return true;
	}

	/// Emit nodes that will be selected as "test Op0,Op0", or something
	/// equivalent.
	///
	/// \param Op    The value to compare against zero.
	/// \param X86CC The X86 condition code that will consume the flags; it
	///              determines whether the carry or overflow flag is needed.
	/// \returns a flag-producing value: either a CMP of \p Op with 0, or result 1
	/// of an X86ISD arithmetic node. When \p Op is a plain ADD/SUB/AND/OR/XOR that
	/// is converted to its flag-producing X86ISD form, all uses of \p Op's result
	/// are replaced with the new node.
	static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
	                        SelectionDAG &DAG, const X86Subtarget &Subtarget) {
	  // CF and OF aren't always set the way we want. Determine which
	  // of these we need.
	  bool NeedCF = false;
	  bool NeedOF = false;
	  switch (X86CC) {
	  default: break;
	  case X86::COND_A: case X86::COND_AE:
	  case X86::COND_B: case X86::COND_BE:
	    NeedCF = true;
	    break;
	  case X86::COND_G: case X86::COND_GE:
	  case X86::COND_L: case X86::COND_LE:
	  case X86::COND_O: case X86::COND_NO: {
	    // Check if we really need to set the
	    // Overflow flag. If NoSignedWrap is present
	    // that is not actually needed.
	    switch (Op->getOpcode()) {
	    case ISD::ADD:
	    case ISD::SUB:
	    case ISD::MUL:
	    case ISD::SHL:
	      if (Op.getNode()->getFlags().hasNoSignedWrap())
	        break;
	      LLVM_FALLTHROUGH;
	    default:
	      NeedOF = true;
	      break;
	    }
	    break;
	  }
	  }
	  // See if we can use the EFLAGS value from the operand instead of
	  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
	  // we prove that the arithmetic won't overflow, we can't use OF or CF.
	  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
	    // Emit a CMP with 0, which is the TEST pattern.
	    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	                       DAG.getConstant(0, dl, Op.getValueType()));
	  }
	  // Opcode stays 0 unless a flag-producing replacement is selected below.
	  unsigned Opcode = 0;
	  unsigned NumOperands = 0;

	  SDValue ArithOp = Op;

	  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
	  // which may be the result of a CAST. We use the variable 'Op', which is the
	  // non-casted variable when we check for possible users.
	  switch (ArithOp.getOpcode()) {
	  case ISD::AND:
	    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
	    // because a TEST instruction will be better.
	    if (!hasNonFlagsUse(Op))
	      break;

	    LLVM_FALLTHROUGH;
	  case ISD::ADD:
	  case ISD::SUB:
	  case ISD::OR:
	  case ISD::XOR:
	    if (!isProfitableToUseFlagOp(Op))
	      break;

	    // Otherwise use a regular EFLAGS-setting instruction.
	    switch (ArithOp.getOpcode()) {
	    default: llvm_unreachable("unexpected operator!");
	    case ISD::ADD: Opcode = X86ISD::ADD; break;
	    case ISD::SUB: Opcode = X86ISD::SUB; break;
	    case ISD::XOR: Opcode = X86ISD::XOR; break;
	    case ISD::AND: Opcode = X86ISD::AND; break;
	    case ISD::OR:  Opcode = X86ISD::OR;  break;
	    }

	    NumOperands = 2;
	    break;
	  case X86ISD::ADD:
	  case X86ISD::SUB:
	  case X86ISD::OR:
	  case X86ISD::XOR:
	  case X86ISD::AND:
	    // Already a flag-producing node: use its second result directly.
	    return SDValue(Op.getNode(), 1);
	  case ISD::SSUBO:
	  case ISD::USUBO: {
	    // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.
	    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	    return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
	                       Op->getOperand(1)).getValue(1);
	  }
	  default:
	    break;
	  }

	  if (Opcode == 0) {
	    // Emit a CMP with 0, which is the TEST pattern.
	    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	                       DAG.getConstant(0, dl, Op.getValueType()));
	  }
	  // Rebuild the operation as a node with an extra i32 flags result, redirect
	  // users of the old value to it, and hand back the flags result.
	  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

	  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
	  DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
	  return SDValue(New.getNode(), 1);
	}

	/// Emit the node that will be selected as "cmp Op0, Op1", or an equivalent
	/// node that produces the same flags for the condition \p X86CC.
	///
	/// \param Op0, Op1   Scalar integer operands (i8, i16, i32 or i64).
	/// \param X86CC      X86 condition code the flags will be tested with; it
	///                   decides which rewrites of the compare are allowed.
	/// \returns result #1 (the i32 value) of the emitted X86ISD::SUB/ADD node,
	///          or whatever EmitTest returns when \p Op1 is the constant zero.
	static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
	                       const SDLoc &dl, SelectionDAG &DAG,
	                       const X86Subtarget &Subtarget) {
	  // Comparing against zero is handled by the TEST lowering.
	  if (isNullConstant(Op1))
	    return EmitTest(Op0, X86CC, dl, DAG, Subtarget);

	  EVT CmpVT = Op0.getValueType();

	  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
	          CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");

	  const bool IsEqualityCC = X86CC == X86::COND_E || X86CC == X86::COND_NE;

	  // 16-bit immediates are to be avoided, so a 16-bit compare with an
	  // immediate is widened to i32 (but not on Atom or when minimizing size).
	  if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
	      !DAG.getMachineFunction().getFunction().hasMinSize()) {
	    auto *LHSC = dyn_cast<ConstantSDNode>(Op0);
	    auto *RHSC = dyn_cast<ConstantSDNode>(Op1);
	    // An immediate that fits in 8 bits does not need the promotion.
	    const bool LHSIsWideImm = LHSC && !LHSC->getAPIntValue().isSignedIntN(8);
	    const bool RHSIsWideImm = RHSC && !RHSC->getAPIntValue().isSignedIntN(8);
	    if (LHSIsWideImm || RHSIsWideImm) {
	      unsigned ExtendOp =
	          isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

	      if (IsEqualityCC) {
	        // For equality comparisons prefer SIGN_EXTEND when the operand was
	        // truncated from something with enough sign bits. Only the first
	        // truncated operand (Op0 before Op1) is inspected.
	        SDValue Trunc;
	        if (Op0.getOpcode() == ISD::TRUNCATE)
	          Trunc = Op0;
	        else if (Op1.getOpcode() == ISD::TRUNCATE)
	          Trunc = Op1;

	        if (Trunc.getNode()) {
	          SDValue Wide = Trunc.getOperand(0);
	          unsigned EffBits = Wide.getScalarValueSizeInBits() -
	                             DAG.ComputeNumSignBits(Wide) + 1;
	          if (EffBits <= 16)
	            ExtendOp = ISD::SIGN_EXTEND;
	        }
	      }

	      CmpVT = MVT::i32;
	      Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
	      Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
	    }
	  }

	  // Try to shrink i64 compares if the input has enough zero bits.
	  // FIXME: Do this for non-constant compares for constant on LHS?
	  // The one-use check on Op0 is a hacky way to not break CSE opportunities
	  // with sub.
	  const bool ShrinkToI32 =
	      CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) &&
	      !isX86CCSigned(X86CC) && Op0.hasOneUse() &&
	      cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
	      DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32));
	  if (ShrinkToI32) {
	    CmpVT = MVT::i32;
	    Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
	    Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
	  }

	  // Every node emitted below produces {CmpVT, i32}; result #1 is returned.
	  SDVTList ResultVTs = DAG.getVTList(CmpVT, MVT::i32);

	  if (IsEqualityCC) {
	    // Matches a single-use "0 - x".
	    auto IsSingleUseNeg = [](SDValue V) {
	      return V.getOpcode() == ISD::SUB && isNullConstant(V.getOperand(0)) &&
	             V.hasOneUse();
	    };

	    // 0-x == y --> x+y == 0
	    // 0-x != y --> x+y != 0
	    if (IsSingleUseNeg(Op0))
	      return DAG.getNode(X86ISD::ADD, dl, ResultVTs, Op0.getOperand(1), Op1)
	          .getValue(1);

	    // x == 0-y --> x+y == 0
	    // x != 0-y --> x+y != 0
	    if (IsSingleUseNeg(Op1))
	      return DAG.getNode(X86ISD::ADD, dl, ResultVTs, Op0, Op1.getOperand(1))
	          .getValue(1);
	  }

	  // Use SUB instead of CMP to enable CSE between SUB and CMP.
	  return DAG.getNode(X86ISD::SUB, dl, ResultVTs, Op0, Op1).getValue(1);
	}

	/// Decide whether replacing SQRT with an RSQRT-based sequence should be
	/// disabled for \p Op.
	///
	/// \returns false when an X86ISD::FRSQRT of \p Op already exists in the DAG;
	///          otherwise the subtarget's "fast FSQRT" answer for the vector or
	///          scalar case, depending on the type of \p Op.
	bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
	  const EVT OpVT = Op.getValueType();

	  // We never want to use both SQRT and RSQRT instructions for the same input.
	  const bool HasRsqrtOfOp =
	      DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(OpVT), Op) != nullptr;
	  if (HasRsqrtOfOp)
	    return false;

	  return OpVT.isVector() ? Subtarget.hasFastVectorFSQRT()
	                         : Subtarget.hasFastScalarFSQRT();
	}

	/// Produce a reciprocal-square-root estimate node for \p Op, if the type and
	/// subtarget allow it.
	///
	/// The minimum architected relative accuracy is 2^-12. We need one
	/// Newton-Raphson step to have a good float result (24 bits of precision).
	///
	/// \param Enabled          Not consulted by this implementation.
	/// \param RefinementSteps  In/out: set to 1 when passed in as
	///                         ReciprocalEstimate::Unspecified.
	/// \param UseOneConstNR    Out: set to false whenever an estimate is returned.
	/// \param Reciprocal       Selects the feature required for v4f32 (SSE1 when
	///                         true, SSE2 when false).
	/// \returns an X86ISD::FRSQRT node (X86ISD::RSQRT14 for v16f32), or an empty
	///          SDValue when no estimate is produced.
	SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
	                                           SelectionDAG &DAG, int Enabled,
	                                           int &RefinementSteps,
	                                           bool &UseOneConstNR,
	                                           bool Reciprocal) const {
	  const EVT VT = Op.getValueType();

	  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
	  // f64 is deliberately left out: a double-precision rsqrt estimate with
	  // refinement on x86 prior to FMA requires at least 16 instructions
	  // (convert to single, rsqrtss, convert back to double, refine -- 3 steps =
	  // at least 13 insts), so it is likely not profitable. If an 'rsqrtsd'
	  // variant was added to the ISA along with FMA, this could be a throughput
	  // win.
	  // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
	  // after legalize types.
	  const bool CanEstimate =
	      (VT == MVT::f32 && Subtarget.hasSSE1()) ||
	      (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
	      (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
	      (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
	      (VT == MVT::v16f32 && Subtarget.useAVX512Regs());
	  if (!CanEstimate)
	    return SDValue();

	  if (RefinementSteps == ReciprocalEstimate::Unspecified)
	    RefinementSteps = 1;
	  UseOneConstNR = false;

	  // There is no FSQRT for 512-bits, but there is RSQRT14.
	  unsigned EstimateOpc = X86ISD::FRSQRT;
	  if (VT == MVT::v16f32)
	    EstimateOpc = X86ISD::RSQRT14;
	  return DAG.getNode(EstimateOpc, SDLoc(Op), VT, Op);
	}

	/// Produce a reciprocal estimate node for \p Op, if the type and subtarget
	/// allow it.
	///
	/// The minimum architected relative accuracy is 2^-12. We need one
	/// Newton-Raphson step to have a good float result (24 bits of precision).
	///
	/// \param Enabled          When ReciprocalEstimate::Unspecified, scalar f32
	///                         estimates are not generated.
	/// \param RefinementSteps  In/out: set to 1 when passed in as
	///                         ReciprocalEstimate::Unspecified.
	/// \returns an X86ISD::FRCP node (X86ISD::RCP14 for v16f32), or an empty
	///          SDValue when no estimate is produced.
	SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
	                                            int Enabled,
	                                            int &RefinementSteps) const {
	  const EVT VT = Op.getValueType();

	  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
	  // f64 is deliberately left out: a double-precision reciprocal estimate
	  // with refinement on x86 prior to FMA requires 15 instructions (convert to
	  // single, rcpss, convert back to double, refine -- 3 steps = 12 insts), so
	  // it is likely not profitable. If an 'rcpsd' variant was added to the ISA
	  // along with FMA, this could be a throughput win.
	  const bool CanEstimate =
	      (VT == MVT::f32 && Subtarget.hasSSE1()) ||
	      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
	      (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
	      (VT == MVT::v16f32 && Subtarget.useAVX512Regs());
	  if (!CanEstimate)
	    return SDValue();

	  // Enable estimate codegen with 1 refinement step for vector division.
	  // Scalar division estimates are disabled because they break too much
	  // real-world code. These defaults are intended to match GCC behavior.
	  const bool IsScalarByDefault =
	      VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified;
	  if (IsScalarByDefault)
	    return SDValue();

	  if (RefinementSteps == ReciprocalEstimate::Unspecified)
	    RefinementSteps = 1;

	  // There is no FRCP for 512-bits, but there is RCP14.
	  unsigned EstimateOpc = X86ISD::FRCP;
	  if (VT == MVT::v16f32)
	    EstimateOpc = X86ISD::RCP14;
	  return DAG.getNode(EstimateOpc, SDLoc(Op), VT, Op);
	}

	/// Minimum number of divisions sharing one divisor for which the divisions
	/// are converted to multiplications by a reciprocal.
	///
	/// One division is still needed to compute the reciprocal, and each original
	/// division is then replaced by a multiply. With a threshold of two this
	/// pays off only if a division costs at least twice as much as a
	/// multiplication, so the value may need to be adjusted for a given CPU.
	unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
	  const unsigned MinDivisionsPerDivisor = 2;
	  return MinDivisionsPerDivisor;
	}

	/// Expand a signed division of N's first operand by \p Divisor, which must
	/// be a power of two or the negation of one, into
	/// select + add + arithmetic-shift (+ negate for a negative divisor).
	///
	/// \param N        The division node; operand 0 is the dividend.
	/// \param Divisor  The constant divisor.
	/// \param Created  Receives the intermediate nodes built by the expansion.
	/// \returns SDValue(N, 0) when integer division is cheap for this type and
	///          function (the SDIV is kept), an empty SDValue when the expansion
	///          is not performed, or the replacement value otherwise.
	SDValue
	X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
	                                 SelectionDAG &DAG,
	                                 SmallVectorImpl<SDNode *> &Created) const {
	  const EVT VT = N->getValueType(0);

	  AttributeList FnAttrs =
	      DAG.getMachineFunction().getFunction().getAttributes();
	  if (isIntDivCheap(VT, FnAttrs))
	    return SDValue(N,0); // Lower SDIV as SDIV

	  assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
	         "Unexpected divisor!");

	  // Only perform this transform if CMOV is supported otherwise the select
	  // below will become a branch.
	  if (!Subtarget.hasCMov())
	    return SDValue();

	  // fold (sdiv X, pow2)
	  // FIXME: Support i8.
	  const bool IsHandledVT = VT == MVT::i16 || VT == MVT::i32 ||
	                           (Subtarget.is64Bit() && VT == MVT::i64);
	  if (!IsHandledVT)
	    return SDValue();

	  const unsigned ShiftAmt = Divisor.countTrailingZeros();

	  // If the divisor is 2 or -2, the default expansion is better.
	  if (ShiftAmt == 1)
	    return SDValue();

	  SDLoc DL(N);
	  SDValue Dividend = N->getOperand(0);
	  SDValue Zero = DAG.getConstant(0, DL, VT);
	  SDValue Bias = DAG.getConstant(
	      APInt::getLowBitsSet(VT.getSizeInBits(), ShiftAmt), DL, VT);

	  // If the dividend is negative, (Pow2 - 1) must be added to it before
	  // shifting right.
	  SDValue IsNegative = DAG.getSetCC(DL, MVT::i8, Dividend, Zero, ISD::SETLT);
	  SDValue Biased = DAG.getNode(ISD::ADD, DL, VT, Dividend, Bias);
	  SDValue Adjusted =
	      DAG.getNode(ISD::SELECT, DL, VT, IsNegative, Biased, Dividend);

	  Created.push_back(IsNegative.getNode());
	  Created.push_back(Biased.getNode());
	  Created.push_back(Adjusted.getNode());

	  // Divide by pow2.
	  SDValue ShiftCount = DAG.getConstant(ShiftAmt, DL, MVT::i8);
	  SDValue Quotient = DAG.getNode(ISD::SRA, DL, VT, Adjusted, ShiftCount);

	  // A negative divisor needs the result negated; otherwise we are done.
	  if (!Divisor.isNonNegative()) {
	    Created.push_back(Quotient.getNode());
	    return DAG.getNode(ISD::SUB, DL, VT, Zero, Quotient);
	  }

	  return Quotient;
	}

	/// The result of \p And is compared against zero; turn it into an
	/// X86ISD::BT node when one of the supported bit-test patterns matches.
	///
	/// \param And    The ISD::AND node being tested.
	/// \param CC     SETEQ selects COND_AE; any other value selects COND_B.
	/// \param X86CC  Out: the condition code (i8 target constant) to use with
	///               the returned node. Only written on success.
	/// \returns the BT node, or an empty SDValue if no pattern matched.
	static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
	                            const SDLoc &dl, SelectionDAG &DAG,
	                            SDValue &X86CC) {
	  assert(And.getOpcode() == ISD::AND && "Expected AND node!");

	  // Look through a truncate on either operand.
	  SDValue LHS = And.getOperand(0);
	  SDValue RHS = And.getOperand(1);
	  if (LHS.getOpcode() == ISD::TRUNCATE)
	    LHS = LHS.getOperand(0);
	  if (RHS.getOpcode() == ISD::TRUNCATE)
	    RHS = RHS.getOperand(0);

	  // Put a shift-left, if any, on the left-hand side.
	  if (RHS.getOpcode() == ISD::SHL)
	    std::swap(LHS, RHS);

	  SDValue Src, BitNo;
	  if (LHS.getOpcode() == ISD::SHL) {
	    // Pattern: (and (shl 1, N), X).
	    if (isOneConstant(LHS.getOperand(0))) {
	      // If we looked past a truncate, check that it's only truncating away
	      // known zeros.
	      const unsigned ShlWidth = LHS.getValueSizeInBits();
	      const unsigned AndWidth = And.getValueSizeInBits();
	      if (ShlWidth > AndWidth) {
	        KnownBits Known = DAG.computeKnownBits(LHS);
	        if (Known.countMinLeadingZeros() < ShlWidth - AndWidth)
	          return SDValue();
	      }
	      Src = RHS;
	      BitNo = LHS.getOperand(1);
	    }
	  } else if (RHS.getOpcode() == ISD::Constant) {
	    const uint64_t MaskVal = cast<ConstantSDNode>(RHS)->getZExtValue();

	    if (MaskVal == 1 && LHS.getOpcode() == ISD::SRL) {
	      // Pattern: (and (srl X, N), 1).
	      Src = LHS.getOperand(0);
	      BitNo = LHS.getOperand(1);
	    } else {
	      // Use BT if the immediate can't be encoded in a TEST instruction or we
	      // are optimizing for size and the immediate won't fit in a byte.
	      const bool OptForSize = DAG.shouldOptForSize();
	      const bool PreferBT =
	          !isUInt<32>(MaskVal) || (OptForSize && !isUInt<8>(MaskVal));
	      if (PreferBT && isPowerOf2_64(MaskVal)) {
	        Src = LHS;
	        BitNo = DAG.getConstant(Log2_64_Ceil(MaskVal), dl,
	                                Src.getValueType());
	      }
	    }
	  }

	  // No patterns found, give up.
	  if (!Src.getNode())
	    return SDValue();

	  const EVT MatchedVT = Src.getValueType();
	  if (MatchedVT == MVT::i8 || MatchedVT == MVT::i16) {
	    // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
	    // instruction. Since the shift amount is in-range-or-undefined, we know
	    // that doing a bittest on the i32 value is ok. We extend to i32 because
	    // the encoding for the i16 version is larger than the i32 version.
	    // Also promote i16 to i32 for performance / code size reason.
	    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
	  } else if (MatchedVT == MVT::i64 &&
	             DAG.MaskedValueIsZero(
	                 BitNo, APInt(BitNo.getValueSizeInBits(), 32))) {
	    // See if we can use the 32-bit instruction instead of the 64-bit one for
	    // a shorter encoding. Since the former takes the modulo 32 of BitNo and
	    // the latter takes the modulo 64, this is only valid if the 5th bit of
	    // BitNo is known to be zero.
	    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
	  }

	  // If the operand types disagree, extend the shift amount to match. Since
	  // BT ignores high bits (like shifts) we can use anyextend.
	  const EVT SrcVT = Src.getValueType();
	  if (SrcVT != BitNo.getValueType())
	    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, SrcVT, BitNo);

	  const X86::CondCode BTCond =
	      CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
	  X86CC = DAG.getTargetConstant(BTCond, dl, MVT::i8);
	  return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
	}

	/// Translate an ISD::CondCode into the predicate value used by the SSE
	/// floating-point mask compares.
	///
	/// \param SetCCOpcode        The condition to translate.
	/// \param Op0, Op1           Compare operands; swapped in place when the
	///                           chosen predicate needs reversed operands.
	/// \param IsAlwaysSignaling  Out: true for the LT/LE/NLT/NLE predicates
	///                           (1, 2, 5, 6), false for all others.
	/// \returns the predicate value. SETUEQ and SETONE yield 8 and 12, which are
	///          outside the 0-7 table below.
	static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
	                                   SDValue &Op1, bool &IsAlwaysSignaling) {
	  unsigned Pred;
	  bool NeedsSwap = false;

	  // SSE Condition code mapping:
	  // 0 - EQ
	  // 1 - LT
	  // 2 - LE
	  // 3 - UNORD
	  // 4 - NEQ
	  // 5 - NLT
	  // 6 - NLE
	  // 7 - ORD
	  switch (SetCCOpcode) {
	  default:
	    llvm_unreachable("Unexpected SETCC condition");
	  case ISD::SETOEQ:
	  case ISD::SETEQ:
	    Pred = 0;
	    break;
	  // "Greater" conditions reuse LT/LE with the operands exchanged.
	  case ISD::SETOGT:
	  case ISD::SETGT:
	    NeedsSwap = true;
	    Pred = 1;
	    break;
	  case ISD::SETLT:
	  case ISD::SETOLT:
	    Pred = 1;
	    break;
	  case ISD::SETOGE:
	  case ISD::SETGE:
	    NeedsSwap = true;
	    Pred = 2;
	    break;
	  case ISD::SETLE:
	  case ISD::SETOLE:
	    Pred = 2;
	    break;
	  case ISD::SETUO:
	    Pred = 3;
	    break;
	  case ISD::SETUNE:
	  case ISD::SETNE:
	    Pred = 4;
	    break;
	  // ULE/ULT reuse NLT/NLE with the operands exchanged.
	  case ISD::SETULE:
	    NeedsSwap = true;
	    Pred = 5;
	    break;
	  case ISD::SETUGE:
	    Pred = 5;
	    break;
	  case ISD::SETULT:
	    NeedsSwap = true;
	    Pred = 6;
	    break;
	  case ISD::SETUGT:
	    Pred = 6;
	    break;
	  case ISD::SETO:
	    Pred = 7;
	    break;
	  case ISD::SETUEQ:
	    Pred = 8;
	    break;
	  case ISD::SETONE:
	    Pred = 12;
	    break;
	  }

	  if (NeedsSwap)
	    std::swap(Op0, Op1);

	  // Every relational condition maps to 1, 2, 5 or 6; the equality and
	  // ordered/unordered conditions map to the remaining values.
	  switch (Pred) {
	  case 1:
	  case 2:
	  case 5:
	  case 6:
	    IsAlwaysSignaling = true;
	    break;
	  default:
	    IsAlwaysSignaling = false;
	    break;
	  }

	  return Pred;
	}

	/// Split an integer vector SETCC into two SETCCs on the Lo/Hi parts of its
	/// operands and concatenate the two results back into the original type.
	///
	/// \param Op  An ISD::SETCC whose integer operand type equals its result
	///            type.
	/// \returns the ISD::CONCAT_VECTORS of the two narrower compares.
	static SDValue splitIntVSETCC(SDValue Op, SelectionDAG &DAG) {
	  assert(Op.getOpcode() == ISD::SETCC && "Unsupported operation");

	  const EVT VT = Op.getValueType();
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDValue CC = Op.getOperand(2);

	  assert(LHS.getValueType().isInteger() &&
	         VT == LHS.getValueType() && "Unsupported VTs!");

	  SDLoc dl(Op);

	  // Extract the Lo/Hi vectors of both operands.
	  SDValue LHSLo, LHSHi;
	  std::tie(LHSLo, LHSHi) = splitVector(LHS, DAG, dl);
	  SDValue RHSLo, RHSHi;
	  std::tie(RHSLo, RHSHi) = splitVector(RHS, DAG, dl);

	  // Issue the operation on the smaller types and concatenate the result back.
	  EVT LoVT, HiVT;
	  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
	  SDValue Lo = DAG.getNode(ISD::SETCC, dl, LoVT, LHSLo, RHSLo, CC);
	  SDValue Hi = DAG.getNode(ISD::SETCC, dl, HiVT, LHSHi, RHSHi, CC);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	}

	/// Re-emit an integer vector SETCC with a vXi1 result, canonicalizing a
	/// SETLT into the operand-swapped condition.
	///
	/// \param Op  The SETCC node; operands are (LHS, RHS, condition code).
	/// \returns the new SETCC of the same result type.
	static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  assert(VT.getVectorElementType() == MVT::i1 &&
	         "Cannot set masked compare for this operation");

	  SDLoc dl(Op);
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  ISD::CondCode Pred = cast<CondCodeSDNode>(Op.getOperand(2))->get();

	  // Prefer SETGT over SETLT.
	  if (Pred == ISD::SETLT) {
	    std::swap(LHS, RHS);
	    Pred = ISD::getSetCCSwappedOperands(Pred);
	  }

	  return DAG.getSetCC(dl, VT, LHS, RHS, Pred);
	}

	/// Build a copy of the build-vector constant \p V with every element
	/// incremented (\p IsInc true) or decremented (\p IsInc false) by one.
	///
	/// \returns the new build vector, or an empty SDValue if \p V is not a
	///          build vector, if any element is not a plain (non-opaque)
	///          constant of the vector's element type, or if any element would
	///          wrap around (all-ones when incrementing, zero when
	///          decrementing).
	static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
	  auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
	  if (!BV)
	    return SDValue();

	  const MVT VT = V.getSimpleValueType();
	  const MVT EltVT = VT.getVectorElementType();
	  const int Step = IsInc ? 1 : -1;
	  SDLoc DL(V);

	  SmallVector<SDValue, 8> AdjustedElts;
	  for (unsigned Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx) {
	    auto *C = dyn_cast<ConstantSDNode>(BV->getOperand(Idx));
	    const bool IsSimpleConstant =
	        C && !C->isOpaque() && C->getSimpleValueType(0) == EltVT;
	    if (!IsSimpleConstant)
	      return SDValue();

	    // Avoid overflow/underflow.
	    const APInt &Val = C->getAPIntValue();
	    const bool WouldWrap = IsInc ? Val.isMaxValue() : Val.isNullValue();
	    if (WouldWrap)
	      return SDValue();

	    AdjustedElts.push_back(DAG.getConstant(Val + Step, DL, EltVT));
	  }

	  return DAG.getBuildVector(VT, DL, AdjustedElts);
	}

	/// Lower an unsigned vector compare of i8/i16 elements through an unsigned
	/// saturating subtract followed by a compare against zero. E.g. for
	/// Op0 u<= Op1:
	///   t = psubus Op0, Op1
	///   pcmpeq t, <0..0>
	///
	/// \param Cond  Only SETULT, SETUGT, SETUGE and SETULE are handled.
	/// \returns the X86ISD::PCMPEQ node, or an empty SDValue when SSE2 is
	///          missing, the element type is not i8/i16, the condition is not
	///          handled, or a required constant adjustment is not possible.
	static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
	                                    ISD::CondCode Cond, const SDLoc &dl,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  const MVT EltVT = VT.getVectorElementType();
	  const bool IsByteOrWord = EltVT == MVT::i8 || EltVT == MVT::i16;
	  if (!IsByteOrWord)
	    return SDValue();

	  if (Cond == ISD::SETULT) {
	    // If the comparison is against a constant we can turn this into a
	    // setule. With psubus, setule does not require a swap. This is
	    // beneficial because the constant in the register is no longer
	    // destructed as the destination so it can be hoisted out of a loop.
	    // Only do this pre-AVX since vpcmp* is no longer destructive.
	    if (Subtarget.hasAVX())
	      return SDValue();
	    SDValue DecrementedC = incDecVectorConstant(Op1, DAG, /*IsInc=*/false);
	    if (!DecrementedC)
	      return SDValue();
	    Op1 = DecrementedC;
	  } else if (Cond == ISD::SETUGT) {
	    // If the comparison is against a constant, we can turn this into a
	    // setuge. This is beneficial because materializing a constant 0 for the
	    // PCMPEQ is probably cheaper than XOR+PCMPGT using 2 different vector
	    // constants:
	    // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
	    SDValue IncrementedC = incDecVectorConstant(Op1, DAG, /*IsInc=*/true);
	    if (!IncrementedC)
	      return SDValue();
	    Op1 = Op0;
	    Op0 = IncrementedC;
	  } else if (Cond == ISD::SETUGE) {
	    // Psubus is better than flip-sign because it requires no inversion.
	    std::swap(Op0, Op1);
	  } else if (Cond != ISD::SETULE) {
	    // SETULE needs no adjustment; anything else is not handled here.
	    return SDValue();
	  }

	  SDValue Diff = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
	  SDValue Zero = DAG.getConstant(0, dl, VT);
	  return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Diff, Zero);
	}

	/// Lower a vector SETCC, STRICT_FSETCC or STRICT_FSETCCS node.
	///
	/// Floating-point operands are lowered to X86ISD::CMPM / X86ISD::CMPP (or
	/// their STRICT_ variants) using the predicate from translateX86FSETCC.
	/// Integer operands try, in order: the vXi1 mask compare, XOP PCOM, an
	/// AND/power-of-2 rewrite, a shift-based bit test, splitting of wide
	/// vectors, UMIN/UMAX, USUBSAT, and finally PCMPEQ/PCMPGT (with v2i64
	/// emulation when the subtarget lacks the 64-bit compares).
	///
	/// \returns the lowered value. For strict nodes this is a merge of
	///          {result, chain}. An empty SDValue is returned for a quiet strict
	///          compare that would need an always-signaling predicate without
	///          AVX.
	static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	// Strict nodes carry the chain as operand 0, so the compare operands and
	// the condition code are shifted by one.
	bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC \|\|
	Op.getOpcode() == ISD::STRICT_FSETCCS;
	SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
	SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
	SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
	MVT VT = Op->getSimpleValueType(0);
	ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
	bool isFP = Op1.getSimpleValueType().isFloatingPoint();
	SDLoc dl(Op);

	if (isFP) {
	#ifndef NDEBUG
	MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
	assert(EltVT == MVT::f32 \|\| EltVT == MVT::f64);
	#endif

	bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
	SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();

	// If we have a strict compare with a vXi1 result and the input is 128/256
	// bits we can't use a masked compare unless we have VLX. If we use a wider
	// compare like we do for non-strict, we might trigger spurious exceptions
	// from the upper elements. Instead emit a AVX compare and convert to mask.
	unsigned Opc;
	if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
	(!IsStrict \|\| Subtarget.hasVLX() \|\|
	Op0.getSimpleValueType().is512BitVector())) {
	assert(VT.getVectorNumElements() <= 16);
	Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
	} else {
	Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
	// The SSE/AVX packed FP comparison nodes are defined with a
	// floating-point vector result that matches the operand type. This allows
	// them to work with an SSE1 target (integer vector types are not legal).
	VT = Op0.getSimpleValueType();
	}

	SDValue Cmp;
	bool IsAlwaysSignaling;
	// Note: this may swap Op0 and Op1.
	unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
	if (!Subtarget.hasAVX()) {
	// TODO: We could use following steps to handle a quiet compare with
	// signaling encodings.
	// 1. Get ordered masks from a quiet ISD::SETO
	// 2. Use the masks to mask potential unordered elements in operand A, B
	// 3. Get the compare results of masked A, B
	// 4. Calculating final result using the mask and result from 3
	// But currently, we just fall back to scalar operations.
	if (IsStrict && IsAlwaysSignaling && !IsSignaling)
	return SDValue();

	// Insert an extra signaling instruction to raise exception.
	if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
	SDValue SignalCmp = DAG.getNode(
	Opc, dl, {VT, MVT::Other},
	{Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
	// FIXME: It seems we need to update the flags of all new strict nodes.
	// Otherwise, mayRaiseFPException in MI will return false due to
	// NoFPExcept = false by default. However, I didn't find it in other
	// patches.
	SignalCmp->setFlags(Op->getFlags());
	Chain = SignalCmp.getValue(1);
	}

	// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
	// emit two comparisons and a logic op to tie them together.
	if (SSECC >= 8) {
	// LLVM predicate is SETUEQ or SETONE.
	unsigned CC0, CC1;
	unsigned CombineOpc;
	if (Cond == ISD::SETUEQ) {
	CC0 = 3; // UNORD
	CC1 = 0; // EQ
	CombineOpc = X86ISD::FOR;
	} else {
	assert(Cond == ISD::SETONE);
	CC0 = 7; // ORD
	CC1 = 4; // NEQ
	CombineOpc = X86ISD::FAND;
	}

	SDValue Cmp0, Cmp1;
	if (IsStrict) {
	Cmp0 = DAG.getNode(
	Opc, dl, {VT, MVT::Other},
	{Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
	Cmp1 = DAG.getNode(
	Opc, dl, {VT, MVT::Other},
	{Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
	// Both compares hang off the same incoming chain; join their output
	// chains.
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
	Cmp1.getValue(1));
	} else {
	Cmp0 = DAG.getNode(
	Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
	Cmp1 = DAG.getNode(
	Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
	}
	Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
	} else {
	if (IsStrict) {
	Cmp = DAG.getNode(
	Opc, dl, {VT, MVT::Other},
	{Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
	Chain = Cmp.getValue(1);
	} else
	Cmp = DAG.getNode(
	Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
	}
	} else {
	// Handle all other FP comparisons here.
	if (IsStrict) {
	// Make a flip on already signaling CCs before setting bit 4 of AVX CC.
	SSECC \|= (IsAlwaysSignaling ^ IsSignaling) << 4;
	Cmp = DAG.getNode(
	Opc, dl, {VT, MVT::Other},
	{Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
	Chain = Cmp.getValue(1);
	} else
	Cmp = DAG.getNode(
	Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
	}

	if (VT.getSizeInBits() > Op.getSimpleValueType().getSizeInBits()) {
	// We emitted a compare with an XMM/YMM result. Finish converting to a
	// mask register using a vptestm.
	EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
	Cmp = DAG.getBitcast(CastVT, Cmp);
	Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
	DAG.getConstant(0, dl, CastVT), ISD::SETNE);
	} else {
	// If this is SSE/AVX CMPP, bitcast the result back to integer to match
	// the result type of SETCC. The bitcast is expected to be optimized
	// away during combining/isel.
	Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
	}

	if (IsStrict)
	return DAG.getMergeValues({Cmp, Chain}, dl);

	return Cmp;
	}

	// Everything below handles integer operands only.
	assert(!IsStrict && "Strict SETCC only handles FP operands.");

	MVT VTOp0 = Op0.getSimpleValueType();
	// VTOp0 is only read inside asserts; the cast keeps it "used" when asserts
	// are compiled out.
	(void)VTOp0;
	assert(VTOp0 == Op1.getSimpleValueType() &&
	"Expected operands with same type!");
	assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
	"Invalid number of packed elements for source and destination!");

	// The non-AVX512 code below works under the assumption that source and
	// destination types are the same.
	assert((Subtarget.hasAVX512() \|\| (VT == VTOp0)) &&
	"Value types for source and destination must be the same!");

	// The result is boolean, but operands are int/float
	if (VT.getVectorElementType() == MVT::i1) {
	// In AVX-512 architecture setcc returns mask with i1 elements,
	// But there is no compare instruction for i8 and i16 elements in KNL.
	assert((VTOp0.getScalarSizeInBits() >= 32 \|\| Subtarget.hasBWI()) &&
	"Unexpected operand type");
	return LowerIntVSETCC_AVX512(Op, DAG);
	}

	// Lower using XOP integer comparisons.
	if (VT.is128BitVector() && Subtarget.hasXOP()) {
	// Translate compare code to XOP PCOM compare mode.
	unsigned CmpMode = 0;
	switch (Cond) {
	default: llvm_unreachable("Unexpected SETCC condition");
	case ISD::SETULT:
	case ISD::SETLT: CmpMode = 0x00; break;
	case ISD::SETULE:
	case ISD::SETLE: CmpMode = 0x01; break;
	case ISD::SETUGT:
	case ISD::SETGT: CmpMode = 0x02; break;
	case ISD::SETUGE:
	case ISD::SETGE: CmpMode = 0x03; break;
	case ISD::SETEQ: CmpMode = 0x04; break;
	case ISD::SETNE: CmpMode = 0x05; break;
	}

	// Are we comparing unsigned or signed integers?
	unsigned Opc =
	ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;

	return DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getTargetConstant(CmpMode, dl, MVT::i8));
	}

	// (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
	// Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
	if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
	SDValue BC0 = peekThroughBitcasts(Op0);
	if (BC0.getOpcode() == ISD::AND) {
	APInt UndefElts;
	SmallVector<APInt, 64> EltBits;
	if (getTargetConstantBitsFromNode(BC0.getOperand(1),
	VT.getScalarSizeInBits(), UndefElts,
	EltBits, false, false)) {
	if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
	Cond = ISD::SETEQ;
	Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
	}
	}
	}
	}

	// ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
	if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
	Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
	ConstantSDNode *C1 = isConstOrConstSplat(Op1);
	if (C1 && C1->getAPIntValue().isPowerOf2()) {
	unsigned BitWidth = VT.getScalarSizeInBits();
	// Shift the tested bit up into the sign-bit position.
	unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;

	SDValue Result = Op0.getOperand(0);
	Result = DAG.getNode(ISD::SHL, dl, VT, Result,
	DAG.getConstant(ShiftAmt, dl, VT));
	Result = DAG.getNode(ISD::SRA, dl, VT, Result,
	DAG.getConstant(BitWidth - 1, dl, VT));
	return Result;
	}
	}

	// Break 256-bit integer vector compare into smaller ones.
	if (VT.is256BitVector() && !Subtarget.hasInt256())
	return splitIntVSETCC(Op, DAG);

	if (VT == MVT::v32i16 \|\| VT == MVT::v64i8) {
	assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
	return splitIntVSETCC(Op, DAG);
	}

	// If this is a SETNE against the signed minimum value, change it to SETGT.
	// If this is a SETNE against the signed maximum value, change it to SETLT.
	// which will be swapped to SETGT.
	// Otherwise we use PCMPEQ+invert.
	APInt ConstValue;
	if (Cond == ISD::SETNE &&
	ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
	if (ConstValue.isMinSignedValue())
	Cond = ISD::SETGT;
	else if (ConstValue.isMaxSignedValue())
	Cond = ISD::SETLT;
	}

	// If both operands are known non-negative, then an unsigned compare is the
	// same as a signed compare and there's no need to flip signbits.
	// TODO: We could check for more general simplifications here since we're
	// computing known bits.
	bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
	!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));

	// Special case: Use min/max operations for unsigned compares.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (ISD::isUnsignedIntSetCC(Cond) &&
	(FlipSigns \|\| ISD::isTrueWhenEqual(Cond)) &&
	TLI.isOperationLegal(ISD::UMIN, VT)) {
	// If we have a constant operand, increment/decrement it and change the
	// condition to avoid an invert.
	if (Cond == ISD::SETUGT) {
	// X > C --> X >= (C+1) --> X == umax(X, C+1)
	if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /IsInc/true)) {
	Op1 = UGTOp1;
	Cond = ISD::SETUGE;
	}
	}
	if (Cond == ISD::SETULT) {
	// X < C --> X <= (C-1) --> X == umin(X, C-1)
	if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /IsInc/false)) {
	Op1 = ULTOp1;
	Cond = ISD::SETULE;
	}
	}
	bool Invert = false;
	unsigned Opc;
	switch (Cond) {
	default: llvm_unreachable("Unexpected condition code");
	case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
	case ISD::SETULE: Opc = ISD::UMIN; break;
	case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
	case ISD::SETUGE: Opc = ISD::UMAX; break;
	}

	// Op0 u<= Op1 iff Op0 == umin(Op0, Op1); Op0 u>= Op1 iff
	// Op0 == umax(Op0, Op1).
	SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
	Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

	// If the logical-not of the result is required, perform that now.
	if (Invert)
	Result = DAG.getNOT(dl, Result, VT);

	return Result;
	}

	// Try to use SUBUS and PCMPEQ.
	if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
	return V;

	// We are handling one of the integer comparisons here. Since SSE only has
	// GT and EQ comparisons for integer, swapping operands and multiple
	// operations may be required for some comparisons.
	unsigned Opc = (Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) ? X86ISD::PCMPEQ
	: X86ISD::PCMPGT;
	bool Swap = Cond == ISD::SETLT \|\| Cond == ISD::SETULT \|\|
	Cond == ISD::SETGE \|\| Cond == ISD::SETUGE;
	bool Invert = Cond == ISD::SETNE \|\|
	(Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));

	if (Swap)
	std::swap(Op0, Op1);

	// Check that the operation in question is available (most are plain SSE2,
	// but PCMPGTQ and PCMPEQQ have different requirements).
	if (VT == MVT::v2i64) {
	if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
	assert(Subtarget.hasSSE2() && "Don't know how to lower!");

	// Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
	// the odd elements over the even elements.
	if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
	Op0 = DAG.getConstant(0, dl, MVT::v4i32);
	Op1 = DAG.getBitcast(MVT::v4i32, Op1);

	SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
	static const int MaskHi[] = { 1, 1, 3, 3 };
	SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

	return DAG.getBitcast(VT, Result);
	}

	// Same trick for a compare against all-ones on the right-hand side.
	if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
	Op0 = DAG.getBitcast(MVT::v4i32, Op0);
	Op1 = DAG.getConstant(-1, dl, MVT::v4i32);

	SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
	static const int MaskHi[] = { 1, 1, 3, 3 };
	SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

	return DAG.getBitcast(VT, Result);
	}

	// Since SSE has no unsigned integer comparisons, we need to flip the sign
	// bits of the inputs before performing those operations. The lower
	// compare is always unsigned.
	SDValue SB;
	if (FlipSigns) {
	SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
	} else {
	SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
	}
	Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
	Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);

	// Cast everything to the right type.
	Op0 = DAG.getBitcast(MVT::v4i32, Op0);
	Op1 = DAG.getBitcast(MVT::v4i32, Op1);

	// Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
	SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
	SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

	// Create masks for only the low parts/high parts of the 64 bit integers.
	static const int MaskHi[] = { 1, 1, 3, 3 };
	static const int MaskLo[] = { 0, 0, 2, 2 };
	SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
	SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
	SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

	SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
	Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

	if (Invert)
	Result = DAG.getNOT(dl, Result, MVT::v4i32);

	return DAG.getBitcast(VT, Result);
	}

	if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
	// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
	// pcmpeqd + pshufd + pand.
	assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");

	// First cast everything to the right type.
	Op0 = DAG.getBitcast(MVT::v4i32, Op0);
	Op1 = DAG.getBitcast(MVT::v4i32, Op1);

	// Do the compare.
	SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

	// Make sure the lower and upper halves are both all-ones.
	static const int Mask[] = { 1, 0, 3, 2 };
	SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
	Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

	if (Invert)
	Result = DAG.getNOT(dl, Result, MVT::v4i32);

	return DAG.getBitcast(VT, Result);
	}
	}

	// Since SSE has no unsigned integer comparisons, we need to flip the sign
	// bits of the inputs before performing those operations.
	if (FlipSigns) {
	MVT EltVT = VT.getVectorElementType();
	SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
	VT);
	Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
	Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
	}

	SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

	// If the logical-not of the result is required, perform that now.
	if (Invert)
	Result = DAG.getNOT(dl, Result, VT);

	return Result;
	}

	/// Try to lower an equality compare of a bitcast vXi1 mask against 0 or
	/// all-ones to KORTEST or KTEST, to be followed by a SETCC.
	///
	/// \param Op0    Must be an ISD::BITCAST of a vXi1 value for anything to be
	///               emitted.
	/// \param Op1    Must be the constant 0 or all-ones.
	/// \param CC     Must be SETEQ or SETNE.
	/// \param X86CC  Out: the X86 condition code (i8 target constant) to test
	///               the returned flags with. Only written on success.
	/// \returns the X86ISD::KTEST / X86ISD::KORTEST node, or an empty SDValue
	///          if the pattern or the subtarget does not qualify.
	static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
	                              const SDLoc &dl, SelectionDAG &DAG,
	                              const X86Subtarget &Subtarget,
	                              SDValue &X86CC) {
	  // Only support equality comparisons.
	  const bool IsEq = CC == ISD::SETEQ;
	  if (!IsEq && CC != ISD::SETNE)
	    return SDValue();

	  // Must be a bitcast from vXi1.
	  if (Op0.getOpcode() != ISD::BITCAST)
	    return SDValue();

	  SDValue Mask = Op0.getOperand(0);
	  const MVT MaskVT = Mask.getSimpleValueType();
	  const bool IsWideMask = MaskVT == MVT::v32i1 || MaskVT == MVT::v64i1;

	  const bool HasTestInsn =
	      (Subtarget.hasAVX512() && MaskVT == MVT::v16i1) ||
	      (Subtarget.hasDQI() && MaskVT == MVT::v8i1) ||
	      (Subtarget.hasBWI() && IsWideMask);
	  if (!HasTestInsn)
	    return SDValue();

	  const bool ComparesWithZero = isNullConstant(Op1);
	  X86::CondCode X86Cond;
	  if (ComparesWithZero) {
	    X86Cond = IsEq ? X86::COND_E : X86::COND_NE;
	  } else if (isAllOnesConstant(Op1)) {
	    // C flag is set for all ones.
	    X86Cond = IsEq ? X86::COND_B : X86::COND_AE;
	  } else {
	    return SDValue();
	  }

	  // Both remaining paths succeed, so the condition code can be published.
	  X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);

	  // KTEST is only used for a compare against zero, and needs DQI for
	  // v8i1/v16i1 or BWI for v32i1/v64i1.
	  const bool KTestable =
	      ComparesWithZero &&
	      ((Subtarget.hasDQI() &&
	        (MaskVT == MVT::v8i1 || MaskVT == MVT::v16i1)) ||
	       (Subtarget.hasBWI() && IsWideMask));

	  // If the input is an AND, we can combine its operands into the KTEST.
	  if (KTestable && Mask.getOpcode() == ISD::AND && Mask.hasOneUse())
	    return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Mask.getOperand(0),
	                       Mask.getOperand(1));

	  // If the input is an OR, we can combine its operands into the KORTEST;
	  // otherwise the mask is tested against itself.
	  SDValue LHS = Mask;
	  SDValue RHS = Mask;
	  if (Mask.getOpcode() == ISD::OR && Mask.hasOneUse()) {
	    LHS = Mask.getOperand(0);
	    RHS = Mask.getOperand(1);
	  }
	  return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
	}

	/// Emit flags for the given setcc condition and operands. Also returns the
	/// corresponding X86 condition code constant in X86CC.
	///
	/// Several cheaper patterns are tried in order before falling back to a plain
	/// compare: a BT for single-bit tests, a vector all-zero test, a
	/// KORTEST/KTEST, reuse of an existing X86ISD::SETCC, and reuse of the carry
	/// flag of an add of -1.
	///
	/// \param Op0,Op1 the operands being compared.
	/// \param CC the generic condition code of the comparison.
	/// \param X86CC receives the X86 condition code as an i8 target constant.
	/// \returns the value that produces the flags to be tested with \p X86CC.
	SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
	                                             ISD::CondCode CC, const SDLoc &dl,
	                                             SelectionDAG &DAG,
	                                             SDValue &X86CC) const {
	  // Optimize to BT if possible.
	  // Lower (X & (1 << N)) == 0 to BT(X, N).
	  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
	  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
	  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
	      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
	    if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
	      return BT;
	  }

	  // Try to use PTEST/PMOVMSKB for a tree of ORs equality compared with 0.
	  // TODO: We could do AND tree with all 1s as well by using the C flag.
	  if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
	    if (SDValue CmpZ =
	            MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
	      return CmpZ;

	  // Try to lower using KORTEST or KTEST.
	  if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
	    return Test;

	  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
	  // these.
	  if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
	      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
	    // If the input is a setcc, then reuse the input setcc or use a new one with
	    // the inverted condition.
	    if (Op0.getOpcode() == X86ISD::SETCC) {
	      // The existing condition must be inverted for "!= 1" and "== 0".
	      bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);

	      X86CC = Op0.getOperand(0);
	      if (Invert) {
	        X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
	        CCode = X86::GetOppositeBranchCondition(CCode);
	        X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
	      }

	      // Reuse the flags operand of the existing setcc.
	      return Op0.getOperand(1);
	    }
	  }

	  // Try to use the carry flag from the add in place of a separate CMP for:
	  // (seteq (add X, -1), -1). Similar for setne.
	  if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
	      Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
	    if (isProfitableToUseFlagOp(Op0)) {
	      SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);

	      // Replace the generic add with a flag-producing X86ISD::ADD so that
	      // other users of the sum share the new node.
	      SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
	                                Op0.getOperand(1));
	      DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
	      X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
	      X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
	      return SDValue(New.getNode(), 1);
	    }
	  }

	  // General case: translate the condition code and emit a compare.
	  X86::CondCode CondCode =
	      TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
	  assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");

	  SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
	  X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
	  return EFLAGS;
	}

	/// Lower a SETCC, STRICT_FSETCC or STRICT_FSETCCS node.
	///
	/// Vector results are forwarded to LowerVSETCC. Scalar results must be i8 and
	/// are lowered to an X86ISD::SETCC fed by integer or floating-point compare
	/// flags. For the strict variants the incoming chain is operand 0 and the
	/// result is merged with the outgoing chain.
	///
	/// \returns the lowered value, or an empty SDValue when the floating-point
	///          condition cannot be translated to a single X86 condition code.
	SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

	  bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
	                  Op.getOpcode() == ISD::STRICT_FSETCCS;
	  MVT VT = Op->getSimpleValueType(0);

	  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);

	  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
	  // Strict nodes carry the chain first, shifting every other operand by one.
	  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
	  SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
	  SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
	  SDLoc dl(Op);
	  ISD::CondCode CC =
	      cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();

	  // Handle f128 first, since one possible outcome is a normal integer
	  // comparison which gets handled by emitFlagsForSetcc.
	  if (Op0.getValueType() == MVT::f128) {
	    softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
	                        Op.getOpcode() == ISD::STRICT_FSETCCS);

	    // If softenSetCCOperands returned a scalar, use it.
	    if (!Op1.getNode()) {
	      assert(Op0.getValueType() == Op.getValueType() &&
	             "Unexpected setcc expansion!");
	      if (IsStrict)
	        return DAG.getMergeValues({Op0, Chain}, dl);
	      return Op0;
	    }
	  }

	  if (Op0.getSimpleValueType().isInteger()) {
	    SDValue X86CC;
	    SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
	    SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
	    return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
	  }

	  // Handle floating point.
	  X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
	  if (CondCode == X86::COND_INVALID)
	    return SDValue();

	  SDValue EFLAGS;
	  if (IsStrict) {
	    // The strict compares are chained; thread the chain through the compare.
	    bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
	    EFLAGS =
	        DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
	                    dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
	    Chain = EFLAGS.getValue(1);
	  } else {
	    EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
	  }

	  SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
	  SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
	  return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
	}

	/// Lower a SETCCCARRY node (operands: LHS, RHS, incoming carry, condition
	/// code) to an X86ISD::SBB whose flag result feeds a setcc.
	SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Lhs = Op.getOperand(0);
	  SDValue Rhs = Op.getOperand(1);
	  SDValue CarryIn = Op.getOperand(2);
	  SDValue CondNode = Op.getOperand(3);
	  SDLoc DL(Op);

	  assert(Lhs.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
	  const X86::CondCode X86Cond =
	      TranslateIntegerX86CC(cast<CondCodeSDNode>(CondNode)->get());

	  // Rebuild the carry as a flag value: add all-ones to the incoming carry
	  // and take the flag result (value #1) of that X86ISD::ADD.
	  EVT CarryVT = CarryIn.getValueType();
	  SDValue CarryAdd =
	      DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), CarryIn,
	                  DAG.getAllOnesConstant(DL, CarryVT));
	  SDValue CarryFlag = CarryAdd.getValue(1);

	  // Subtract with borrow and test the flags it produces.
	  SDVTList SbbVTs = DAG.getVTList(Lhs.getValueType(), MVT::i32);
	  SDValue Sbb = DAG.getNode(X86ISD::SBB, DL, SbbVTs, Lhs, Rhs, CarryFlag);
	  return getSETCC(X86Cond, Sbb.getValue(1), DL, DAG);
	}

	/// Build the flag-producing X86 arithmetic node for an overflow-checking
	/// add/sub/mul.
	///
	/// \param Cond receives the X86 condition code that, tested on the returned
	///        flags, is true exactly when the arithmetic overflowed.
	/// \param Op result #0 of a [SU]ADDO / [SU]SUBO / [SU]MULO node.
	/// \returns a pair {arithmetic value, flags value}.
	static std::pair<SDValue, SDValue>
	getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
	  assert(Op.getResNo() == 0 && "Unexpected result number!");
	  SDValue Lhs = Op.getOperand(0);
	  SDValue Rhs = Op.getOperand(1);
	  SDLoc DL(Op);

	  // Pick the X86 opcode and the condition that signals overflow.
	  unsigned FlagOpc = 0;
	  switch (Op.getOpcode()) {
	  case ISD::SADDO:
	    FlagOpc = X86ISD::ADD;
	    Cond = X86::COND_O;
	    break;
	  case ISD::UADDO:
	    FlagOpc = X86ISD::ADD;
	    // Adding the constant 1 is tested with COND_E instead of the carry.
	    if (isOneConstant(Rhs))
	      Cond = X86::COND_E;
	    else
	      Cond = X86::COND_B;
	    break;
	  case ISD::SSUBO:
	    FlagOpc = X86ISD::SUB;
	    Cond = X86::COND_O;
	    break;
	  case ISD::USUBO:
	    FlagOpc = X86ISD::SUB;
	    Cond = X86::COND_B;
	    break;
	  case ISD::SMULO:
	    FlagOpc = X86ISD::SMUL;
	    Cond = X86::COND_O;
	    break;
	  case ISD::UMULO:
	    FlagOpc = X86ISD::UMUL;
	    Cond = X86::COND_O;
	    break;
	  default:
	    llvm_unreachable("Unknown ovf instruction!");
	  }

	  // Without an opcode there is nothing to build; hand back empty values.
	  if (!FlagOpc)
	    return std::make_pair(SDValue(), SDValue());

	  // The node yields the arithmetic result plus an i32 flags result.
	  SDVTList ResultVTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	  SDValue Arith = DAG.getNode(FlagOpc, DL, ResultVTs, Lhs, Rhs);
	  return std::make_pair(Arith, Arith.getValue(1));
	}

	/// Lower an "add/sub/mul with overflow" node into the plain arithmetic
	/// instruction plus a setcc on the overflow condition, merged into the two
	/// results of the original node.
	///
	/// The "brcond" lowering looks for this combination and may remove the setcc
	/// if the setcc has only one use.
	static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
	  SDLoc DL(Op);

	  X86::CondCode OverflowCC;
	  const std::pair<SDValue, SDValue> ArithAndFlags =
	      getX86XALUOOp(OverflowCC, Op, DAG);
	  SDValue Arith = ArithAndFlags.first;
	  SDValue Flags = ArithAndFlags.second;

	  SDValue OverflowBit = getSETCC(OverflowCC, Flags, DL, DAG);
	  assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
	  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Arith,
	                     OverflowBit);
	}

	/// Return true if \p Op is an X86 logical comparison, i.e. a value that
	/// carries flags.
	///
	/// That is either one of the dedicated compare nodes (CMP, COMI, UCOMI,
	/// FCMP), or result #1 of a flag-producing arithmetic/logic node.
	static bool isX86LogicalCmp(SDValue Op) {
	  switch (Op.getOpcode()) {
	  // Dedicated compares qualify regardless of the result number.
	  case X86ISD::CMP:
	  case X86ISD::COMI:
	  case X86ISD::UCOMI:
	  case X86ISD::FCMP:
	    return true;

	  // Arithmetic/logic nodes qualify only through their second result.
	  case X86ISD::ADD:
	  case X86ISD::SUB:
	  case X86ISD::ADC:
	  case X86ISD::SBB:
	  case X86ISD::SMUL:
	  case X86ISD::UMUL:
	  case X86ISD::OR:
	  case X86ISD::XOR:
	  case X86ISD::AND:
	    return Op.getResNo() == 1;

	  default:
	    return false;
	  }
	}

	/// Return true if \p V is an ISD::TRUNCATE whose source has all of the bits
	/// removed by the truncation known to be zero.
	static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
	  if (V.getOpcode() == ISD::TRUNCATE) {
	    SDValue Src = V.getOperand(0);
	    unsigned WideBits = Src.getValueSizeInBits();
	    unsigned NarrowBits = V.getValueSizeInBits();
	    // Mask covering exactly the bits that the truncate drops.
	    APInt DroppedBits = APInt::getHighBitsSet(WideBits, WideBits - NarrowBits);
	    return DAG.MaskedValueIsZero(Src, DroppedBits);
	  }
	  return false;
	}

	/// Custom-lower a select node.
	///
	/// Operand 0 of \p Op is the condition; operands 1 and 2 are the two values
	/// being selected between. Depending on the condition and the value type
	/// this produces one of:
	///  - an SSE/AVX compare plus mask (FAND/FANDN/FOR) or vector-select sequence
	///    for scalar FP values held in SSE registers,
	///  - an X86ISD::SELECTS masked move when AVX-512 is available,
	///  - an SBB or SETCC_CARRY based all-zeros/all-ones mask for selects
	///    involving -1 and 0,
	///  - an X86ISD::CMOV, widened through i32 for i8/i16 where appropriate.
	SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
	  // Stays true until some existing flag-producing node has been found for the
	  // condition; if still true at the end, an explicit test is emitted.
	  bool AddTest = true;
	  SDValue Cond = Op.getOperand(0);
	  SDValue Op1 = Op.getOperand(1);
	  SDValue Op2 = Op.getOperand(2);
	  SDLoc DL(Op);
	  MVT VT = Op1.getSimpleValueType();
	  SDValue CC;

	  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
	  // are available or VBLENDV if AVX is available.
	  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
	  if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
	      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
	    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
	    bool IsAlwaysSignaling;
	    unsigned SSECC =
	        translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
	                           CondOp0, CondOp1, IsAlwaysSignaling);

	    if (Subtarget.hasAVX512()) {
	      SDValue Cmp =
	          DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
	                      DAG.getTargetConstant(SSECC, DL, MVT::i8));
	      assert(!VT.isVector() && "Not a scalar type?");
	      return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
	    }

	    if (SSECC < 8 || Subtarget.hasAVX()) {
	      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
	                                DAG.getTargetConstant(SSECC, DL, MVT::i8));

	      // If we have AVX, we can use a variable vector select (VBLENDV) instead
	      // of 3 logic instructions for size savings and potentially speed.
	      // Unfortunately, there is no scalar form of VBLENDV.

	      // If either operand is a +0.0 constant, don't try this. We can expect to
	      // optimize away at least one of the logic instructions later in that
	      // case, so that sequence would be faster than a variable blend.

	      // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
	      // uses XMM0 as the selection register. That may need just as many
	      // instructions as the AND/ANDN/OR sequence due to register moves, so
	      // don't bother.
	      if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
	          !isNullFPConstant(Op2)) {
	        // Convert to vectors, do a VSELECT, and convert back to scalar.
	        // All of the conversions should be optimized away.
	        MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
	        SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
	        SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
	        SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);

	        MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
	        VCmp = DAG.getBitcast(VCmpVT, VCmp);

	        SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);

	        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
	                           VSel, DAG.getIntPtrConstant(0, DL));
	      }
	      // (~Cmp & Op2) | (Cmp & Op1)
	      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
	      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
	      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
	    }
	  }

	  // AVX512 fallback is to lower selects of scalar floats to masked moves.
	  if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
	    SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
	    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
	  }

	  if (Cond.getOpcode() == ISD::SETCC) {
	    if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
	      Cond = NewCond;
	      // If the condition was updated, it's possible that the operands of the
	      // select were also updated (for example, EmitTest has a RAUW). Refresh
	      // the local references to the select operands in case they got stale.
	      Op1 = Op.getOperand(1);
	      Op2 = Op.getOperand(2);
	    }
	  }

	  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
	  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
	  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
	  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
	  // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
	  // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
	  if (Cond.getOpcode() == X86ISD::SETCC &&
	      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
	      isNullConstant(Cond.getOperand(1).getOperand(1))) {
	    SDValue Cmp = Cond.getOperand(1);
	    SDValue CmpOp0 = Cmp.getOperand(0);
	    unsigned CondCode = Cond.getConstantOperandVal(0);

	    // Special handling for __builtin_ffs(X) - 1 pattern which looks like
	    // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
	    // handle to keep the CMP with 0. This should be removed by
	    // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
	    // cttz_zero_undef.
	    auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
	      return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
	              Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
	    };
	    if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
	        ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
	         (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
	      // Keep Cmp.
	    } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
	               (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
	      // Y is the select operand that is not the all-ones constant.
	      SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;

	      SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	      SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);

	      // Apply further optimizations for special cases
	      // (select (x != 0), -1, 0) -> neg & sbb
	      // (select (x == 0), 0, -1) -> neg & sbb
	      if (isNullConstant(Y) &&
	          (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
	        SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
	        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
	        Zero = DAG.getConstant(0, DL, Op.getValueType());
	        return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
	      }

	      // Compute x - 1; its flag result feeds the SBB below.
	      Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
	                        CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));

	      SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
	      SDValue Res =   // Res = 0 or -1.
	          DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));

	      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
	        Res = DAG.getNOT(DL, Res, Res.getValueType());

	      return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
	    } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
	               Cmp.getOperand(0).getOpcode() == ISD::AND &&
	               isOneConstant(Cmp.getOperand(0).getOperand(1))) {
	      SDValue Src1, Src2;
	      // true if Op2 is XOR or OR operator and one of its operands
	      // is equal to Op1
	      // ( a , a op b) || ( b , a op b)
	      auto isOrXorPattern = [&]() {
	        if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
	            (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
	          Src1 =
	              Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
	          Src2 = Op1;
	          return true;
	        }
	        return false;
	      };

	      if (isOrXorPattern()) {
	        SDValue Neg;
	        unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
	        // we need mask of all zeros or ones with same size of the other
	        // operands.
	        if (CmpSz > VT.getSizeInBits())
	          Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
	        else if (CmpSz < VT.getSizeInBits())
	          Neg = DAG.getNode(ISD::AND, DL, VT,
	              DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
	              DAG.getConstant(1, DL, VT));
	        else
	          Neg = CmpOp0;
	        SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
	                                   Neg); // -(and (x, 0x1))
	        SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
	        return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
	      }
	    }
	  }

	  // Look past (and (setcc_carry (cmp ...)), 1).
	  if (Cond.getOpcode() == ISD::AND &&
	      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
	      isOneConstant(Cond.getOperand(1)))
	    Cond = Cond.getOperand(0);

	  // If condition flag is set by a X86ISD::CMP, then use it as the condition
	  // setting operand in place of the X86ISD::SETCC.
	  unsigned CondOpcode = Cond.getOpcode();
	  if (CondOpcode == X86ISD::SETCC ||
	      CondOpcode == X86ISD::SETCC_CARRY) {
	    CC = Cond.getOperand(0);

	    SDValue Cmp = Cond.getOperand(1);
	    bool IllegalFPCMov = false;
	    if (VT.isFloatingPoint() && !VT.isVector() &&
	        !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov())  // FPStack?
	      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

	    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
	        Cmp.getOpcode() == X86ISD::BT) { // FIXME
	      Cond = Cmp;
	      AddTest = false;
	    }
	  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
	             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
	             CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
	    // The condition is the overflow bit of an arithmetic-with-overflow node:
	    // use the flags of the corresponding X86 arithmetic node directly.
	    SDValue Value;
	    X86::CondCode X86Cond;
	    std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);

	    CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
	    AddTest = false;
	  }

	  if (AddTest) {
	    // Look past the truncate if the high bits are known zero.
	    if (isTruncWithZeroHighBitsInput(Cond, DAG))
	      Cond = Cond.getOperand(0);

	    // We know the result of AND is compared against zero. Try to match
	    // it to BT.
	    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
	      SDValue BTCC;
	      if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
	        CC = BTCC;
	        Cond = BT;
	        AddTest = false;
	      }
	    }
	  }

	  // No existing flags could be reused: test the condition value against zero.
	  if (AddTest) {
	    CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
	    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
	  }

	  // a <  b ? -1 :  0 -> RES = ~setcc_carry
	  // a <  b ?  0 : -1 -> RES = setcc_carry
	  // a >= b ? -1 :  0 -> RES = setcc_carry
	  // a >= b ?  0 : -1 -> RES = ~setcc_carry
	  if (Cond.getOpcode() == X86ISD::SUB) {
	    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

	    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
	        (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
	        (isNullConstant(Op1) || isNullConstant(Op2))) {
	      SDValue Res =
	          DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
	                      DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
	      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
	        return DAG.getNOT(DL, Res, Res.getValueType());
	      return Res;
	    }
	  }

	  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
	  // widen the cmov and push the truncate through. This avoids introducing a new
	  // branch during isel and doesn't add any extensions.
	  if (Op.getValueType() == MVT::i8 &&
	      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
	    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
	    if (T1.getValueType() == T2.getValueType() &&
	        // Exclude CopyFromReg to avoid partial register stalls.
	        T1.getOpcode() != ISD::CopyFromReg &&
	        T2.getOpcode() != ISD::CopyFromReg) {
	      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
	                                 CC, Cond);
	      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
	    }
	  }

	  // Or finally, promote i8 cmovs if we have CMOV,
	  //                 or i16 cmovs if it won't prevent folding a load.
	  // FIXME: we should not limit promotion of i8 case to only when the CMOV is
	  //        legal, but EmitLoweredSelect() can not deal with these extensions
	  //        being inserted between two CMOV's. (in i16 case too TBN)
	  //        https://bugs.llvm.org/show_bug.cgi?id=40974
	  if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
	      (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
	       !MayFoldLoad(Op2))) {
	    Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
	    Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
	    SDValue Ops[] = { Op2, Op1, CC, Cond };
	    SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
	    return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
	  }

	  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
	  // condition is true.
	  SDValue Ops[] = { Op2, Op1, CC, Cond };
	  return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
	}

	/// Lower an extension whose source is a vector of i1 (a mask) to a vector of
	/// integer elements.
	///
	/// The lowering may go through an i32-element vector when the result has
	/// i8/i16 elements and BWI is unavailable, and through a 512-bit vector when
	/// VLX is unavailable; the result is then truncated and/or the low subvector
	/// is extracted to get back to the requested type.
	static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
	                                     const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  MVT VT = Op->getSimpleValueType(0);
	  SDValue In = Op->getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
	  MVT VTElt = VT.getVectorElementType();
	  SDLoc dl(Op);

	  unsigned NumElts = VT.getVectorNumElements();

	  // Extend VT if the scalar type is i8/i16 and BWI is not supported.
	  MVT ExtVT = VT;
	  if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
	    // If v16i32 is to be avoided, we'll need to split and concatenate.
	    if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
	      return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);

	    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
	  }

	  // Widen to 512-bits if VLX is not supported.
	  MVT WideVT = ExtVT;
	  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
	    NumElts *= 512 / ExtVT.getSizeInBits();
	    InVT = MVT::getVectorVT(MVT::i1, NumElts);
	    // Place the mask in the low elements of a wider, otherwise undef, mask.
	    In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
	                     In, DAG.getIntPtrConstant(0, dl));
	    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
	  }

	  SDValue V;
	  MVT WideEltVT = WideVT.getVectorElementType();
	  if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
	      (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
	    // Re-emit the same extension opcode on the (possibly widened) types.
	    V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
	  } else {
	    // Otherwise select between all-ones and zero using the mask.
	    SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
	    SDValue Zero = DAG.getConstant(0, dl, WideVT);
	    V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
	  }

	  // Truncate if we had to extend i16/i8 above.
	  if (VT != ExtVT) {
	    WideVT = MVT::getVectorVT(VTElt, NumElts);
	    V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
	  }

	  // Extract back to 128/256-bit if we widened.
	  if (WideVT != VT)
	    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
	                    DAG.getIntPtrConstant(0, dl));

	  return V;
	}

	/// Lower an any-extend node: sources with i1 elements go through the mask
	/// extension lowering, everything else through the AVX extend lowering.
	static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  const MVT SrcVT = Op->getOperand(0).getSimpleValueType();
	  const bool IsMaskSource = SrcVT.getVectorElementType() == MVT::i1;

	  if (!IsMaskSource) {
	    assert(Subtarget.hasAVX() && "Expected AVX support");
	    return LowerAVXExtend(Op, DAG, Subtarget);
	  }

	  return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
	}

	/// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
	///
	/// For sign extend this needs to handle all vector sizes and SSE4.1 and
	/// non-SSE4.1 targets. For zero extend this should only handle inputs of
	/// MVT::v64i8 when BWI is not supported, but AVX512 is.
	///
	/// \returns the lowered value, or an empty SDValue when the element types or
	///          the result width are not handled for this subtarget.
	static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  SDValue Src = Op->getOperand(0);
	  MVT DstVT = Op->getSimpleValueType(0);
	  MVT SrcVT = Src.getSimpleValueType();

	  MVT DstEltVT = DstVT.getVectorElementType();
	  MVT SrcEltVT = SrcVT.getVectorElementType();
	  assert(DstEltVT.getSizeInBits() > SrcEltVT.getSizeInBits());

	  // Only i8/i16/i32 sources extended to i16/i32/i64 are handled.
	  const bool DstEltOk =
	      DstEltVT == MVT::i64 || DstEltVT == MVT::i32 || DstEltVT == MVT::i16;
	  const bool SrcEltOk =
	      SrcEltVT == MVT::i32 || SrcEltVT == MVT::i16 || SrcEltVT == MVT::i8;
	  if (!DstEltOk || !SrcEltOk)
	    return SDValue();

	  // The result width must be backed by the matching subtarget feature.
	  const bool WidthOk = (DstVT.is128BitVector() && Subtarget.hasSSE2()) ||
	                       (DstVT.is256BitVector() && Subtarget.hasAVX()) ||
	                       (DstVT.is512BitVector() && Subtarget.hasAVX512());
	  if (!WidthOk)
	    return SDValue();

	  SDLoc dl(Op);
	  const unsigned Opcode = Op.getOpcode();
	  const unsigned DstNumElts = DstVT.getVectorNumElements();

	  // For 256-bit vectors, we only need the lower (128-bit) half of the input.
	  // For 512-bit vectors, we need 128-bits or 256-bits.
	  if (SrcVT.getSizeInBits() > 128) {
	    // Keep at least as many source elements as there are result elements,
	    // and never fewer than 128 bits.
	    int NeededBits = SrcEltVT.getSizeInBits() * DstNumElts;
	    Src = extractSubVector(Src, 0, DAG, dl, std::max(NeededBits, 128));
	    SrcVT = Src.getSimpleValueType();
	  }

	  // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
	  // results, so are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
	  // instructions still need to be handled here for 256/512-bit results.
	  if (Subtarget.hasInt256()) {
	    assert(DstVT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");

	    if (SrcVT.getVectorNumElements() != DstNumElts)
	      return DAG.getNode(Op.getOpcode(), dl, DstVT, Src);

	    // FIXME: Apparently we create inreg operations that could be regular
	    // extends.
	    unsigned PlainExtOpc = ISD::ZERO_EXTEND;
	    if (Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
	      PlainExtOpc = ISD::SIGN_EXTEND;
	    return DAG.getNode(PlainExtOpc, dl, DstVT, Src);
	  }

	  // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
	  if (Subtarget.hasAVX()) {
	    assert(DstVT.is256BitVector() && "256-bit vector expected");
	    MVT HalfDstVT = DstVT.getHalfNumVectorElementsVT();
	    const int HalfCount = HalfDstVT.getVectorNumElements();

	    // Shuffle mask that moves the source elements feeding the upper half of
	    // the result down to the bottom of the vector.
	    SmallVector<int, 16> UpperMask(SrcVT.getVectorNumElements(),
	                                   SM_SentinelUndef);
	    for (int Idx = 0; Idx < HalfCount; ++Idx)
	      UpperMask[Idx] = HalfCount + Idx;

	    SDValue LoExt = DAG.getNode(Opcode, dl, HalfDstVT, Src);
	    SDValue HiExt =
	        DAG.getVectorShuffle(SrcVT, dl, Src, DAG.getUNDEF(SrcVT), UpperMask);
	    HiExt = DAG.getNode(Opcode, dl, HalfDstVT, HiExt);
	    return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, LoExt, HiExt);
	  }

	  // We should only get here for sign extend.
	  assert(Opcode == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
	  assert(DstVT.is128BitVector() && SrcVT.is128BitVector() && "Unexpected VTs");

	  // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
	  SDValue Unpacked = Src;
	  SDValue Result = Unpacked;

	  // As SRAI is only available on i16/i32 types, we expand only up to i32
	  // and handle i64 separately.
	  if (SrcVT != MVT::v4i32) {
	    MVT StepVT = DstVT == MVT::v2i64 ? MVT::v4i32 : DstVT;

	    const unsigned StepEltBits = StepVT.getScalarSizeInBits();
	    const unsigned SrcEltBits = SrcEltVT.getSizeInBits();
	    const unsigned Scale = StepEltBits / SrcEltBits;

	    const unsigned SrcNumElts = SrcVT.getVectorNumElements();
	    const unsigned StepNumElts = StepVT.getVectorNumElements();

	    // Build a shuffle mask that takes each input element and places it in the
	    // MSBs of the new element size.
	    SmallVector<int, 16> Mask(SrcNumElts, SM_SentinelUndef);
	    for (unsigned Elt = 0, Slot = Scale - 1; Elt != StepNumElts;
	         ++Elt, Slot += Scale)
	      Mask[Slot] = Elt;

	    Unpacked = DAG.getVectorShuffle(SrcVT, dl, Src, Src, Mask);
	    Unpacked = DAG.getBitcast(StepVT, Unpacked);

	    // Arithmetic shift right brings the value down and replicates its sign.
	    const unsigned ShiftAmt = StepEltBits - SrcEltBits;
	    Result = DAG.getNode(X86ISD::VSRAI, dl, StepVT, Unpacked,
	                         DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
	  }

	  if (DstVT == MVT::v2i64) {
	    assert(Unpacked.getValueType() == MVT::v4i32 && "Unexpected input VT");
	    // Compute (0 > x) per element to obtain the upper 32 bits, then
	    // interleave them with the low words.
	    SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
	    SDValue SignWords =
	        DAG.getSetCC(dl, MVT::v4i32, Zero, Unpacked, ISD::SETGT);
	    Result = DAG.getVectorShuffle(MVT::v4i32, dl, Result, SignWords,
	                                  {0, 4, 1, 5});
	    Result = DAG.getBitcast(DstVT, Result);
	  }

	  return Result;
	}

	/// Lower a vector sign-extend node.
	///
	/// Sources with i1 elements are handled by LowerSIGN_EXTEND_Mask. v32i16
	/// results without BWI are split. With AVX2 (hasInt256) the node is returned
	/// unchanged. Otherwise the input is split into two halves that are extended
	/// with SIGN_EXTEND_VECTOR_INREG and concatenated.
	static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  MVT VT = Op->getSimpleValueType(0);
	  SDValue In = Op->getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  SDLoc dl(Op);

	  if (InVT.getVectorElementType() == MVT::i1)
	    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

	  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
	  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
	         "Expected same number of elements");
	  assert((VT.getVectorElementType() == MVT::i16 ||
	          VT.getVectorElementType() == MVT::i32 ||
	          VT.getVectorElementType() == MVT::i64) &&
	         "Unexpected element type");
	  assert((InVT.getVectorElementType() == MVT::i8 ||
	          InVT.getVectorElementType() == MVT::i16 ||
	          InVT.getVectorElementType() == MVT::i32) &&
	         "Unexpected element type");

	  if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
	    assert(InVT == MVT::v32i8 && "Unexpected VT!");
	    return splitVectorIntUnary(Op, DAG);
	  }

	  if (Subtarget.hasInt256())
	    return Op;

	  // Optimize vectors in AVX mode
	  // Sign extend  v8i16 to v8i32 and
	  //              v4i32 to v4i64
	  //
	  // Divide input vector into two parts
	  // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
	  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
	  // concat the vectors to original VT
	  MVT HalfVT = VT.getHalfNumVectorElementsVT();
	  SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);

	  // Move the upper half of the input into the low elements; the remaining
	  // mask entries stay -1.
	  unsigned NumElems = InVT.getVectorNumElements();
	  SmallVector<int,8> ShufMask(NumElems, -1);
	  for (unsigned i = 0; i != NumElems/2; ++i)
	    ShufMask[i] = i + NumElems/2;

	  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
	  OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
	}

	/// Change a vector store into a pair of half-size vector stores.
	///
	/// \param Store a store of a 256-bit or 512-bit vector value.
	/// \returns a TokenFactor joining the chains of the two half stores, or an
	///          empty SDValue if the store is not simple and therefore must not
	///          be split.
	static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
	  SDValue StoredVal = Store->getValue();
	  assert((StoredVal.getValueType().is256BitVector() ||
	          StoredVal.getValueType().is512BitVector()) &&
	         "Expecting 256/512-bit op");

	  // Splitting volatile memory ops is not allowed unless the operation was not
	  // legal to begin with. Assume the input store is legal (this transform is
	  // only used for targets with AVX). Note: It is possible that we have an
	  // illegal type like v2i128, and so we could allow splitting a volatile store
	  // in that case if that is important.
	  if (!Store->isSimple())
	    return SDValue();

	  SDLoc DL(Store);
	  SDValue Value0, Value1;
	  std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
	  // The second half is stored right after the bytes of the first half.
	  unsigned HalfOffset = Value0.getValueType().getStoreSize();
	  SDValue Ptr0 = Store->getBasePtr();
	  SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfOffset, DL);
	  // Both halves hang off the original chain and keep the original memory
	  // operand flags.
	  SDValue Ch0 =
	      DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
	                   Store->getOriginalAlign(),
	                   Store->getMemOperand()->getFlags());
	  SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
	                             Store->getPointerInfo().getWithOffset(HalfOffset),
	                             Store->getOriginalAlign(),
	                             Store->getMemOperand()->getFlags());
	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
	}

	/// Replace a 128-bit vector store by one scalar store per element.
	///
	/// The stored value is first bitcast to \p StoreVT; the element type and
	/// element count of \p StoreVT determine the scalar stores that are emitted.
	///
	/// \returns a TokenFactor joining the chains of all scalar stores, or an
	///          empty SDValue if the store is not simple.
	static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
	                                    SelectionDAG &DAG) {
	  SDValue Vec = Store->getValue();
	  assert(StoreVT.is128BitVector() &&
	         Vec.getValueType().is128BitVector() && "Expecting 128-bit op");
	  Vec = DAG.getBitcast(StoreVT, Vec);

	  // Splitting volatile memory ops is not allowed unless the operation was not
	  // legal to begin with. We are assuming the input op is legal (this transform
	  // is only used for targets with AVX).
	  if (!Store->isSimple())
	    return SDValue();

	  MVT EltVT = StoreVT.getScalarType();
	  unsigned EltCount = StoreVT.getVectorNumElements();
	  unsigned EltBytes = EltVT.getStoreSize();

	  SDLoc DL(Store);
	  SmallVector<SDValue, 4> Chains;
	  // Walk the elements, advancing the byte offset by one element per step.
	  for (unsigned Idx = 0, ByteOff = 0; Idx != EltCount;
	       ++Idx, ByteOff += EltBytes) {
	    SDValue Addr = DAG.getMemBasePlusOffset(Store->getBasePtr(), ByteOff, DL);
	    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec,
	                              DAG.getIntPtrConstant(Idx, DL));
	    // Every scalar store hangs off the original chain.
	    Chains.push_back(
	        DAG.getStore(Store->getChain(), DL, Elt, Addr,
	                     Store->getPointerInfo().getWithOffset(ByteOff),
	                     Store->getOriginalAlign(),
	                     Store->getMemOperand()->getFlags()));
	  }
	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
	}

	/// Custom-lower a store node. The cases are tried in this order:
	///  - vXi1 values (at most 8 elements): widened into a v16i1, bitcast to i16,
	///    truncated to i8 and stored as a scalar;
	///  - truncating stores: not handled, an empty SDValue is returned;
	///  - 256-bit vectors, and v32i16/v64i8 without BWI: split into two half
	///    stores when the value is a single-use concatenation, otherwise an empty
	///    SDValue is returned;
	///  - anything else must be a 64-bit vector that is widened: it is stored as
	///    one 64-bit element (plain store with SSE2, X86ISD::VEXTRACT_STORE with
	///    SSE1 only).
	static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
	                          SelectionDAG &DAG) {
	  StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
	  SDLoc dl(St);
	  SDValue StoredVal = St->getValue();

	  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
	  if (StoredVal.getValueType().isVector() &&
	      StoredVal.getValueType().getVectorElementType() == MVT::i1) {
	    assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
	           "Unexpected VT");
	    assert(!St->isTruncatingStore() && "Expected non-truncating store");
	    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
	           "Expected AVX512F without AVX512DQI");

	    // Place the mask in the low elements of a v16i1, reinterpret it as an
	    // i16 and keep only the low byte.
	    StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
	                            DAG.getUNDEF(MVT::v16i1), StoredVal,
	                            DAG.getIntPtrConstant(0, dl));
	    StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
	    StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);

	    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
	                        St->getPointerInfo(), St->getOriginalAlign(),
	                        St->getMemOperand()->getFlags());
	  }

	  if (St->isTruncatingStore())
	    return SDValue();

	  // If this is a 256-bit store of concatenated ops, we are better off splitting
	  // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
	  // and each half can execute independently. Some cores would split the op into
	  // halves anyway, so the concat (vinsertf128) is purely an extra op.
	  MVT StoreVT = StoredVal.getSimpleValueType();
	  if (StoreVT.is256BitVector() ||
	      ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
	       !Subtarget.hasBWI())) {
	    SmallVector<SDValue, 4> CatOps;
	    if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
	      return splitVectorStore(St, DAG);
	    return SDValue();
	  }

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
	         "Unexpected VT");
	  assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
	             TargetLowering::TypeWidenVector && "Unexpected type action!");

	  // Widen by concatenating the value with an undef vector of the same type.
	  EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
	  StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
	                          DAG.getUNDEF(StoreVT));

	  if (Subtarget.hasSSE2()) {
	    // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
	    // and store it.
	    MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
	    MVT CastVT = MVT::getVectorVT(StVT, 2);
	    StoredVal = DAG.getBitcast(CastVT, StoredVal);
	    StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
	                            DAG.getIntPtrConstant(0, dl));

	    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
	                        St->getPointerInfo(), St->getOriginalAlign(),
	                        St->getMemOperand()->getFlags());
	  }

	  // SSE1 only: emit a target memory node with an i64 memory type that reuses
	  // the original memory operand.
	  assert(Subtarget.hasSSE1() && "Expected SSE");
	  SDVTList Tys = DAG.getVTList(MVT::Other);
	  SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
	  return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
	                                 St->getMemOperand());
	}

	// Custom lowering hook for integer vector loads.
	//
	// The only load rewritten here is a non-extending vXi1 load with at most 8
	// elements on an AVX512F target without AVX512DQ: it is replaced by an i8
	// load whose value is any-extended to i16, bitcast to v16i1 and narrowed to
	// the requested type with EXTRACT_SUBVECTOR. Every other load yields an empty
	// SDValue.
	//
	// NOTE(review): the comment previously attached to this function described a
	// shuffle-based lowering of extending loads; no such code exists in the
	// function body.
	static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  MVT RegVT = Op.getSimpleValueType();
	  assert(RegVT.isVector() && "We only custom lower vector loads.");
	  assert(RegVT.isInteger() &&
	         "We only custom lower integer vector loads.");

	  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
	  SDLoc dl(Ld);

	  // Only i1 element vectors need work.
	  if (RegVT.getVectorElementType() != MVT::i1)
	    return SDValue();

	  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
	  assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
	  assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
	  assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
	         "Expected AVX512F without AVX512DQI");

	  SDValue ByteLoad =
	      DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
	                  Ld->getPointerInfo(), Ld->getOriginalAlign(),
	                  Ld->getMemOperand()->getFlags());

	  // The new load's chain result is returned as the second merged value.
	  assert(ByteLoad->getNumValues() == 2 && "Loads must carry a chain!");

	  SDValue Wide = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, ByteLoad);
	  SDValue MaskBits = DAG.getBitcast(MVT::v16i1, Wide);
	  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT, MaskBits,
	                               DAG.getIntPtrConstant(0, dl));
	  return DAG.getMergeValues({Result, ByteLoad.getValue(1)}, dl);
	}

	/// Test whether \p Op is an ISD::AND or ISD::OR whose two operands are both
	/// X86ISD::SETCC nodes with exactly one use each.
	///
	/// \p Opc is always overwritten with the opcode of \p Op, even when the
	/// function returns false.
	static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
	  Opc = Op.getOpcode();
	  const bool IsAndOr = Opc == ISD::AND || Opc == ISD::OR;
	  if (!IsAndOr)
	    return false;

	  auto IsSingleUseSetCC = [](SDValue V) {
	    return V.getOpcode() == X86ISD::SETCC && V.hasOneUse();
	  };
	  return IsSingleUseSetCC(Op.getOperand(0)) &&
	         IsSingleUseSetCC(Op.getOperand(1));
	}

	/// Lower a conditional branch to one or two X86ISD::BRCOND nodes.
	///
	/// The operands of \p Op are read as (chain, condition, destination). A
	/// SETCC condition (other than an f128 compare) is turned directly into a
	/// flag-producing node plus an X86 condition code; SETOEQ and SETUNE on
	/// floating point need two branches. A condition that is the overflow result
	/// of an overflow intrinsic branches on that operation's flags. Any other
	/// condition is masked with 1 and compared against zero.
	SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Chain = Op.getOperand(0);
	  SDValue Cond = Op.getOperand(1);
	  SDValue Dest = Op.getOperand(2);
	  SDLoc dl(Op);

	  if (Cond.getOpcode() == ISD::SETCC &&
	      Cond.getOperand(0).getValueType() != MVT::f128) {
	    SDValue LHS = Cond.getOperand(0);
	    SDValue RHS = Cond.getOperand(1);
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	    // Special case for
	    // setcc([su]{add,sub,mul}o == 0)
	    // setcc([su]{add,sub,mul}o != 1)
	    if (ISD::isOverflowIntrOpRes(LHS) &&
	        (CC == ISD::SETEQ || CC == ISD::SETNE) &&
	        (isNullConstant(RHS) || isOneConstant(RHS))) {
	      SDValue Value, Overflow;
	      X86::CondCode X86Cond;
	      std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);

	      // "== 0" and "!= 1" both test for the absence of overflow, so the
	      // condition returned for the overflow case has to be inverted.
	      if ((CC == ISD::SETEQ) == isNullConstant(RHS))
	        X86Cond = X86::GetOppositeBranchCondition(X86Cond);

	      SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
	      return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	                         Overflow);
	    }

	    if (LHS.getSimpleValueType().isInteger()) {
	      SDValue CCVal;
	      SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
	      return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	                         EFLAGS);
	    }

	    if (CC == ISD::SETOEQ) {
	      // For FCMP_OEQ, we can emit
	      // two branches instead of an explicit AND instruction with a
	      // separate test. However, we only do this if this block doesn't
	      // have a fall-through edge, because this requires an explicit
	      // jmp when the condition is false.
	      if (Op.getNode()->hasOneUse()) {
	        SDNode *User = *Op.getNode()->use_begin();
	        // Look for an unconditional branch following this conditional branch.
	        // We need this because we need to reverse the successors in order
	        // to implement FCMP_OEQ.
	        if (User->getOpcode() == ISD::BR) {
	          // Swap the targets: the unconditional branch now goes to the
	          // original destination and both conditional branches go to the
	          // block the unconditional branch used to target.
	          SDValue FalseBB = User->getOperand(1);
	          SDNode *NewBR =
	              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
	          assert(NewBR == User);
	          (void)NewBR;
	          Dest = FalseBB;

	          SDValue Cmp =
	              DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
	          SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
	                              CCVal, Cmp);
	          CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
	          return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	                             Cmp);
	        }
	      }
	      // Otherwise fall through to the generic handling after this block.
	    } else if (CC == ISD::SETUNE) {
	      // For FCMP_UNE, we can emit
	      // two branches instead of an explicit OR instruction with a
	      // separate test.
	      SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
	      SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
	      Chain =
	          DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
	      CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
	      return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	                         Cmp);
	    } else {
	      X86::CondCode X86Cond =
	          TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
	      SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
	      SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
	      return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	                         Cmp);
	    }
	  }

	  if (ISD::isOverflowIntrOpRes(Cond)) {
	    SDValue Value, Overflow;
	    X86::CondCode X86Cond;
	    std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);

	    SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
	    return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	                       Overflow);
	  }

	  // Look past the truncate if the high bits are known zero.
	  if (isTruncWithZeroHighBitsInput(Cond, DAG))
	    Cond = Cond.getOperand(0);

	  EVT CondVT = Cond.getValueType();

	  // Add an AND with 1 if we don't already have one.
	  if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
	    Cond =
	        DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));

	  // Branch when (Cond & 1) != 0.
	  SDValue LHS = Cond;
	  SDValue RHS = DAG.getConstant(0, dl, CondVT);

	  SDValue CCVal;
	  SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
	  return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	                     EFLAGS);
	}

	// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
	// Calls to _alloca are needed to probe the stack when allocating more than 4k
	// bytes in one go. Touching the stack at 4K increments is necessary to ensure
	// that the guard pages used by the OS virtual memory manager are allocated in
	// correct sequence.
	//
	// Three strategies are implemented, all wrapped in a CALLSEQ_START /
	// CALLSEQ_END pair:
	//  - no special lowering required: subtract the size from the stack pointer
	//    (or emit X86ISD::PROBED_ALLOCA when inline stack probing is enabled),
	//    optionally over-align the result, and write it back to the stack pointer;
	//  - split (segmented) stacks: emit X86ISD::SEG_ALLOCA;
	//  - otherwise: emit X86ISD::WIN_ALLOCA and read the new stack pointer back.
	// The node returns the merged values {allocated address, output chain}.
	SDValue
	X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  bool SplitStack = MF.shouldSplitStack();
	  bool EmitStackProbeCall = hasStackProbeSymbol(MF);
	  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
	               SplitStack || EmitStackProbeCall;
	  SDLoc dl(Op);

	  // Get the inputs.
	  SDNode *Node = Op.getNode();
	  SDValue Chain = Op.getOperand(0);
	  SDValue Size = Op.getOperand(1);
	  MaybeAlign Alignment(Op.getConstantOperandVal(2));
	  EVT VT = Node->getValueType(0);

	  // Chain the dynamic stack allocation so that it doesn't modify the stack
	  // pointer when other instructions are using the stack.
	  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

	  bool Is64Bit = Subtarget.is64Bit();
	  MVT SPTy = getPointerTy(DAG.getDataLayout());

	  SDValue Result;
	  if (!Lower) {
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
	    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
	                    " not tell us which reg is the stack pointer!");

	    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	    const Align StackAlign = TFI.getStackAlign();
	    if (hasInlineStackProbe(MF)) {
	      MachineRegisterInfo &MRI = MF.getRegInfo();

	      // The size is passed to the probed alloca through a virtual register.
	      const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
	      Register Vreg = MRI.createVirtualRegister(AddrRegClass);
	      Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
	      Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
	                           DAG.getRegister(Vreg, SPTy));
	    } else {
	      SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
	      Chain = SP.getValue(1);
	      Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
	    }
	    // Only realign when the request exceeds the stack alignment.
	    if (Alignment && *Alignment > StackAlign)
	      Result =
	          DAG.getNode(ISD::AND, dl, VT, Result,
	                      DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
	    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
	  } else if (SplitStack) {
	    MachineRegisterInfo &MRI = MF.getRegInfo();

	    if (Is64Bit) {
	      // The 64 bit implementation of segmented stacks needs to clobber both r10
	      // and r11. This makes it impossible to use it along with nested
	      // parameters.
	      const Function &F = MF.getFunction();
	      for (const auto &A : F.args()) {
	        if (A.hasNestAttr())
	          report_fatal_error("Cannot use segmented stacks with functions that "
	                             "have nested arguments.");
	      }
	    }

	    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
	    Register Vreg = MRI.createVirtualRegister(AddrRegClass);
	    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
	    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
	                         DAG.getRegister(Vreg, SPTy));
	  } else {
	    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
	    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);

	    // Read the stack pointer after the WIN_ALLOCA node.
	    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	    Register SPReg = RegInfo->getStackRegister();
	    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
	    Chain = SP.getValue(1);

	    if (Alignment) {
	      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
	                       DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
	      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
	    }

	    Result = SP;
	  }

	  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
	                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

	  SDValue Ops[2] = {Result, Chain};
	  return DAG.getMergeValues(Ops, dl);
	}

	/// Lower a va_start node.
	///
	/// On 32-bit targets and for the Win64 calling convention this is a single
	/// store of the VarArgsFrameIndex address into the va_list location. In all
	/// other cases the four fields of the __va_list_tag structure are written
	/// with separate stores that all use the incoming chain and are merged by a
	/// TokenFactor.
	SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  auto PtrVT = getPointerTy(MF.getDataLayout());
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  SDLoc DL(Op);

	  if (!Subtarget.is64Bit() ||
	      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
	    // vastart just stores the address of the VarArgsFrameIndex slot into the
	    // memory location argument.
	    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
	    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
	                        MachinePointerInfo(SV));
	  }

	  // __va_list_tag:
	  // gp_offset (0 - 6 * 8)
	  // fp_offset (48 - 48 + 8 * 16)
	  // overflow_arg_area (point to parameters coming in memory).
	  // reg_save_area
	  SmallVector<SDValue, 8> MemOps;
	  SDValue FIN = Op.getOperand(1);
	  // Store gp_offset
	  SDValue Store = DAG.getStore(
	      Op.getOperand(0), DL,
	      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
	      MachinePointerInfo(SV));
	  MemOps.push_back(Store);

	  // Store fp_offset
	  FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
	  Store = DAG.getStore(
	      Op.getOperand(0), DL,
	      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
	      MachinePointerInfo(SV, 4));
	  MemOps.push_back(Store);

	  // Store ptr to overflow_arg_area
	  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
	  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
	  Store =
	      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
	  MemOps.push_back(Store);

	  // Store ptr to reg_save_area. The previous field is 8 bytes wide on LP64
	  // targets and 4 bytes wide otherwise.
	  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
	      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
	  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
	  Store = DAG.getStore(
	      Op.getOperand(0), DL, RSFIN, FIN,
	      MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
	  MemOps.push_back(Store);
	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
	}

	/// Lower a va_arg node on 64-bit targets.
	///
	/// The Win64 calling convention uses the generic expansion. Otherwise an
	/// X86ISD::VAARG_64 node is emitted that yields the address of the next
	/// argument (and a chain), and the argument is then loaded from that
	/// address. The node receives the argument size, the argument mode
	/// (1 = read via gp_offset, 2 = read via fp_offset) and the alignment.
	SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget.is64Bit() &&
	         "LowerVAARG only handles 64-bit va_arg!");
	  assert(Op.getNumOperands() == 4);

	  MachineFunction &MF = DAG.getMachineFunction();
	  if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
	    // The Win64 ABI uses char* instead of a structure.
	    return DAG.expandVAArg(Op.getNode());

	  SDValue Chain = Op.getOperand(0);
	  SDValue SrcPtr = Op.getOperand(1);
	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  unsigned Align = Op.getConstantOperandVal(3);
	  SDLoc dl(Op);

	  EVT ArgVT = Op.getNode()->getValueType(0);
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
	  uint8_t ArgMode;

	  // Decide which area this value should be read from.
	  // TODO: Implement the AMD64 ABI in its entirety. This simple
	  // selection mechanism works only for the basic types.
	  assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
	  if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
	    ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
	  } else {
	    assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
	           "Unhandled argument type in LowerVAARG");
	    ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
	  }

	  if (ArgMode == 2) {
	    // Sanity Check: Make sure using fp_offset makes sense.
	    assert(!Subtarget.useSoftFloat() &&
	           !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
	           Subtarget.hasSSE1());
	  }

	  // Insert VAARG_64 node into the DAG
	  // VAARG_64 returns two values: Variable Argument Address, Chain
	  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
	                       DAG.getConstant(ArgMode, dl, MVT::i8),
	                       DAG.getConstant(Align, dl, MVT::i32)};
	  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
	  // The node is flagged as both a load and a store of the va_list.
	  SDValue VAARG = DAG.getMemIntrinsicNode(
	      X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
	      /*Align=*/None, MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
	  Chain = VAARG.getValue(1);

	  // Load the next argument and return it
	  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
	}

	/// Lower a va_copy node on 64-bit targets.
	///
	/// The Win64 calling convention uses the generic expansion. Otherwise the
	/// 24-byte va_list structure is copied from the source to the destination
	/// with an 8-byte aligned, non-volatile memcpy.
	static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
	  // where a va_list is still an i8*.
	  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
	  if (Subtarget.isCallingConvWin64(
	          DAG.getMachineFunction().getFunction().getCallingConv()))
	    // Probably a Win64 va_copy.
	    return DAG.expandVACopy(Op.getNode());

	  SDValue Chain = Op.getOperand(0);
	  SDValue DstPtr = Op.getOperand(1);
	  SDValue SrcPtr = Op.getOperand(2);
	  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
	  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
	  SDLoc DL(Op);

	  // 24 bytes = two i32 fields followed by two 8-byte pointers.
	  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL),
	                       Align(8), /*isVolatile*/ false, false, false,
	                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
	}

	// Map a shift opcode (generic ISD shift, or either X86ISD uniform vector shift
	// form) to the X86ISD uniform vector shift opcode of the same direction:
	// the variable-amount form when IsVariable is set, the immediate form
	// otherwise. Any other opcode is unreachable.
	static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
	  unsigned VariableOpc;
	  unsigned ImmediateOpc;
	  switch (Opc) {
	  default:
	    llvm_unreachable("Unknown target vector shift node");
	  case ISD::SHL:
	  case X86ISD::VSHL:
	  case X86ISD::VSHLI:
	    VariableOpc = X86ISD::VSHL;
	    ImmediateOpc = X86ISD::VSHLI;
	    break;
	  case ISD::SRL:
	  case X86ISD::VSRL:
	  case X86ISD::VSRLI:
	    VariableOpc = X86ISD::VSRL;
	    ImmediateOpc = X86ISD::VSRLI;
	    break;
	  case ISD::SRA:
	  case X86ISD::VSRA:
	  case X86ISD::VSRAI:
	    VariableOpc = X86ISD::VSRA;
	    ImmediateOpc = X86ISD::VSRAI;
	    break;
	  }
	  return IsVariable ? VariableOpc : ImmediateOpc;
	}

	/// Handle vector element shifts where the shift amount is a constant.
	/// Takes immediate version of shift as input.
	///
	/// \p Opc must be X86ISD::VSHLI, X86ISD::VSRLI or X86ISD::VSRAI. \p SrcOp is
	/// bitcast to \p VT if needed. A zero shift returns the (bitcast) source; an
	/// amount of at least the element width yields a zero vector for logical
	/// shifts and is clamped to width - 1 for arithmetic right shifts. A build
	/// vector of constants is folded element-wise; otherwise the target shift node
	/// is created with the amount as an i8 target constant.
	static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
	                                          SDValue SrcOp, uint64_t ShiftAmt,
	                                          SelectionDAG &DAG) {
	  MVT ElementType = VT.getVectorElementType();

	  // Bitcast the source vector to the output type, this is mainly necessary for
	  // vXi8/vXi64 shifts.
	  if (VT != SrcOp.getSimpleValueType())
	    SrcOp = DAG.getBitcast(VT, SrcOp);

	  // Fold this packed shift into its first operand if ShiftAmt is 0.
	  if (ShiftAmt == 0)
	    return SrcOp;

	  // Check for ShiftAmt >= element width
	  if (ShiftAmt >= ElementType.getSizeInBits()) {
	    if (Opc == X86ISD::VSRAI)
	      ShiftAmt = ElementType.getSizeInBits() - 1;
	    else
	      return DAG.getConstant(0, dl, VT);
	  }

	  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
	         && "Unknown target vector shift-by-constant node");

	  // Fold this packed vector shift into a build vector if SrcOp is a
	  // vector of Constants or UNDEFs.
	  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
	    SmallVector<SDValue, 8> Elts;
	    unsigned NumElts = SrcOp->getNumOperands();

	    for (unsigned i = 0; i != NumElts; ++i) {
	      SDValue CurrentOp = SrcOp->getOperand(i);
	      if (CurrentOp->isUndef()) {
	        // An undef element is folded to 0 for every shift kind: left/logical
	        // shifts must produce 0s in the shifted-in bits, and an arithmetic
	        // shift needs all shifted-in bits to be the same.
	        Elts.push_back(DAG.getConstant(0, dl, ElementType));
	        continue;
	      }

	      const APInt &C = cast<ConstantSDNode>(CurrentOp)->getAPIntValue();
	      APInt Shifted;
	      switch (Opc) {
	      default: llvm_unreachable("Unknown opcode!");
	      case X86ISD::VSHLI:
	        Shifted = C.shl(ShiftAmt);
	        break;
	      case X86ISD::VSRLI:
	        Shifted = C.lshr(ShiftAmt);
	        break;
	      case X86ISD::VSRAI:
	        Shifted = C.ashr(ShiftAmt);
	        break;
	      }
	      Elts.push_back(DAG.getConstant(Shifted, dl, ElementType));
	    }

	    return DAG.getBuildVector(VT, dl, Elts);
	  }

	  return DAG.getNode(Opc, dl, VT, SrcOp,
	                     DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
	}

	/// Handle vector element shifts where the shift amount may or may not be a
	/// constant. Takes immediate version of shift as input.
	///
	/// A constant \p ShAmt is forwarded to getTargetVShiftByConstNode. Otherwise
	/// the opcode is switched to its variable-amount form and the scalar amount
	/// (i32 or i64) is placed in the low 64 bits of a 128-bit vector, which is
	/// bitcast to a 128-bit vector with the element type of \p VT.
	static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
	                                   SDValue SrcOp, SDValue ShAmt,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  MVT SVT = ShAmt.getSimpleValueType();
	  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");

	  // Catch shift-by-constant.
	  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
	    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
	                                      CShAmt->getZExtValue(), DAG);

	  // Change opcode to non-immediate version.
	  Opc = getTargetVShiftUniformOpcode(Opc, true);

	  // Need to build a vector containing shift amount.
	  // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
	  // +====================+============+=======================================+
	  // | ShAmt is           | HasSSE4.1? | Construct ShAmt vector as             |
	  // +====================+============+=======================================+
	  // | i64                | Yes, No    | Use ShAmt as lowest elt               |
	  // | i32                | Yes        | zero-extend in-reg                    |
	  // | (i32 zext(i16/i8)) | Yes        | zero-extend in-reg                    |
	  // | (i32 zext(i16/i8)) | No         | byte-shift-in-reg                     |
	  // | i16/i32            | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
	  // +====================+============+=======================================+

	  if (SVT == MVT::i64)
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
	  else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
	           ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	           (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
	            ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
	    // Work on the narrow i8/i16 value that feeds the zero-extend.
	    ShAmt = ShAmt.getOperand(0);
	    MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
	    if (Subtarget.hasSSE41())
	      ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
	                          MVT::v2i64, ShAmt);
	    else {
	      // Shift the whole register left and then right by the same byte count
	      // so that only the lowest element stays set.
	      SDValue ByteShift = DAG.getTargetConstant(
	          (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
	      ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
	      ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
	                          ByteShift);
	      ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
	                          ByteShift);
	    }
	  } else if (Subtarget.hasSSE41() &&
	             ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
	    ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
	                        MVT::v2i64, ShAmt);
	  } else {
	    SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
	                        DAG.getUNDEF(SVT)};
	    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
	  }

	  // The return type has to be a 128-bit type with the same element
	  // type as the input type.
	  MVT EltVT = VT.getVectorElementType();
	  MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());

	  ShAmt = DAG.getBitcast(ShVT, ShAmt);
	  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
	}

	/// Convert the scalar \p Mask operand of a masking intrinsic into a value of
	/// the vXi1 type \p MaskVT.
	///
	/// All-ones and zero masks become constants of type \p MaskVT. An i64 mask on
	/// a 32-bit target is split into two i32 halves that are bitcast to v32i1 and
	/// concatenated. Every other mask is bitcast to a vXi1 vector of its own bit
	/// width, from which the low \p MaskVT elements are extracted.
	static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
	                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
	                           const SDLoc &dl) {
	  // Constant masks need no casting at all.
	  if (isAllOnesConstant(Mask))
	    return DAG.getConstant(1, dl, MaskVT);
	  if (X86::isZeroNode(Mask))
	    return DAG.getConstant(0, dl, MaskVT);

	  assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");

	  const bool NeedsI64Split =
	      Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit();
	  if (!NeedsI64Split) {
	    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
	                                     Mask.getSimpleValueType().getSizeInBits());
	    // When MaskVT is narrower than the bitcast type (e.g. v2i1 or v4i1), the
	    // EXTRACT_SUBVECTOR keeps only the low elements.
	    SDValue MaskBits = DAG.getBitcast(BitcastVT, Mask);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, MaskBits,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
	  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
	  // A bitcast of i64 is illegal in 32-bit mode, so take the two 32-bit halves
	  // separately and glue the resulting v32i1 vectors together.
	  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
	                               DAG.getConstant(0, dl, MVT::i32));
	  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
	                               DAG.getConstant(1, dl, MVT::i32));

	  LoHalf = DAG.getBitcast(MVT::v32i1, LoHalf);
	  HiHalf = DAG.getBitcast(MVT::v32i1, HiHalf);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, LoHalf, HiHalf);
	}

	/// Apply \p Mask to the vector result \p Op of a masking intrinsic.
	///
	/// Returns \p Op unchanged when \p Mask is the all-ones constant. Otherwise
	/// returns (vselect VMask, \p Op, \p PreservedSrc), where VMask is \p Mask
	/// converted by getMaskNode to a vXi1 vector with one element per element of
	/// \p Op, and an undef \p PreservedSrc is replaced by a zero vector.
	static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
	                                    SDValue PreservedSrc,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  const MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	  SDLoc dl(Op);

	  // Nothing is masked off: the operation result is used as is.
	  if (isAllOnesConstant(Mask))
	    return Op;

	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	  // Masked-off elements come from PreservedSrc, or are zero if it is undef.
	  SDValue Passthru = PreservedSrc.isUndef()
	                         ? getZeroVector(VT, Subtarget, DAG, dl)
	                         : PreservedSrc;
	  return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, Passthru);
	}

	/// Creates an SDNode for a predicated scalar operation.
	///
	/// \returns \p Op unchanged when \p Mask is a constant with bit 0 set.
	/// Otherwise the i8 \p Mask is converted to a v1i1 value (bitcast to v8i1,
	/// then the lowest element is extracted) and the result is
	///  - (and \p Op, mask) when \p Op is an X86ISD::FSETCCM,
	///    X86ISD::FSETCCM_SAE or X86ISD::VFPCLASSS node;
	///  - (X86ISD::SELECTS mask, \p Op, \p PreservedSrc) in all other cases, with
	///    an undef \p PreservedSrc replaced by a zero vector.
	/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
	/// X86ISD::SELECTS instead of "vselect". We just can't create the "vselect"
	/// node for a scalar instruction.
	static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
	                                    SDValue PreservedSrc,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {

	  // Only bit 0 of the mask matters for a scalar operation.
	  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
	    if (MaskConst->getZExtValue() & 0x1)
	      return Op;

	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
	  SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
	                              DAG.getBitcast(MVT::v8i1, Mask),
	                              DAG.getIntPtrConstant(0, dl));
	  if (Op.getOpcode() == X86ISD::FSETCCM ||
	      Op.getOpcode() == X86ISD::FSETCCM_SAE ||
	      Op.getOpcode() == X86ISD::VFPCLASSS)
	    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);

	  if (PreservedSrc.isUndef())
	    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
	  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
	}

	static int getSEHRegistrationNodeSize(const Function *Fn) {
	if (!Fn->hasPersonalityFn())
	report_fatal_error(
	"querying registration node size for function without personality");
	// The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
	// WinEHStatePass for the full struct definition.
	switch (classifyEHPersonality(Fn->getPersonalityFn())) {
	case EHPersonality::MSVC_X86SEH: return 24;
	case EHPersonality::MSVC_CXX: return 16;
	default: break;
	}
	report_fatal_error(
	"can only recover FP for 32-bit MSVC EH personality functions");
	}

	/// Recover the parent function's frame pointer from the EBP value that is
	/// live on entry, for code entered from the MSVC runtime (an outlined function,
	/// or a return to the parent frame after catching an exception).
	///
	/// On 32-bit targets the computation is:
	///   RegNodeBase = EntryEBP - RegNodeSize
	///   ParentFP    = RegNodeBase - ParentFrameOffset
	/// Subtracting RegNodeSize gives the offset of the registration node, and
	/// subtracting the offset (negative on x86) leads back to the parent FP.
	/// On 64-bit targets the result is EntryEBP + ParentFrameOffset. If \p Fn has
	/// no personality function, \p EntryEBP is returned unchanged.
	static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
	                                   SDValue EntryEBP) {
	  // The parent function may have lost its personality function when the
	  // exceptional code was optimized away; the incoming EBP is then the answer.
	  if (!Fn->hasPersonalityFn())
	    return EntryEBP;

	  MachineFunction &MF = DAG.getMachineFunction();
	  const MVT PtrVT =
	      DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
	  SDLoc dl;

	  // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
	  // registration, or the .set_setframe offset.
	  MCSymbol *OffsetSym =
	      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
	          GlobalValue::dropLLVMManglingEscape(Fn->getName()));
	  SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT,
	                                          DAG.getMCSymbol(OffsetSym, PtrVT));

	  // x64: EntryEBP + ParentFrameOffset adjusts from RSP after the prologue to
	  // RBP in the parent function.
	  const auto &Subtarget =
	      static_cast<const X86Subtarget &>(DAG.getSubtarget());
	  if (Subtarget.is64Bit())
	    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

	  // x86: step back over the registration node, then over the frame offset.
	  const int RegNodeSize = getSEHRegistrationNodeSize(Fn);
	  SDValue RegNodeSizeVal = DAG.getConstant(RegNodeSize, dl, PtrVT);
	  SDValue RegNodeBase =
	      DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, RegNodeSizeVal);
	  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
	}

	/// Custom-lower an intrinsic node that has no chain operand. Operand 0 of
	/// \p Op is the intrinsic ID and the intrinsic's own arguments start at
	/// operand 1.
	///
	/// If getIntrinsicWithoutChain() returns a table entry for the ID, the node
	/// is lowered according to the entry's Type using the opcodes stored in
	/// Opc0/Opc1. Anything not handled by the table falls through to a second
	/// switch that handles a fixed set of intrinsics by ID.
	///
	/// \returns the replacement value, or an empty SDValue when the intrinsic is
	/// not custom lowered here or when a rounding/SAE operand has a value that
	/// none of the helpers below accept.
	SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
	SelectionDAG &DAG) const {
	// Helper to detect if the operand is CUR_DIRECTION rounding mode.
	auto isRoundModeCurDirection = [](SDValue Rnd) {
	if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
	return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;

	return false;
	};
	// Helper to detect a suppress-all-exceptions operand: the constant must have
	// the NO_EXC bit set and nothing else, apart from an optional CUR_DIRECTION.
	auto isRoundModeSAE = [](SDValue Rnd) {
	if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
	unsigned RC = C->getZExtValue();
	if (RC & X86::STATIC_ROUNDING::NO_EXC) {
	// Clear the NO_EXC bit and check remaining bits.
	RC ^= X86::STATIC_ROUNDING::NO_EXC;
	// As a convenience we allow no other bits or explicitly
	// current direction.
	return RC == 0 \|\| RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
	}
	}

	return false;
	};
	// Helper to detect NO_EXC combined with one explicit rounding direction. On
	// success RC holds the direction with the NO_EXC bit cleared. Note that RC is
	// overwritten whenever Rnd is a constant, even if the helper returns false.
	auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
	if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
	RC = C->getZExtValue();
	if (RC & X86::STATIC_ROUNDING::NO_EXC) {
	// Clear the NO_EXC bit and check remaining bits.
	RC ^= X86::STATIC_ROUNDING::NO_EXC;
	return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT \|\|
	RC == X86::STATIC_ROUNDING::TO_NEG_INF \|\|
	RC == X86::STATIC_ROUNDING::TO_POS_INF \|\|
	RC == X86::STATIC_ROUNDING::TO_ZERO;
	}
	}

	return false;
	};

	SDLoc dl(Op);
	// Operand 0 is the intrinsic ID.
	unsigned IntNo = Op.getConstantOperandVal(0);
	MVT VT = Op.getSimpleValueType();
	const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);

	// Table-driven lowering. Types without a case here reach the 'default' label
	// and continue with the per-ID switch further down.
	if (IntrData) {
	switch(IntrData->Type) {
	case INTR_TYPE_1OP: {
	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(2);
	unsigned RC = 0;
	if (isRoundModeSAEToX(Rnd, RC))
	return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
	Op.getOperand(1),
	DAG.getTargetConstant(RC, dl, MVT::i32));
	if (!isRoundModeCurDirection(Rnd))
	return SDValue();
	}
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(1));
	}
	case INTR_TYPE_1OP_SAE: {
	SDValue Sae = Op.getOperand(2);

	// Opc0 is the plain form, Opc1 the SAE form.
	unsigned Opc;
	if (isRoundModeCurDirection(Sae))
	Opc = IntrData->Opc0;
	else if (isRoundModeSAE(Sae))
	Opc = IntrData->Opc1;
	else
	return SDValue();

	return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
	}
	case INTR_TYPE_2OP: {
	SDValue Src2 = Op.getOperand(2);

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(3);
	unsigned RC = 0;
	if (isRoundModeSAEToX(Rnd, RC))
	return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
	Op.getOperand(1), Src2,
	DAG.getTargetConstant(RC, dl, MVT::i32));
	if (!isRoundModeCurDirection(Rnd))
	return SDValue();
	}

	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(1), Src2);
	}
	case INTR_TYPE_2OP_SAE: {
	SDValue Sae = Op.getOperand(3);

	unsigned Opc;
	if (isRoundModeCurDirection(Sae))
	Opc = IntrData->Opc0;
	else if (isRoundModeSAE(Sae))
	Opc = IntrData->Opc1;
	else
	return SDValue();

	return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2));
	}
	case INTR_TYPE_3OP:
	case INTR_TYPE_3OP_IMM8: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(4);
	unsigned RC = 0;
	if (isRoundModeSAEToX(Rnd, RC))
	return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
	Src1, Src2, Src3,
	DAG.getTargetConstant(RC, dl, MVT::i32));
	if (!isRoundModeCurDirection(Rnd))
	return SDValue();
	}

	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	{Src1, Src2, Src3});
	}
	case INTR_TYPE_4OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
	case INTR_TYPE_1OP_MASK: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	// We add rounding mode to the Node when
	// - RC Opcode is specified and
	// - RC is not "current direction".
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(4);
	unsigned RC = 0;
	if (isRoundModeSAEToX(Rnd, RC))
	return getVectorMaskingNode(
	DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
	Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
	Mask, PassThru, Subtarget, DAG);
	if (!isRoundModeCurDirection(Rnd))
	return SDValue();
	}
	return getVectorMaskingNode(
	DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
	Subtarget, DAG);
	}
	case INTR_TYPE_1OP_MASK_SAE: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	SDValue Rnd = Op.getOperand(4);

	unsigned Opc;
	if (isRoundModeCurDirection(Rnd))
	Opc = IntrData->Opc0;
	else if (isRoundModeSAE(Rnd))
	Opc = IntrData->Opc1;
	else
	return SDValue();

	return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
	Subtarget, DAG);
	}
	case INTR_TYPE_SCALAR_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue passThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	// There are 2 kinds of intrinsics in this group:
	// (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
	// (2) With rounding mode and sae - 7 operands.
	// NOTE: the counts checked below include the intrinsic ID at operand 0 and
	// depend on HasRounding: 5 or 6 operands when Opc1 is 0, 6 or 7 otherwise.
	bool HasRounding = IntrWithRoundingModeOpcode != 0;
	if (Op.getNumOperands() == (5U + HasRounding)) {
	if (HasRounding) {
	SDValue Rnd = Op.getOperand(5);
	unsigned RC = 0;
	if (isRoundModeSAEToX(Rnd, RC))
	return getScalarMaskingNode(
	DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
	DAG.getTargetConstant(RC, dl, MVT::i32)),
	Mask, passThru, Subtarget, DAG);
	if (!isRoundModeCurDirection(Rnd))
	return SDValue();
	}
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
	Src2),
	Mask, passThru, Subtarget, DAG);
	}

	// Longer form: operand 5 is passed through to the node as an extra operand
	// and, when Opc1 is set, operand 6 selects between Opc0 and Opc1.
	assert(Op.getNumOperands() == (6U + HasRounding) &&
	"Unexpected intrinsic form");
	SDValue RoundingMode = Op.getOperand(5);
	unsigned Opc = IntrData->Opc0;
	if (HasRounding) {
	SDValue Sae = Op.getOperand(6);
	if (isRoundModeSAE(Sae))
	Opc = IntrWithRoundingModeOpcode;
	else if (!isRoundModeCurDirection(Sae))
	return SDValue();
	}
	return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
	Src2, RoundingMode),
	Mask, passThru, Subtarget, DAG);
	}
	case INTR_TYPE_SCALAR_MASK_RND: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue passThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	SDValue Rnd = Op.getOperand(5);

	// Opc0 takes no rounding operand; Opc1 takes the rounding direction as an
	// extra i32 target constant.
	SDValue NewOp;
	unsigned RC = 0;
	if (isRoundModeCurDirection(Rnd))
	NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
	else if (isRoundModeSAEToX(Rnd, RC))
	NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
	DAG.getTargetConstant(RC, dl, MVT::i32));
	else
	return SDValue();

	return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
	}
	case INTR_TYPE_SCALAR_MASK_SAE: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue passThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	SDValue Sae = Op.getOperand(5);
	unsigned Opc;
	if (isRoundModeCurDirection(Sae))
	Opc = IntrData->Opc0;
	else if (isRoundModeSAE(Sae))
	Opc = IntrData->Opc1;
	else
	return SDValue();

	return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
	Mask, passThru, Subtarget, DAG);
	}
	case INTR_TYPE_2OP_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	SDValue NewOp;
	if (IntrData->Opc1 != 0) {
	SDValue Rnd = Op.getOperand(5);
	unsigned RC = 0;
	if (isRoundModeSAEToX(Rnd, RC))
	NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
	DAG.getTargetConstant(RC, dl, MVT::i32));
	else if (!isRoundModeCurDirection(Rnd))
	return SDValue();
	}
	// No explicit rounding direction was requested: use the plain opcode.
	if (!NewOp)
	NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
	return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_2OP_MASK_SAE: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);

	unsigned Opc = IntrData->Opc0;
	if (IntrData->Opc1 != 0) {
	SDValue Sae = Op.getOperand(5);
	if (isRoundModeSAE(Sae))
	Opc = IntrData->Opc1;
	else if (!isRoundModeCurDirection(Sae))
	return SDValue();
	}

	return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	SDValue Sae = Op.getOperand(6);
	unsigned Opc;
	if (isRoundModeCurDirection(Sae))
	Opc = IntrData->Opc0;
	else if (isRoundModeSAE(Sae))
	Opc = IntrData->Opc1;
	else
	return SDValue();

	return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_MASK_SAE: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);

	unsigned Opc = IntrData->Opc0;
	if (IntrData->Opc1 != 0) {
	SDValue Sae = Op.getOperand(6);
	if (isRoundModeSAE(Sae))
	Opc = IntrData->Opc1;
	else if (!isRoundModeCurDirection(Sae))
	return SDValue();
	}
	return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case BLENDV: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);

	// The selector is bitcast to the integer vector type with the same shape.
	EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
	Src3 = DAG.getBitcast(MaskVT, Src3);

	// Reverse the operands to match VSELECT order.
	return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
	}
	case VPERM_2OP : {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);

	// Swap Src1 and Src2 in the node creation
	return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
	}
	case IFMA_OP:
	// NOTE: We need to swizzle the operands to pass the multiply operands
	// first.
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
	case FPCLASSS: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Imm = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	// The scalar classification yields a single mask bit (v1i1).
	SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
	SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
	Subtarget, DAG);
	// Need to fill with zeros to ensure the bitcast will produce zeroes
	// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
	SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
	DAG.getConstant(0, dl, MVT::v8i1),
	FPclassMask, DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(MVT::i8, Ins);
	}

	case CMP_MASK_CC: {
	MVT MaskVT = Op.getSimpleValueType();
	SDValue CC = Op.getOperand(3);
	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	if (IntrData->Opc1 != 0) {
	SDValue Sae = Op.getOperand(4);
	if (isRoundModeSAE(Sae))
	return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2), CC, Sae);
	if (!isRoundModeCurDirection(Sae))
	return SDValue();
	}
	// Default rounding mode.
	return DAG.getNode(IntrData->Opc0, dl, MaskVT,
	{Op.getOperand(1), Op.getOperand(2), CC});
	}
	case CMP_MASK_SCALAR_CC: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue CC = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);

	SDValue Cmp;
	if (IntrData->Opc1 != 0) {
	SDValue Sae = Op.getOperand(5);
	if (isRoundModeSAE(Sae))
	Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
	else if (!isRoundModeCurDirection(Sae))
	return SDValue();
	}
	// Default rounding mode.
	if (!Cmp.getNode())
	Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

	SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
	Subtarget, DAG);
	// Need to fill with zeros to ensure the bitcast will produce zeroes
	// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
	SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
	DAG.getConstant(0, dl, MVT::v8i1),
	CmpMask, DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(MVT::i8, Ins);
	}
	case COMI: { // Comparison intrinsics
	// For this type Opc1 stores an ISD::CondCode rather than a node opcode.
	ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	// Some conditions require the operands to be swapped.
	if (CC == ISD::SETLT \|\| CC == ISD::SETLE)
	std::swap(LHS, RHS);

	SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
	SDValue SetCC;
	switch (CC) {
	case ISD::SETEQ: { // (ZF = 1 and PF = 0)
	SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
	SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
	SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
	break;
	}
	case ISD::SETNE: { // (ZF = 0 or PF = 1)
	SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
	SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
	SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
	break;
	}
	case ISD::SETGT: // (CF = 0 and ZF = 0)
	case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
	SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
	break;
	}
	case ISD::SETGE: // CF = 0
	case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
	SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
	break;
	default:
	llvm_unreachable("Unexpected illegal condition!");
	}
	// The i8 setcc result is widened to the i32 the intrinsic returns.
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}
	case COMI_RM: { // Comparison intrinsics with Sae
	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	unsigned CondVal = Op.getConstantOperandVal(3);
	SDValue Sae = Op.getOperand(4);

	SDValue FCmp;
	if (isRoundModeCurDirection(Sae))
	FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
	DAG.getTargetConstant(CondVal, dl, MVT::i8));
	else if (isRoundModeSAE(Sae))
	FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
	DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
	else
	return SDValue();
	// Need to fill with zeros to ensure the bitcast will produce zeroes
	// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
	SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
	DAG.getConstant(0, dl, MVT::v16i1),
	FCmp, DAG.getIntPtrConstant(0, dl));
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
	DAG.getBitcast(MVT::i16, Ins));
	}
	case VSHIFT:
	return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
	Op.getOperand(1), Op.getOperand(2), Subtarget,
	DAG);
	case COMPRESS_EXPAND_IN_REG: {
	SDValue Mask = Op.getOperand(3);
	SDValue DataToCompress = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
	return Op.getOperand(1);

	// Avoid false dependency.
	if (PassThru.isUndef())
	PassThru = DAG.getConstant(0, dl, VT);

	return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
	Mask);
	}
	case FIXUPIMM:
	case FIXUPIMM_MASKZ: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Imm = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	// FIXUPIMM merges into Src1; FIXUPIMM_MASKZ uses a zero pass-through.
	SDValue Passthru = (IntrData->Type == FIXUPIMM)
	? Src1
	: getZeroVector(VT, Subtarget, DAG, dl);

	unsigned Opc = IntrData->Opc0;
	if (IntrData->Opc1 != 0) {
	SDValue Sae = Op.getOperand(6);
	if (isRoundModeSAE(Sae))
	Opc = IntrData->Opc1;
	else if (!isRoundModeCurDirection(Sae))
	return SDValue();
	}

	SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);

	// The two vector opcodes get vector masking; any other opcode reaching
	// this point is masked as a scalar operation.
	if (Opc == X86ISD::VFIXUPIMM \|\| Opc == X86ISD::VFIXUPIMM_SAE)
	return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);

	return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
	}
	case ROUNDP: {
	assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
	// Clear the upper bits of the rounding immediate so that the legacy
	// intrinsic can't trigger the scaling behavior of VRNDSCALE.
	auto Round = cast<ConstantSDNode>(Op.getOperand(2));
	SDValue RoundingMode =
	DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(1), RoundingMode);
	}
	case ROUNDS: {
	assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
	// Clear the upper bits of the rounding immediate so that the legacy
	// intrinsic can't trigger the scaling behavior of VRNDSCALE.
	auto Round = cast<ConstantSDNode>(Op.getOperand(3));
	SDValue RoundingMode =
	DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(1), Op.getOperand(2), RoundingMode);
	}
	case BEXTRI: {
	assert(IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode");

	// The control is a TargetConstant, but we need to convert it to a
	// ConstantSDNode.
	uint64_t Imm = Op.getConstantOperandVal(2);
	SDValue Control = DAG.getConstant(Imm, dl, Op.getValueType());
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(1), Control);
	}
	// ADC/ADCX/SBB
	case ADX: {
	// Both value-type lists carry an extra i32 as their second result; that
	// second result is what gets fed to the carry-consuming node and to the
	// final setcc below.
	SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
	SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);

	SDValue Res;
	// If the carry in is zero, then we should just use ADD/SUB instead of
	// ADC/SBB.
	if (isNullConstant(Op.getOperand(1))) {
	Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
	Op.getOperand(3));
	} else {
	// Add -1 to the carry-in value: as an 8-bit addition this carries out
	// exactly when the carry-in is nonzero. Only the second result of this
	// node is used, as the carry input of the Opc0 node.
	SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
	DAG.getConstant(-1, dl, MVT::i8));
	Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
	Op.getOperand(3), GenCF.getValue(1));
	}
	// The intrinsic returns two values: the carry-out (COND_B) and the result.
	SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
	SDValue Results[] = { SetCC, Res };
	return DAG.getMergeValues(Results, dl);
	}
	case CVTPD2PS_MASK:
	case CVTPD2DQ_MASK:
	case CVTQQ2PS_MASK:
	case TRUNCATE_TO_REG: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);

	// With an all-ones mask the unmasked opcode (Opc0) is sufficient.
	if (isAllOnesConstant(Mask))
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);

	// Otherwise use the masked opcode (Opc1) with a vXi1 mask that has one
	// element per source element.
	MVT SrcVT = Src.getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
	Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
	return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
	{Src, PassThru, Mask});
	}
	case CVTPS2PH_MASK: {
	SDValue Src = Op.getOperand(1);
	SDValue Rnd = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);

	// With an all-ones mask the unmasked opcode (Opc0) is sufficient.
	if (isAllOnesConstant(Mask))
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);

	MVT SrcVT = Src.getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
	Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
	return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
	PassThru, Mask);

	}
	case CVTNEPS2BF16_MASK: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);

	// With an all-ones mask the unmasked opcode (Opc0) is sufficient.
	if (ISD::isBuildVectorAllOnes(Mask.getNode()))
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);

	// Break false dependency.
	if (PassThru.isUndef())
	PassThru = DAG.getConstant(0, dl, PassThru.getValueType());

	return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
	Mask);
	}
	default:
	break;
	}
	}

	// Intrinsics handled individually by ID.
	switch (IntNo) {
	default: return SDValue(); // Don't custom lower most intrinsics.

	// ktest, ptest and testp intrinsics. The intrinsics these come from are
	// designed to return an integer value, not just an instruction, so lower
	// them to the test pattern and a setcc for the result.
	case Intrinsic::x86_avx512_ktestc_b:
	case Intrinsic::x86_avx512_ktestc_w:
	case Intrinsic::x86_avx512_ktestc_d:
	case Intrinsic::x86_avx512_ktestc_q:
	case Intrinsic::x86_avx512_ktestz_b:
	case Intrinsic::x86_avx512_ktestz_w:
	case Intrinsic::x86_avx512_ktestz_d:
	case Intrinsic::x86_avx512_ktestz_q:
	case Intrinsic::x86_sse41_ptestz:
	case Intrinsic::x86_sse41_ptestc:
	case Intrinsic::x86_sse41_ptestnzc:
	case Intrinsic::x86_avx_ptestz_256:
	case Intrinsic::x86_avx_ptestc_256:
	case Intrinsic::x86_avx_ptestnzc_256:
	case Intrinsic::x86_avx_vtestz_ps:
	case Intrinsic::x86_avx_vtestc_ps:
	case Intrinsic::x86_avx_vtestnzc_ps:
	case Intrinsic::x86_avx_vtestz_pd:
	case Intrinsic::x86_avx_vtestc_pd:
	case Intrinsic::x86_avx_vtestnzc_pd:
	case Intrinsic::x86_avx_vtestz_ps_256:
	case Intrinsic::x86_avx_vtestc_ps_256:
	case Intrinsic::x86_avx_vtestnzc_ps_256:
	case Intrinsic::x86_avx_vtestz_pd_256:
	case Intrinsic::x86_avx_vtestc_pd_256:
	case Intrinsic::x86_avx_vtestnzc_pd_256: {
	// PTEST is the default test node; the ktest and vtest groups override it
	// below. The vtest groups then fall through to share the condition code of
	// the matching ptest group.
	unsigned TestOpc = X86ISD::PTEST;
	X86::CondCode X86CC;
	switch (IntNo) {
	default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
	case Intrinsic::x86_avx512_ktestc_b:
	case Intrinsic::x86_avx512_ktestc_w:
	case Intrinsic::x86_avx512_ktestc_d:
	case Intrinsic::x86_avx512_ktestc_q:
	// CF = 1
	TestOpc = X86ISD::KTEST;
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_avx512_ktestz_b:
	case Intrinsic::x86_avx512_ktestz_w:
	case Intrinsic::x86_avx512_ktestz_d:
	case Intrinsic::x86_avx512_ktestz_q:
	// ZF = 1
	TestOpc = X86ISD::KTEST;
	X86CC = X86::COND_E;
	break;
	case Intrinsic::x86_avx_vtestz_ps:
	case Intrinsic::x86_avx_vtestz_pd:
	case Intrinsic::x86_avx_vtestz_ps_256:
	case Intrinsic::x86_avx_vtestz_pd_256:
	TestOpc = X86ISD::TESTP;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestz:
	case Intrinsic::x86_avx_ptestz_256:
	// ZF = 1
	X86CC = X86::COND_E;
	break;
	case Intrinsic::x86_avx_vtestc_ps:
	case Intrinsic::x86_avx_vtestc_pd:
	case Intrinsic::x86_avx_vtestc_ps_256:
	case Intrinsic::x86_avx_vtestc_pd_256:
	TestOpc = X86ISD::TESTP;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestc:
	case Intrinsic::x86_avx_ptestc_256:
	// CF = 1
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_avx_vtestnzc_ps:
	case Intrinsic::x86_avx_vtestnzc_pd:
	case Intrinsic::x86_avx_vtestnzc_ps_256:
	case Intrinsic::x86_avx_vtestnzc_pd_256:
	TestOpc = X86ISD::TESTP;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestnzc:
	case Intrinsic::x86_avx_ptestnzc_256:
	// ZF and CF = 0
	X86CC = X86::COND_A;
	break;
	}

	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
	SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}

	// String-compare intrinsics that return a single condition. The compare
	// node is created with three results (i32, v16i8, i32); the third one is
	// tested with the selected condition code.
	case Intrinsic::x86_sse42_pcmpistria128:
	case Intrinsic::x86_sse42_pcmpestria128:
	case Intrinsic::x86_sse42_pcmpistric128:
	case Intrinsic::x86_sse42_pcmpestric128:
	case Intrinsic::x86_sse42_pcmpistrio128:
	case Intrinsic::x86_sse42_pcmpestrio128:
	case Intrinsic::x86_sse42_pcmpistris128:
	case Intrinsic::x86_sse42_pcmpestris128:
	case Intrinsic::x86_sse42_pcmpistriz128:
	case Intrinsic::x86_sse42_pcmpestriz128: {
	unsigned Opcode;
	X86::CondCode X86CC;
	switch (IntNo) {
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	case Intrinsic::x86_sse42_pcmpistria128:
	Opcode = X86ISD::PCMPISTR;
	X86CC = X86::COND_A;
	break;
	case Intrinsic::x86_sse42_pcmpestria128:
	Opcode = X86ISD::PCMPESTR;
	X86CC = X86::COND_A;
	break;
	case Intrinsic::x86_sse42_pcmpistric128:
	Opcode = X86ISD::PCMPISTR;
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_sse42_pcmpestric128:
	Opcode = X86ISD::PCMPESTR;
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_sse42_pcmpistrio128:
	Opcode = X86ISD::PCMPISTR;
	X86CC = X86::COND_O;
	break;
	case Intrinsic::x86_sse42_pcmpestrio128:
	Opcode = X86ISD::PCMPESTR;
	X86CC = X86::COND_O;
	break;
	case Intrinsic::x86_sse42_pcmpistris128:
	Opcode = X86ISD::PCMPISTR;
	X86CC = X86::COND_S;
	break;
	case Intrinsic::x86_sse42_pcmpestris128:
	Opcode = X86ISD::PCMPESTR;
	X86CC = X86::COND_S;
	break;
	case Intrinsic::x86_sse42_pcmpistriz128:
	Opcode = X86ISD::PCMPISTR;
	X86CC = X86::COND_E;
	break;
	case Intrinsic::x86_sse42_pcmpestriz128:
	Opcode = X86ISD::PCMPESTR;
	X86CC = X86::COND_E;
	break;
	}
	// Forward every operand except the intrinsic ID.
	SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
	SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
	SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
	SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}

	// Index-returning string compares: use the first (i32) result of the node.
	case Intrinsic::x86_sse42_pcmpistri128:
	case Intrinsic::x86_sse42_pcmpestri128: {
	unsigned Opcode;
	if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
	Opcode = X86ISD::PCMPISTR;
	else
	Opcode = X86ISD::PCMPESTR;

	SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
	SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
	return DAG.getNode(Opcode, dl, VTs, NewOps);
	}

	// Mask-returning string compares: use the second (v16i8) result of the node.
	case Intrinsic::x86_sse42_pcmpistrm128:
	case Intrinsic::x86_sse42_pcmpestrm128: {
	unsigned Opcode;
	if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
	Opcode = X86ISD::PCMPISTR;
	else
	Opcode = X86ISD::PCMPESTR;

	SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
	SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
	return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
	}

	case Intrinsic::eh_sjlj_lsda: {
	// Reference the symbol "GCC_except_table<N>", where N is the number of the
	// current machine function.
	MachineFunction &MF = DAG.getMachineFunction();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	auto &Context = MF.getMMI().getContext();
	MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
	Twine(MF.getFunctionNumber()));
	return DAG.getNode(getGlobalWrapperKind(), dl, VT,
	DAG.getMCSymbol(S, PtrVT));
	}

	case Intrinsic::x86_seh_lsda: {
	// Compute the symbol for the LSDA. We know it'll get emitted later.
	MachineFunction &MF = DAG.getMachineFunction();
	SDValue Op1 = Op.getOperand(1);
	auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
	MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
	GlobalValue::dropLLVMManglingEscape(Fn->getName()));

	// Generate a simple absolute symbol reference. This intrinsic is only
	// supported on 32-bit Windows, which isn't PIC.
	SDValue Result = DAG.getMCSymbol(LSDASym, VT);
	return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
	}

	case Intrinsic::eh_recoverfp: {
	// The first argument must be a global address that refers to a Function;
	// anything else is a fatal error.
	SDValue FnOp = Op.getOperand(1);
	SDValue IncomingFPOp = Op.getOperand(2);
	GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
	auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
	if (!Fn)
	report_fatal_error(
	"llvm.eh.recoverfp must take a function as the first argument");
	return recoverFramePointer(DAG, Fn, IncomingFPOp);
	}

	case Intrinsic::localaddress: {
	// Returns one of the stack, base, or frame pointer registers, depending on
	// which is used to reference local variables.
	MachineFunction &MF = DAG.getMachineFunction();
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	unsigned Reg;
	if (RegInfo->hasBasePointer(MF))
	Reg = RegInfo->getBaseRegister();
	else { // Handles the SP or FP case.
	bool CantUseFP = RegInfo->needsStackRealignment(MF);
	if (CantUseFP)
	Reg = RegInfo->getPtrSizedStackRegister(MF);
	else
	Reg = RegInfo->getPtrSizedFrameRegister(MF);
	}
	return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
	}

	case Intrinsic::x86_avx512_vp2intersect_q_512:
	case Intrinsic::x86_avx512_vp2intersect_q_256:
	case Intrinsic::x86_avx512_vp2intersect_q_128:
	case Intrinsic::x86_avx512_vp2intersect_d_512:
	case Intrinsic::x86_avx512_vp2intersect_d_256:
	case Intrinsic::x86_avx512_vp2intersect_d_128: {
	MVT MaskVT = Op.getSimpleValueType();

	// The node produces one untyped value; the two mask results are taken from
	// it as the sub_mask_0 and sub_mask_1 subregisters.
	SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
	SDLoc DL(Op);

	SDValue Operation =
	DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
	Op->getOperand(1), Op->getOperand(2));

	SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
	MaskVT, Operation);
	SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
	MaskVT, Operation);
	return DAG.getMergeValues({Result0, Result1}, DL);
	}
	case Intrinsic::x86_mmx_pslli_w:
	case Intrinsic::x86_mmx_pslli_d:
	case Intrinsic::x86_mmx_pslli_q:
	case Intrinsic::x86_mmx_psrli_w:
	case Intrinsic::x86_mmx_psrli_d:
	case Intrinsic::x86_mmx_psrli_q:
	case Intrinsic::x86_mmx_psrai_w:
	case Intrinsic::x86_mmx_psrai_d: {
	SDLoc DL(Op);
	SDValue ShAmt = Op.getOperand(2);
	// If the argument is a constant, convert it to a target constant.
	if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
	// Clamp out of bounds shift amounts since they will otherwise be masked
	// to 8-bits which may make it no longer out of bounds.
	unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
	// A shift by zero is a no-op: return the shifted operand unchanged.
	if (ShiftAmount == 0)
	return Op.getOperand(1);

	return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
	Op.getOperand(0), Op.getOperand(1),
	DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
	}

	// Non-constant shift amount: switch from the immediate-form intrinsic to
	// the corresponding intrinsic without the 'i' suffix.
	unsigned NewIntrinsic;
	switch (IntNo) {
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	case Intrinsic::x86_mmx_pslli_w:
	NewIntrinsic = Intrinsic::x86_mmx_psll_w;
	break;
	case Intrinsic::x86_mmx_pslli_d:
	NewIntrinsic = Intrinsic::x86_mmx_psll_d;
	break;
	case Intrinsic::x86_mmx_pslli_q:
	NewIntrinsic = Intrinsic::x86_mmx_psll_q;
	break;
	case Intrinsic::x86_mmx_psrli_w:
	NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
	break;
	case Intrinsic::x86_mmx_psrli_d:
	NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
	break;
	case Intrinsic::x86_mmx_psrli_q:
	NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
	break;
	case Intrinsic::x86_mmx_psrai_w:
	NewIntrinsic = Intrinsic::x86_mmx_psra_w;
	break;
	case Intrinsic::x86_mmx_psrai_d:
	NewIntrinsic = Intrinsic::x86_mmx_psra_d;
	break;
	}

	// The vector shift intrinsics with scalars use 32b shift amounts but
	// the sse2/mmx shift instructions read 64 bits. Copy the 32 bits to an
	// MMX register.
	ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
	return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
	DAG.getConstant(NewIntrinsic, DL, MVT::i32),
	Op.getOperand(1), ShAmt);

	}
	}
	}

	/// Build an X86ISD::MGATHER memory-intrinsic node for the gather intrinsic
	/// \p Op, whose mask is bitcast to an integer vector of the same shape.
	///
	/// \param Opc      Unused by this helper.
	/// \param Op       The intrinsic node being lowered; it must be a
	///                 MemIntrinsicSDNode, whose memory VT and memory operand are
	///                 reused for the new node.
	/// \param Src      Pass-through value for the gathered result.
	/// \param Mask     Gather mask.
	/// \param Base     Base address operand.
	/// \param Index    Index vector operand.
	/// \param ScaleOp  Scale operand; must be a constant.
	/// \param Chain    Input chain.
	/// \returns the merged pair {result 0, result 1} of the new node, or an empty
	/// SDValue when \p ScaleOp is not a constant.
	static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                                 SDValue Src, SDValue Mask, SDValue Base,
	                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
	                                 const X86Subtarget &Subtarget) {
	  // Scale must be constant.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (ScaleC == nullptr)
	    return SDValue();

	  SDLoc DL(Op);

	  // Re-emit the scale as a pointer-typed target constant.
	  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
	  SDValue Scale = DAG.getTargetConstant(ScaleC->getZExtValue(), DL, PtrVT);

	  // If source is undef or we know it won't be used, use a zero vector
	  // to break register dependency.
	  // TODO: use undef instead and let BreakFalseDeps deal with it?
	  const bool PassThruUnused =
	      Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode());
	  if (PassThruUnused)
	    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, DL);

	  // Cast mask to an integer type.
	  EVT IntMaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
	  Mask = DAG.getBitcast(IntMaskVT, Mask);

	  auto *MemIntr = cast<MemIntrinsicSDNode>(Op);

	  // The new node yields the gathered value plus an MVT::Other result.
	  SDVTList ResultVTs = DAG.getVTList(Op.getValueType(), MVT::Other);
	  SDValue GatherOps[] = {Chain, Src, Mask, Base, Index, Scale};
	  SDValue Gather = DAG.getMemIntrinsicNode(X86ISD::MGATHER, DL, ResultVTs,
	                                           GatherOps, MemIntr->getMemoryVT(),
	                                           MemIntr->getMemOperand());
	  return DAG.getMergeValues({Gather, Gather.getValue(1)}, DL);
	}

	/// Build an X86ISD::MGATHER memory-intrinsic node for the gather intrinsic
	/// \p Op, using a vXi1 mask.
	///
	/// The mask may be given either as a scalar or as a vXi1 vector; it is
	/// converted with getMaskNode() whenever its type differs from the expected
	/// vXi1 type, whose element count is the smaller of the index and result
	/// vector element counts.
	///
	/// \param Op       The intrinsic node being lowered; it must be a
	///                 MemIntrinsicSDNode, whose memory VT and memory operand are
	///                 reused for the new node.
	/// \param Src      Pass-through value for the gathered result.
	/// \param ScaleOp  Scale operand; must be a constant.
	/// \returns the merged pair {result 0, result 1} of the new node, or an empty
	/// SDValue when \p ScaleOp is not a constant.
	static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
	                             SDValue Src, SDValue Mask, SDValue Base,
	                             SDValue Index, SDValue ScaleOp, SDValue Chain,
	                             const X86Subtarget &Subtarget) {
	  MVT ResVT = Op.getSimpleValueType();

	  // Scale must be constant.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (ScaleC == nullptr)
	    return SDValue();

	  SDLoc DL(Op);

	  // Re-emit the scale as a pointer-typed target constant.
	  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
	  SDValue Scale = DAG.getTargetConstant(ScaleC->getZExtValue(), DL, PtrVT);

	  // One mask bit per lane, for the narrower of the index and result vectors.
	  const unsigned IndexElts = Index.getSimpleValueType().getVectorNumElements();
	  const unsigned ResElts = ResVT.getVectorNumElements();
	  MVT MaskVT = MVT::getVectorVT(MVT::i1, std::min(IndexElts, ResElts));

	  // We support two versions of the gather intrinsics. One with scalar mask and
	  // one with vXi1 mask. Convert scalar to vXi1 if necessary.
	  if (Mask.getValueType() != MaskVT)
	    Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, DL);

	  // If source is undef or we know it won't be used, use a zero vector
	  // to break register dependency.
	  // TODO: use undef instead and let BreakFalseDeps deal with it?
	  const bool PassThruUnused =
	      Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode());
	  if (PassThruUnused)
	    Src = getZeroVector(ResVT, Subtarget, DAG, DL);

	  auto *MemIntr = cast<MemIntrinsicSDNode>(Op);

	  // The new node yields the gathered value plus an MVT::Other result.
	  SDVTList ResultVTs = DAG.getVTList(Op.getValueType(), MVT::Other);
	  SDValue GatherOps[] = {Chain, Src, Mask, Base, Index, Scale};
	  SDValue Gather = DAG.getMemIntrinsicNode(X86ISD::MGATHER, DL, ResultVTs,
	                                           GatherOps, MemIntr->getMemoryVT(),
	                                           MemIntr->getMemOperand());
	  return DAG.getMergeValues({Gather, Gather.getValue(1)}, DL);
	}

	/// Lower a scatter intrinsic node \p Op (scalar-mask or vXi1-mask form) to an
	/// X86ISD::MSCATTER memory intrinsic node.
	///
	/// The memory VT and memory operand come from \p Op, which is cast to
	/// MemIntrinsicSDNode. \p Opc is not referenced by this function.
	///
	/// \returns the chain produced by the scatter node, or an empty SDValue when
	/// \p ScaleOp is not a ConstantSDNode.
	static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                              SDValue Src, SDValue Mask, SDValue Base,
	                              SDValue Index, SDValue ScaleOp, SDValue Chain,
	                              const X86Subtarget &Subtarget) {
	  // Only a constant scale can be encoded; bail out otherwise.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (ScaleC == nullptr)
	    return SDValue();

	  SDLoc Loc(Op);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDValue Scale = DAG.getTargetConstant(ScaleC->getZExtValue(), Loc,
	                                        TLI.getPointerTy(DAG.getDataLayout()));

	  // The mask carries one bit per element of the narrower of index and data.
	  const unsigned IndexElts = Index.getSimpleValueType().getVectorNumElements();
	  const unsigned DataElts = Src.getSimpleValueType().getVectorNumElements();
	  MVT MaskVT = MVT::getVectorVT(MVT::i1, std::min(IndexElts, DataElts));

	  // The intrinsics come in a scalar-mask and a vXi1-mask flavour; normalize
	  // to vXi1 when the incoming mask has any other type.
	  if (Mask.getValueType() != MaskVT)
	    Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, Loc);

	  MemIntrinsicSDNode *MemNode = cast<MemIntrinsicSDNode>(Op);

	  // Operand order: chain, data, mask, base, index, scale. The only result is
	  // the output chain.
	  SDValue ScatterOps[] = {Chain, Src, Mask, Base, Index, Scale};
	  return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, Loc,
	                                 DAG.getVTList(MVT::Other), ScatterOps,
	                                 MemNode->getMemoryVT(),
	                                 MemNode->getMemOperand());
	}

	/// Build the machine node \p Opc for a gather/scatter prefetch intrinsic.
	///
	/// The mask is converted by getMaskNode() to a vXi1 vector with as many
	/// elements as \p Index. The displacement operand is the constant 0 and the
	/// segment operand is register 0.
	///
	/// \returns value 0 of the new machine node (its only result type is
	/// MVT::Other), or an empty SDValue when \p ScaleOp is not a ConstantSDNode.
	static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                               SDValue Mask, SDValue Base, SDValue Index,
	                               SDValue ScaleOp, SDValue Chain,
	                               const X86Subtarget &Subtarget) {
	  // Only a constant scale can be encoded; bail out otherwise.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (ScaleC == nullptr)
	    return SDValue();

	  SDLoc Loc(Op);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDValue Scale = DAG.getTargetConstant(ScaleC->getZExtValue(), Loc,
	                                        TLI.getPointerTy(DAG.getDataLayout()));
	  SDValue Disp = DAG.getTargetConstant(0, Loc, MVT::i32);
	  SDValue Segment = DAG.getRegister(0, MVT::i32);

	  const unsigned NumIndexElts =
	      Index.getSimpleValueType().getVectorNumElements();
	  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumIndexElts);
	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, Loc);

	  // Operand order: mask, then base/scale/index/disp/segment, then the chain.
	  SDValue MachineOps[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
	  SDNode *Prefetch = DAG.getMachineNode(Opc, Loc, MVT::Other, MachineOps);
	  return SDValue(Prefetch, 0);
	}

	/// Expands a chained builtin intrinsic whose machine instruction returns its
	/// 64-bit value in registers EDX:EAX.
	///
	/// If \p SrcReg is a valid (non-zero) register identifier, operand 2 of \p N
	/// is first copied into \p SrcReg; the assumption is that \p SrcReg is an
	/// implicit input of \p TargetOpcode.
	///
	/// On return \p Results has been extended with the merged i64 value followed
	/// by the output chain.
	///
	/// \returns a Glue value which can be used to add an extra copy-from-reg if
	/// the expanded intrinsic implicitly defines further registers (i.e. not just
	/// EDX:EAX).
	static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
	                                           SelectionDAG &DAG,
	                                           unsigned TargetOpcode,
	                                           unsigned SrcReg,
	                                           const X86Subtarget &Subtarget,
	                                           SmallVectorImpl<SDValue> &Results) {
	  SDValue Chain = N->getOperand(0);
	  SDValue Glue;

	  // Feed the implicit input register, if there is one, and keep the glue of
	  // that copy so it can be attached to the instruction.
	  if (SrcReg != 0) {
	    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
	    Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
	    Glue = Chain.getValue(1);
	  }

	  // The instruction takes {chain} or {chain, glue}, depending on whether the
	  // copy above was emitted.
	  SDValue InstrOps[] = {Chain, Glue};
	  const unsigned NumInstrOps = Glue.getNode() ? 2 : 1;
	  SDNode *Instr =
	      DAG.getMachineNode(TargetOpcode, DL, DAG.getVTList(MVT::Other, MVT::Glue),
	                         ArrayRef<SDValue>(InstrOps, NumInstrOps));
	  Chain = SDValue(Instr, 0);

	  // Read both result halves back: RAX/RDX as i64 on 64-bit subtargets,
	  // EAX/EDX as i32 otherwise. The copies are glued to the instruction and to
	  // each other.
	  const bool Is64Bit = Subtarget.is64Bit();
	  const MVT RegVT = Is64Bit ? MVT::i64 : MVT::i32;
	  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
	  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;

	  SDValue LO = DAG.getCopyFromReg(Chain, DL, LoReg, RegVT, SDValue(Instr, 1));
	  SDValue HI =
	      DAG.getCopyFromReg(LO.getValue(1), DL, HiReg, RegVT, LO.getValue(2));
	  Chain = HI.getValue(1);
	  Glue = HI.getValue(2);

	  SDValue Merged;
	  if (Is64Bit) {
	    // Merge the two halves into one i64: LO | (HI << 32).
	    SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i8);
	    SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, ShiftAmt);
	    Merged = DAG.getNode(ISD::OR, DL, MVT::i64, LO, HiShifted);
	  } else {
	    // Use a BUILD_PAIR to merge the two 32-bit values into a 64-bit one.
	    SDValue Halves[] = {LO, HI};
	    Merged = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
	  }

	  Results.push_back(Merged);
	  Results.push_back(Chain);
	  return Glue;
	}

	/// Lowers the builtin intrinsics that read the time stamp counter (x86_rdtsc
	/// and x86_rdtscp). Also used to custom lower READCYCLECOUNTER nodes.
	///
	/// \p Results receives {counter value, chain}; for X86::RDTSCP it receives
	/// {counter value, ECX value, chain} instead.
	static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
	                                    SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget,
	                                    SmallVectorImpl<SDValue> &Results) {
	  // The processor's time-stamp counter (a 64-bit MSR) is returned in EDX:EAX:
	  // EDX holds the high-order 32 bits and EAX the low-order 32 bits. No input
	  // register is needed, hence the 0 passed for the source register.
	  SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
	                                             /* NoRegister */ 0, Subtarget,
	                                             Results);

	  if (Opcode == X86::RDTSCP) {
	    // RDTSCP additionally loads IA32_TSC_AUX (MSR address C000_0103H) into
	    // ECX. Read ECX explicitly, glued to the previous copies, and slot it in
	    // ahead of the (new) chain.
	    SDValue PrevChain = Results[1];
	    SDValue AuxCopy =
	        DAG.getCopyFromReg(PrevChain, DL, X86::ECX, MVT::i32, Glue);
	    Results[1] = AuxCopy;
	    Results.push_back(AuxCopy.getValue(1));
	  }
	}

	/// Custom lower READCYCLECOUNTER by expanding it as an X86::RDTSC read and
	/// merging the values that expansion produces.
	static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc Loc(Op);
	  SmallVector<SDValue, 3> Parts;
	  getReadTimeStampCounter(Op.getNode(), Loc, X86::RDTSC, DAG, Subtarget, Parts);
	  return DAG.getMergeValues(Parts, Loc);
	}

	/// Record the frame index passed to llvm.x86.seh.ehregnode in the function's
	/// WinEHFuncInfo (EHRegNodeFrameIndex).
	///
	/// Reports a fatal error if the function has no WinEHFuncInfo or if operand 2
	/// of \p Op is not a FrameIndexSDNode.
	///
	/// \returns the incoming chain (operand 0); no DAG nodes are created.
	static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
	  WinEHFuncInfo *WinEH = DAG.getMachineFunction().getWinEHFuncInfo();
	  if (WinEH == nullptr)
	    report_fatal_error("EH registrations only live in functions using WinEH");

	  // The operand has to be a frame index, i.e. a static alloca.
	  auto *Slot = dyn_cast<FrameIndexSDNode>(Op.getOperand(2));
	  if (Slot == nullptr)
	    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");

	  WinEH->EHRegNodeFrameIndex = Slot->getIndex();

	  // Nothing to emit: hand the chain operand straight back.
	  return Op.getOperand(0);
	}

	/// Record the frame index passed to llvm.x86.seh.ehguard in the function's
	/// WinEHFuncInfo (EHGuardFrameIndex).
	///
	/// Reports a fatal error if the function has no WinEHFuncInfo or if operand 2
	/// of \p Op is not a FrameIndexSDNode.
	///
	/// \returns the incoming chain (operand 0); no DAG nodes are created.
	static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
	  WinEHFuncInfo *WinEH = DAG.getMachineFunction().getWinEHFuncInfo();
	  if (WinEH == nullptr)
	    report_fatal_error("EHGuard only live in functions using WinEH");

	  // The operand has to be a frame index, i.e. a static alloca.
	  auto *Slot = dyn_cast<FrameIndexSDNode>(Op.getOperand(2));
	  if (Slot == nullptr)
	    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");

	  WinEH->EHGuardFrameIndex = Slot->getIndex();

	  // Nothing to emit: hand the chain operand straight back.
	  return Op.getOperand(0);
	}

	/// Emit an unmasked truncating store with saturation: X86ISD::VTRUNCSTORES
	/// when \p SignedSat is true, X86ISD::VTRUNCSTOREUS otherwise.
	///
	/// The node's operands are {Chain, Val, Ptr, undef}, where the undef has the
	/// value type of \p Ptr. Its only result is the output chain.
	static SDValue
	EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
	                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
	                SelectionDAG &DAG) {
	  SDVTList ChainOnly = DAG.getVTList(MVT::Other);
	  SDValue StoreOps[] = {Chain, Val, Ptr, DAG.getUNDEF(Ptr.getValueType())};

	  unsigned Opc = X86ISD::VTRUNCSTOREUS;
	  if (SignedSat)
	    Opc = X86ISD::VTRUNCSTORES;

	  return DAG.getMemIntrinsicNode(Opc, Dl, ChainOnly, StoreOps, MemVT, MMO);
	}

	/// Emit a masked truncating store with saturation: X86ISD::VMTRUNCSTORES when
	/// \p SignedSat is true, X86ISD::VMTRUNCSTOREUS otherwise.
	///
	/// The node's operands are {Chain, Val, Ptr, Mask}. Its only result is the
	/// output chain.
	static SDValue
	EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
	                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
	                      MachineMemOperand *MMO, SelectionDAG &DAG) {
	  SDVTList ChainOnly = DAG.getVTList(MVT::Other);
	  SDValue StoreOps[] = {Chain, Val, Ptr, Mask};

	  unsigned Opc = X86ISD::VMTRUNCSTOREUS;
	  if (SignedSat)
	    Opc = X86ISD::VMTRUNCSTORES;

	  return DAG.getMemIntrinsicNode(Opc, Dl, ChainOnly, StoreOps, MemVT, MMO);
	}

	/// Custom lower an intrinsic-with-chain node \p Op.
	///
	/// Operand 0 of \p Op is the chain and operand 1 is the intrinsic ID; the
	/// intrinsic's own arguments start at operand 2.
	///
	/// If getIntrinsicWithChain() has no table entry for the ID, a small set of
	/// intrinsics is handled directly by the first switch and an empty SDValue is
	/// returned for every other ID. Otherwise the lowering is selected by the
	/// table entry's Type field.
	static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	unsigned IntNo = Op.getConstantOperandVal(1);
	const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
	if (!IntrData) {
	// Intrinsics without a table entry.
	switch (IntNo) {
	case llvm::Intrinsic::x86_seh_ehregnode:
	return MarkEHRegistrationNode(Op, DAG);
	case llvm::Intrinsic::x86_seh_ehguard:
	return MarkEHGuard(Op, DAG);
	case llvm::Intrinsic::x86_rdpkru: {
	SDLoc dl(Op);
	SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
	// Create a RDPKRU node and pass 0 to the ECX parameter.
	return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
	DAG.getConstant(0, dl, MVT::i32));
	}
	case llvm::Intrinsic::x86_wrpkru: {
	SDLoc dl(Op);
	// Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
	// to the EDX and ECX parameters.
	return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
	Op.getOperand(0), Op.getOperand(2),
	DAG.getConstant(0, dl, MVT::i32),
	DAG.getConstant(0, dl, MVT::i32));
	}
	case llvm::Intrinsic::x86_flags_read_u32:
	case llvm::Intrinsic::x86_flags_read_u64:
	case llvm::Intrinsic::x86_flags_write_u32:
	case llvm::Intrinsic::x86_flags_write_u64: {
	// We need a frame pointer because this will get lowered to a PUSH/POP
	// sequence.
	MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	MFI.setHasCopyImplyingStackAdjustment(true);
	// Don't do anything here, we will expand these intrinsics out later
	// during FinalizeISel in EmitInstrWithCustomInserter.
	return Op;
	}
	case Intrinsic::x86_lwpins32:
	case Intrinsic::x86_lwpins64:
	case Intrinsic::x86_umwait:
	case Intrinsic::x86_tpause: {
	// These take three arguments (operands 2..4). The node's first result
	// feeds a COND_B setcc, which becomes the intrinsic's value.
	SDLoc dl(Op);
	SDValue Chain = Op->getOperand(0);
	SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
	unsigned Opcode;

	switch (IntNo) {
	default: llvm_unreachable("Impossible intrinsic");
	case Intrinsic::x86_umwait:
	Opcode = X86ISD::UMWAIT;
	break;
	case Intrinsic::x86_tpause:
	Opcode = X86ISD::TPAUSE;
	break;
	case Intrinsic::x86_lwpins32:
	case Intrinsic::x86_lwpins64:
	Opcode = X86ISD::LWPINS;
	break;
	}

	SDValue Operation =
	DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
	Op->getOperand(3), Op->getOperand(4));
	SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
	return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
	Operation.getValue(1));
	}
	case Intrinsic::x86_enqcmd:
	case Intrinsic::x86_enqcmds: {
	// These take two arguments (operands 2..3). The node's first result feeds
	// a COND_E setcc, which becomes the intrinsic's value.
	SDLoc dl(Op);
	SDValue Chain = Op.getOperand(0);
	SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
	unsigned Opcode;
	switch (IntNo) {
	default: llvm_unreachable("Impossible intrinsic!");
	case Intrinsic::x86_enqcmd:
	Opcode = X86ISD::ENQCMD;
	break;
	case Intrinsic::x86_enqcmds:
	Opcode = X86ISD::ENQCMDS;
	break;
	}
	SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
	Op.getOperand(3));
	SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
	return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
	Operation.getValue(1));
	}
	}
	// No table entry and not handled above: no custom lowering.
	return SDValue();
	}

	// Table-driven lowering, selected by the entry's Type.
	SDLoc dl(Op);
	switch(IntrData->Type) {
	default: llvm_unreachable("Unknown Intrinsic Type");
	case RDSEED:
	case RDRAND: {
	// Emit the node with the right value type.
	SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
	SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

	// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
	// Otherwise return the value from Rand, which is always 0, casted to i32.
	SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
	DAG.getConstant(1, dl, Op->getValueType(1)),
	DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
	SDValue(Result.getNode(), 1)};
	SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);

	// Return { result, isValid, chain }.
	return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
	SDValue(Result.getNode(), 2));
	}
	case GATHER_AVX2: {
	// Operands after the intrinsic ID: src, base, index, mask, scale.
	SDValue Chain = Op.getOperand(0);
	SDValue Src = Op.getOperand(2);
	SDValue Base = Op.getOperand(3);
	SDValue Index = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	SDValue Scale = Op.getOperand(6);
	return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
	Scale, Chain, Subtarget);
	}
	case GATHER: {
	// Operands after the intrinsic ID: gather(v1, base, index, mask, scale).
	SDValue Chain = Op.getOperand(0);
	SDValue Src = Op.getOperand(2);
	SDValue Base = Op.getOperand(3);
	SDValue Index = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	SDValue Scale = Op.getOperand(6);
	return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
	Chain, Subtarget);
	}
	case SCATTER: {
	// Operands after the intrinsic ID: scatter(base, mask, index, v1, scale).
	SDValue Chain = Op.getOperand(0);
	SDValue Base = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	SDValue Index = Op.getOperand(4);
	SDValue Src = Op.getOperand(5);
	SDValue Scale = Op.getOperand(6);
	return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
	Scale, Chain, Subtarget);
	}
	case PREFETCH: {
	// Operands after the intrinsic ID: mask, index, base, scale, hint.
	// Hint 2 selects Opc1; hint 3 selects Opc0.
	const APInt &HintVal = Op.getConstantOperandAPInt(6);
	assert((HintVal == 2 \|\| HintVal == 3) &&
	"Wrong prefetch hint in intrinsic: should be 2 or 3");
	unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
	SDValue Chain = Op.getOperand(0);
	SDValue Mask = Op.getOperand(2);
	SDValue Index = Op.getOperand(3);
	SDValue Base = Op.getOperand(4);
	SDValue Scale = Op.getOperand(5);
	return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
	Subtarget);
	}
	// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
	case RDTSC: {
	SmallVector<SDValue, 2> Results;
	getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
	Results);
	return DAG.getMergeValues(Results, dl);
	}
	// Read Performance Monitoring Counters.
	case RDPMC:
	// GetExtended Control Register.
	case XGETBV: {
	SmallVector<SDValue, 2> Results;

	// RDPMC uses ECX to select the index of the performance counter to read.
	// XGETBV uses ECX to select the index of the XCR register to return.
	// The result is stored into registers EDX:EAX.
	expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
	Subtarget, Results);
	return DAG.getMergeValues(Results, dl);
	}
	// XTEST intrinsics.
	case XTEST: {
	SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
	SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

	// The intrinsic's value is the COND_NE setcc of the node's result,
	// zero-extended to the intrinsic's result type.
	SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
	SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
	return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
	Ret, SDValue(InTrans.getNode(), 1));
	}
	case TRUNCATE_TO_MEM_VI8:
	case TRUNCATE_TO_MEM_VI16:
	case TRUNCATE_TO_MEM_VI32: {
	// Operands after the intrinsic ID: address, data to truncate, mask.
	SDValue Mask = Op.getOperand(4);
	SDValue DataToTruncate = Op.getOperand(3);
	SDValue Addr = Op.getOperand(2);
	SDValue Chain = Op.getOperand(0);

	MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
	assert(MemIntr && "Expected MemIntrinsicSDNode!");

	EVT MemVT = MemIntr->getMemoryVT();

	// Opc0 of the table entry names the kind of truncation (plain, unsigned
	// saturating or signed saturating).
	uint16_t TruncationOp = IntrData->Opc0;
	switch (TruncationOp) {
	case X86ISD::VTRUNC: {
	if (isAllOnesConstant(Mask)) // return just a truncate store
	return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
	MemIntr->getMemOperand());

	// Otherwise emit a truncating masked store with a vXi1 mask that has one
	// bit per stored element.
	MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
	SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
	SDValue Offset = DAG.getUNDEF(VMask.getValueType());

	return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
	MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
	true /* truncating */);
	}
	case X86ISD::VTRUNCUS:
	case X86ISD::VTRUNCS: {
	bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
	// An all-ones mask needs no masking: use the unmasked saturating store.
	if (isAllOnesConstant(Mask))
	return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
	MemIntr->getMemOperand(), DAG);

	MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
	SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
	VMask, MemVT, MemIntr->getMemOperand(), DAG);
	}
	default:
	llvm_unreachable("Unsupported truncstore intrinsic");
	}
	}
	}
	}

	/// Lower RETURNADDR. Operand 0 of \p Op is the constant frame depth.
	///
	/// Marks the return address as taken. Returns an empty SDValue when
	/// verifyReturnAddressArgumentIsConstant() returns true. For depth 0 the
	/// value is loaded from the return-address frame index; for a non-zero depth
	/// it is loaded from LowerFRAMEADDR(Op) plus one slot size.
	SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);

	  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
	    return SDValue();

	  SDLoc Loc(Op);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  const unsigned Depth = Op.getConstantOperandVal(0);

	  if (Depth == 0) {
	    // Current frame: just load the return address from its frame index.
	    SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
	    return DAG.getLoad(PtrVT, Loc, DAG.getEntryNode(), RetAddrFI,
	                       MachinePointerInfo());
	  }

	  // Outer frame: the load address is the lowered frame address plus one
	  // slot size.
	  SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  SDValue SlotOffset = DAG.getConstant(RegInfo->getSlotSize(), Loc, PtrVT);
	  SDValue RetAddrPtr =
	      DAG.getNode(ISD::ADD, Loc, PtrVT, FrameAddr, SlotOffset);
	  return DAG.getLoad(PtrVT, Loc, DAG.getEntryNode(), RetAddrPtr,
	                     MachinePointerInfo());
	}

	/// Lower ADDROFRETURNADDR: mark the return address as taken and return the
	/// frame index of the return-address slot.
	SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  MFI.setReturnAddressIsTaken(true);
	  return getReturnAddressFrameIndex(DAG);
	}

	/// Lower FRAMEADDR. Operand 0 of \p Op is the constant frame depth.
	///
	/// Marks the frame address as taken. On targets that use Windows CFI the
	/// depth is ignored and a frame index (created on first use and cached in
	/// X86MachineFunctionInfo) is returned. Otherwise the pointer-sized frame
	/// register is read and dereferenced once per level of depth.
	SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	EVT VT = Op.getValueType();

	MFI.setFrameAddressIsTaken(true);

	if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
	// Depth > 0 makes no sense on targets which use Windows unwind codes. It
	// is not possible to crawl up the stack without looking at the unwind codes
	// simultaneously.
	int FrameAddrIndex = FuncInfo->getFAIndex();
	// A zero index is treated as "not created yet".
	if (!FrameAddrIndex) {
	// Create the slot-sized fixed frame object used as the frame address and
	// remember its index for later calls.
	unsigned SlotSize = RegInfo->getSlotSize();
	FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
	SlotSize, /SPOffset=/0, /IsImmutable=/false);
	FuncInfo->setFAIndex(FrameAddrIndex);
	}
	return DAG.getFrameIndex(FrameAddrIndex, VT);
	}

	unsigned FrameReg =
	RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
	SDLoc dl(Op); // FIXME probably not meaningful
	unsigned Depth = Op.getConstantOperandVal(0);
	assert(((FrameReg == X86::RBP && VT == MVT::i64) \|\|
	(FrameReg == X86::EBP && VT == MVT::i32)) &&
	"Invalid Frame Register!");
	// Start from the current frame register and load through it Depth times.
	SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
	while (Depth--)
	FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
	MachinePointerInfo());
	return FrameAddr;
	}

	/// Map a register name to its X86 register.
	///
	/// Only "esp", "rsp", "ebp" and "rbp" are recognized. Any other name reaches
	/// the final report_fatal_error. For "ebp"/"rbp" a fatal error is also
	/// reported when the function has no frame pointer. \p VT is not referenced
	/// by this function.
	// FIXME? Maybe this could be a TableGen attribute on some registers and
	// this table could be generated automatically from RegInfo.
	Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
	const MachineFunction &MF) const {
	const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

	// Unknown names map to 0, which is rejected at the end of the function.
	Register Reg = StringSwitch<unsigned>(RegName)
	.Case("esp", X86::ESP)
	.Case("rsp", X86::RSP)
	.Case("ebp", X86::EBP)
	.Case("rbp", X86::RBP)
	.Default(0);

	if (Reg == X86::EBP \|\| Reg == X86::RBP) {
	if (!TFI.hasFP(MF))
	report_fatal_error("register " + StringRef(RegName) +
	" is allocatable: function has no frame pointer");
	#ifndef NDEBUG
	// Assert-enabled builds only: check that the function's frame register is
	// EBP or RBP.
	else {
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
	assert((FrameReg == X86::EBP \|\| FrameReg == X86::RBP) &&
	"Invalid Frame Register!");
	}
	#endif
	}

	if (Reg)
	return Reg;

	report_fatal_error("Invalid register name global variable");
	}

	/// Lower FRAME_TO_ARGS_OFFSET to a pointer-sized constant equal to twice the
	/// register-info slot size.
	SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  const unsigned SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
	  return DAG.getIntPtrConstant(2 * SlotSize, Loc);
	}

	/// Register that holds the exception pointer for \p PersonalityFn.
	///
	/// RDX/EDX for the CoreCLR personality, RAX/EAX for every other personality.
	/// The 64-bit register is used when the subtarget is 64-bit LP64.
	Register X86TargetLowering::getExceptionPointerRegister(
	    const Constant *PersonalityFn) const {
	  const bool IsCoreCLR =
	      classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR;

	  if (Subtarget.isTarget64BitLP64())
	    return IsCoreCLR ? X86::RDX : X86::RAX;

	  return IsCoreCLR ? X86::EDX : X86::EAX;
	}

	/// Register that holds the exception selector: RDX when the subtarget is
	/// 64-bit LP64, EDX otherwise.
	///
	/// Must not be called for funclet personalities, which don't use selectors
	/// (the runtime does the selection); this is asserted.
	Register X86TargetLowering::getExceptionSelectorRegister(
	    const Constant *PersonalityFn) const {
	  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));

	  if (Subtarget.isTarget64BitLP64())
	    return X86::RDX;
	  return X86::EDX;
	}

	/// Returns true exactly when the subtarget reports a Win64 target.
	bool X86TargetLowering::needsFixedCatchObjects() const {
	  const bool IsWin64 = Subtarget.isTargetWin64();
	  return IsWin64;
	}

	/// Lower EH_RETURN. Operands of \p Op: 0 = chain, 1 = offset, 2 = handler.
	///
	/// Stores the handler at (frame register + slot size + offset), copies that
	/// address into RCX/ECX, and emits an X86ISD::EH_RETURN node that takes the
	/// register as its operand.
	SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
	SDValue Chain = Op.getOperand(0);
	SDValue Offset = Op.getOperand(1);
	SDValue Handler = Op.getOperand(2);
	SDLoc dl (Op);

	EVT PtrVT = getPointerTy(DAG.getDataLayout());
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
	assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) \|\|
	(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
	"Invalid Frame Register!");
	SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
	// The store address is handed to EH_RETURN in RCX for 64-bit pointers and
	// in ECX otherwise.
	Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

	// StoreAddr = Frame + SlotSize + Offset.
	SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
	DAG.getIntPtrConstant(RegInfo->getSlotSize(),
	dl));
	StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
	Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
	Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

	return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
	DAG.getRegister(StoreAddrReg, PtrVT));
	}

	/// Lower EH_SJLJ_SETJMP to X86ISD::EH_SJLJ_SETJMP, forwarding operands 0 and 1
	/// of \p Op. The node produces an i32 value and a chain.
	SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  // On non-64-bit subtargets the global base register may be needed once the
	  // pseudo is expanded after isel, i.e. after the CGBR pass has run. Request
	  // it now so that the pass inserts its definition if required; otherwise we
	  // could end up referencing a virtual register that is never defined.
	  if (!Subtarget.is64Bit())
	    (void)Subtarget.getInstrInfo()->getGlobalBaseReg(&DAG.getMachineFunction());

	  SDLoc Loc(Op);
	  SDVTList ResultVTs = DAG.getVTList(MVT::i32, MVT::Other);
	  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, Loc, ResultVTs, Op.getOperand(0),
	                     Op.getOperand(1));
	}

	/// Lower EH_SJLJ_LONGJMP to X86ISD::EH_SJLJ_LONGJMP, forwarding operands 0 and
	/// 1 of \p Op. The node's only result is a chain.
	SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  SDValue Chain = Op.getOperand(0);
	  SDValue Buf = Op.getOperand(1);
	  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, Loc, MVT::Other, Chain, Buf);
	}

	/// Lower EH_SJLJ_SETUP_DISPATCH to X86ISD::EH_SJLJ_SETUP_DISPATCH, forwarding
	/// operand 0 of \p Op. The node's only result is a chain.
	SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
	                                                       SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  SDValue Chain = Op.getOperand(0);
	  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, Loc, MVT::Other, Chain);
	}

	/// Lower ADJUST_TRAMPOLINE: no adjustment is made, operand 0 of \p Op is
	/// returned unchanged.
	static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
	  SDValue Unadjusted = Op.getOperand(0);
	  return Unadjusted;
	}

	/// Lower INIT_TRAMPOLINE by emitting the stores that write the trampoline's
	/// machine code into the trampoline buffer.
	///
	/// Operands of \p Op: 0 = chain, 1 = trampoline address, 2 = nested function,
	/// 3 = 'nest' parameter value, 4 = SrcValue for the trampoline memory. The
	/// 32-bit path additionally reads operand 5 as a SrcValue wrapping the nested
	/// Function.
	///
	/// \returns a TokenFactor joining all emitted stores. Every store uses the
	/// incoming chain, so the stores are not ordered relative to each other.
	SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
	SelectionDAG &DAG) const {
	SDValue Root = Op.getOperand(0);
	SDValue Trmp = Op.getOperand(1); // trampoline
	SDValue FPtr = Op.getOperand(2); // nested function
	SDValue Nest = Op.getOperand(3); // 'nest' parameter value
	SDLoc dl (Op);

	const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
	const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

	if (Subtarget.is64Bit()) {
	// Byte offsets written below:
	//   0: movabsq opcode for r11   2: FPtr
	//  10: movabsq opcode for r10  12: Nest
	//  20: jmpq opcode             22: ModRM byte selecting r11
	SDValue OutChains[6];

	// Large code-model.
	const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
	const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

	// Low three bits of the register encodings.
	const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
	const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

	const unsigned char REX_WB = 0x40 \| 0x08 \| 0x01; // REX prefix

	// Load the pointer to the nested function into R11.
	// The i16 constant puts REX_WB in the low byte and the opcode byte in the
	// high byte.
	unsigned OpCode = ((MOV64ri \| N86R11) << 8) \| REX_WB; // movabsq r11
	SDValue Addr = Trmp;
	OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	Addr, MachinePointerInfo(TrmpAddr));

	Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	DAG.getConstant(2, dl, MVT::i64));
	OutChains[1] =
	DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
	/* Alignment = */ 2);

	// Load the 'nest' parameter value into R10.
	// R10 is specified in X86CallingConv.td
	OpCode = ((MOV64ri \| N86R10) << 8) \| REX_WB; // movabsq r10
	Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	DAG.getConstant(10, dl, MVT::i64));
	OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	Addr, MachinePointerInfo(TrmpAddr, 10));

	Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	DAG.getConstant(12, dl, MVT::i64));
	OutChains[3] =
	DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
	/* Alignment = */ 2);

	// Jump to the nested function.
	OpCode = (JMP64r << 8) \| REX_WB; // jmpq *...
	Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	DAG.getConstant(20, dl, MVT::i64));
	OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	Addr, MachinePointerInfo(TrmpAddr, 20));

	unsigned char ModRM = N86R11 \| (4 << 3) \| (3 << 6); // ...r11
	Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	DAG.getConstant(22, dl, MVT::i64));
	OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
	Addr, MachinePointerInfo(TrmpAddr, 22));

	return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	} else {
	// 32-bit path. Byte offsets written below:
	//   0: mov opcode for NestReg   1: Nest
	//   5: jmp opcode               6: displacement to FPtr
	const Function *Func =
	cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
	CallingConv::ID CC = Func->getCallingConv();
	unsigned NestReg;

	// The register that carries 'nest' depends on the calling convention.
	switch (CC) {
	default:
	llvm_unreachable("Unsupported calling convention");
	case CallingConv::C:
	case CallingConv::X86_StdCall: {
	// Pass 'nest' parameter in ECX.
	// Must be kept in sync with X86CallingConv.td
	NestReg = X86::ECX;

	// Check that ECX wasn't needed by an 'inreg' parameter.
	FunctionType *FTy = Func->getFunctionType();
	const AttributeList &Attrs = Func->getAttributes();

	if (!Attrs.isEmpty() && !Func->isVarArg()) {
	unsigned InRegCount = 0;
	// Attribute indices passed to hasAttribute() start at 1 for the first
	// parameter.
	unsigned Idx = 1;

	for (FunctionType::param_iterator I = FTy->param_begin(),
	E = FTy->param_end(); I != E; ++I, ++Idx)
	if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
	auto &DL = DAG.getDataLayout();
	// FIXME: should only count parameters that are lowered to integers.
	// Count the 32-bit units each inreg parameter occupies, rounding up.
	InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
	}

	if (InRegCount > 2) {
	report_fatal_error("Nest register in use - reduce number of inreg"
	" parameters!");
	}
	}
	break;
	}
	case CallingConv::X86_FastCall:
	case CallingConv::X86_ThisCall:
	case CallingConv::Fast:
	case CallingConv::Tail:
	// Pass 'nest' parameter in EAX.
	// Must be kept in sync with X86CallingConv.td
	NestReg = X86::EAX;
	break;
	}

	SDValue OutChains[4];
	SDValue Addr, Disp;

	// Displacement of the jump, computed as FPtr - (Trmp + 10).
	Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	DAG.getConstant(10, dl, MVT::i32));
	Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

	// This is storing the opcode for MOV32ri.
	const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
	const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
	OutChains[0] =
	DAG.getStore(Root, dl, DAG.getConstant(MOV32ri \| N86Reg, dl, MVT::i8),
	Trmp, MachinePointerInfo(TrmpAddr));

	Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	DAG.getConstant(1, dl, MVT::i32));
	OutChains[1] =
	DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
	/* Alignment = */ 1);

	const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
	Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	DAG.getConstant(5, dl, MVT::i32));
	OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
	Addr, MachinePointerInfo(TrmpAddr, 5),
	/* Alignment = */ 1);

	Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	DAG.getConstant(6, dl, MVT::i32));
	OutChains[3] =
	DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
	/* Alignment = */ 1);

	return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	}
	}

	/// Lower FLT_ROUNDS_ by storing the FP control word to a stack slot, loading
	/// it back, and translating its rounding-control field with a packed lookup
	/// table.
	///
	/// \returns the merged {rounding mode, chain} pair; the rounding mode is
	/// zero-extended or truncated to the result type of \p Op.
	SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  /*
	    The rounding mode lives in bits 11:10 of the FP control word:
	      00 Round to nearest
	      01 Round to -inf
	      10 Round to +inf
	      11 Round to 0

	    FLT_ROUNDS, on the other hand, expects:
	      -1 Undefined
	       0 Round to 0
	       1 Round to nearest
	       2 Round to +inf
	       3 Round to -inf

	    The conversion uses a packed lookup table of four 2-bit entries, indexed
	    by CW[11:10]:
	      0x2d --> (0b00,10,11,01), i.e. entry 0 = 1, entry 1 = 3, entry 2 = 2,
	      entry 3 = 0.

	    Shifting the masked field right by 9 (rather than 10) yields twice the
	    field value, which is the bit offset of the 2-bit entry:
	      (0x2d >> ((CW & 0xc00) >> 9)) & 3
	  */

	  MachineFunction &MF = DAG.getMachineFunction();
	  SDLoc Loc(Op);
	  const MVT ResultVT = Op.getSimpleValueType();

	  // Reserve a 2-byte, 2-byte-aligned stack slot for the control word.
	  const int CWSlotFI =
	      MF.getFrameInfo().CreateStackObject(2, Align(2), false);
	  SDValue CWSlotAddr =
	      DAG.getFrameIndex(CWSlotFI, getPointerTy(DAG.getDataLayout()));
	  MachinePointerInfo CWSlotInfo =
	      MachinePointerInfo::getFixedStack(MF, CWSlotFI);

	  // Save the FP control word to the stack slot.
	  SDValue StoreOps[] = {Op.getOperand(0), CWSlotAddr};
	  SDValue Chain = DAG.getMemIntrinsicNode(
	      X86ISD::FNSTCW16m, Loc, DAG.getVTList(MVT::Other), StoreOps, MVT::i16,
	      CWSlotInfo, Align(2), MachineMemOperand::MOStore);

	  // Load the FP control word back from the stack slot.
	  SDValue CtrlWord =
	      DAG.getLoad(MVT::i16, Loc, Chain, CWSlotAddr, CWSlotInfo, Align(2));
	  Chain = CtrlWord.getValue(1);

	  // Isolate the rounding-control bits and turn them into the table shift.
	  SDValue RCBits = DAG.getNode(ISD::AND, Loc, MVT::i16, CtrlWord,
	                               DAG.getConstant(0xc00, Loc, MVT::i16));
	  SDValue ShiftAmt = DAG.getNode(ISD::SRL, Loc, MVT::i16, RCBits,
	                                 DAG.getConstant(9, Loc, MVT::i8));
	  ShiftAmt = DAG.getNode(ISD::TRUNCATE, Loc, MVT::i8, ShiftAmt);

	  // Look the mode up: (0x2d >> shift) & 3.
	  SDValue Table = DAG.getConstant(0x2d, Loc, MVT::i32);
	  SDValue Shifted = DAG.getNode(ISD::SRL, Loc, MVT::i32, Table, ShiftAmt);
	  SDValue Mode = DAG.getNode(ISD::AND, Loc, MVT::i32, Shifted,
	                             DAG.getConstant(3, Loc, MVT::i32));

	  Mode = DAG.getZExtOrTrunc(Mode, Loc, ResultVT);

	  return DAG.getMergeValues({Mode, Chain}, Loc);
	}

	/// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
	///
	/// i8/i16 vectors are implemented with the dword LZCNT vector instruction
	/// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, the
	/// vector is split, the operation is performed on its Lo and Hi parts, and
	/// the results are concatenated.
	///
	/// \p Op must be an ISD::CTLZ node whose element type is i8 or i16; both are
	/// asserted.
	static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	assert(Op.getOpcode() == ISD::CTLZ);
	SDLoc dl(Op);
	MVT VT = Op.getSimpleValueType();
	MVT EltVT = VT.getVectorElementType();
	unsigned NumElems = VT.getVectorNumElements();

	assert((EltVT == MVT::i8 \|\| EltVT == MVT::i16) &&
	"Unsupported element type");

	// Split the vector; its Lo and Hi parts will be handled in the next
	// iteration.
	if (NumElems > 16 \|\|
	(NumElems == 16 && !Subtarget.canExtendTo512DQ()))
	return splitVectorIntUnary(Op, DAG);

	// Same element count, but with i32 elements.
	MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
	assert((NewVT.is256BitVector() \|\| NewVT.is512BitVector()) &&
	"Unsupported value type for operation");

	// Use native supported vector instruction vplzcntd.
	Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
	SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
	SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
	// The zero-extension added (32 - element width) leading zeros to every
	// element; subtract them again.
	SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

	return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
	}

	// Lower CTLZ using a PSHUFB lookup table implementation.
	//
	// The input is viewed as a vector of bytes. A 16-entry table gives the
	// leading-zero count of every nibble; the per-nibble counts are combined into
	// per-byte counts, and those are then repeatedly merged pairwise until the
	// element width of the original type VT is reached.
	static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  int NumElts = VT.getVectorNumElements();
	  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
	  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

	  // Per-nibble leading zero PSHUFB lookup table: LUT[n] is the number of
	  // leading zero bits in the 4-bit value n (4 for n == 0).
	  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
	                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
	                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
	                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};

	  // Replicate the 16-entry table across the whole byte vector.
	  SmallVector<SDValue, 64> LUTVec;
	  for (int i = 0; i < NumBytes; ++i)
	    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
	  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

	  // Begin by bitcasting the input to byte vector, then split those bytes
	  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
	  // If the hi input nibble is zero then we add both results together, otherwise
	  // we just take the hi result (by masking the lo result to zero before the
	  // add).
	  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
	  SDValue Zero = DAG.getConstant(0, DL, CurrVT);

	  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
	  SDValue Lo = Op0;
	  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
	  SDValue HiZ;
	  if (CurrVT.is512BitVector()) {
	    // 512-bit compares produce a vXi1 mask; sign extend it back to a byte
	    // vector of all-ones/all-zeros lanes.
	    MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
	    HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
	    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
	  } else {
	    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
	  }

	  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
	  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
	  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
	  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

	  // Merge result back from vXi8 back to VT, working on the lo/hi halves
	  // of the current vector width in the same way we did for the nibbles.
	  // If the upper half of the input element is zero then add the halves'
	  // leading zero counts together, otherwise just use the upper half's.
	  // Double the width of the result until we are at target width.
	  while (CurrVT != VT) {
	    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
	    int CurrNumElts = CurrVT.getVectorNumElements();
	    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
	    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
	    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

	    // Check if the upper half of the input element is zero.
	    if (CurrVT.is512BitVector()) {
	      MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
	      HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
	                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
	      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
	    } else {
	      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
	                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
	    }
	    HiZ = DAG.getBitcast(NextVT, HiZ);

	    // Move the upper/lower halves to the lower bits as we'll be extending to
	    // NextVT. Mask the lower result to zero if HiZ is true and add the results
	    // together.
	    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
	    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
	    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
	    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
	    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
	    CurrVT = NextVT;
	  }

	  return Res;
	}

	// Dispatch a vector CTLZ to one of the available lowering strategies:
	// the AVX512CD vplzcnt based lowering, splitting into narrower vectors, or
	// the PSHUFB lookup table.
	static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
	                               const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  if (Subtarget.hasCDI() &&
	      // vXi8 vectors need to be promoted to 512-bits for vXi32.
	      (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
	    return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return splitVectorIntUnary(Op, DAG);

	  // Decompose 512-bit ops into smaller 256-bit ops.
	  if (VT.is512BitVector() && !Subtarget.hasBWI())
	    return splitVectorIntUnary(Op, DAG);

	  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
	  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
	}

	// Custom lowering for count-leading-zeros. Vector types are forwarded to
	// LowerVectorCTLZ; scalars are implemented with BSR followed by an XOR that
	// converts the bit index into a leading-zero count.
	static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  MVT ResultVT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  if (ResultVT.isVector())
	    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

	  const unsigned Opcode = Op.getOpcode();
	  const unsigned BitWidth = ResultVT.getSizeInBits();
	  const bool IsByte = ResultVT == MVT::i8;

	  // There is no i8 bsr, so byte inputs are zero extended and scanned as i32.
	  MVT ScanVT = ResultVT;
	  SDValue Src = Op.getOperand(0);
	  if (IsByte) {
	    ScanVT = MVT::i32;
	    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, ScanVT, Src);
	  }

	  // bsr (scan bits in reverse) yields the bit index and also sets EFLAGS,
	  // which is modelled as the second (i32) result.
	  SDVTList ScanVTs = DAG.getVTList(ScanVT, MVT::i32);
	  SDValue Scan = DAG.getNode(X86ISD::BSR, dl, ScanVTs, Src);

	  if (Opcode == ISD::CTLZ) {
	    // A zero source makes bsr set ZF; in that case substitute
	    // 2 * BitWidth - 1, which the XOR below turns into BitWidth.
	    SDValue ZeroSrcValue =
	        DAG.getConstant(BitWidth + BitWidth - 1, dl, ScanVT);
	    SDValue CondE = DAG.getTargetConstant(X86::COND_E, dl, MVT::i8);
	    SDValue CMovOps[] = {Scan, ZeroSrcValue, CondE, Scan.getValue(1)};
	    Scan = DAG.getNode(X86ISD::CMOV, dl, ScanVT, CMovOps);
	  }

	  // Convert the bit index into a count by xor'ing with BitWidth - 1.
	  SDValue IndexMask = DAG.getConstant(BitWidth - 1, dl, ScanVT);
	  SDValue Count = DAG.getNode(ISD::XOR, dl, ScanVT, Scan, IndexMask);

	  if (!IsByte)
	    return Count;
	  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Count);
	}

	// Custom lowering for scalar count-trailing-zeros: BSF produces the index of
	// the lowest set bit, and a CMOV on ZF supplies the bit width for a zero
	// input.
	static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  const unsigned BitWidth = VT.getScalarSizeInBits();
	  SDValue Src = Op.getOperand(0);

	  assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
	         "Only scalar CTTZ requires custom lowering");

	  // bsf (scan bits forward) returns the index and also sets EFLAGS, which is
	  // modelled as the second (i32) result.
	  SDValue Scan = DAG.getNode(X86ISD::BSF, dl, DAG.getVTList(VT, MVT::i32), Src);

	  // A zero source makes bsf set ZF; select BitWidth in that case.
	  SDValue ZeroSrcValue = DAG.getConstant(BitWidth, dl, VT);
	  SDValue CondE = DAG.getTargetConstant(X86::COND_E, dl, MVT::i8);
	  SDValue CMovOps[] = {Scan, ZeroSrcValue, CondE, Scan.getValue(1)};
	  return DAG.getNode(X86ISD::CMOV, dl, VT, CMovOps);
	}

	// Custom lowering for integer ADD/SUB.
	//  - scalar i16/i32 are handed to lowerAddSubToHorizontalOp,
	//  - i1 (scalar or vector) becomes XOR,
	//  - everything else is split into two half-width operations.
	static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  MVT VT = Op.getSimpleValueType();
	  if (VT == MVT::i16 || VT == MVT::i32)
	    return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);

	  // Both add and sub on a single bit are the same as xor.
	  if (VT.getScalarType() == MVT::i1)
	    return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
	                       Op.getOperand(0), Op.getOperand(1));

	  // 512-bit byte/word vectors are split; they are checked first so that they
	  // do not trip the 256-bit assertion below.
	  if (VT == MVT::v32i16 || VT == MVT::v64i8)
	    return splitVectorIntBinary(Op, DAG);

	  assert(VT.is256BitVector() && VT.isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  return splitVectorIntBinary(Op, DAG);
	}

	// Custom lowering for saturating add/sub (UADDSAT/SADDSAT/USUBSAT/SSUBSAT).
	//  - i1 elements reduce to simple logic ops,
	//  - 128-bit vectors get a compare+select expansion for the unsigned forms
	//    when UMIN/UMAX is not legal, otherwise the default expansion is used
	//    (signalled by returning an empty SDValue),
	//  - wider vectors are split in half.
	static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  MVT VT = Op.getSimpleValueType();
	  SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
	  unsigned Opcode = Op.getOpcode();
	  if (VT.getScalarType() == MVT::i1) {
	    SDLoc dl(Op);
	    switch (Opcode) {
	    default: llvm_unreachable("Expected saturated arithmetic opcode");
	    case ISD::UADDSAT:
	    case ISD::SADDSAT:
	      // *addsat i1 X, Y --> X | Y
	      return DAG.getNode(ISD::OR, dl, VT, X, Y);
	    case ISD::USUBSAT:
	    case ISD::SSUBSAT:
	      // *subsat i1 X, Y --> X & ~Y
	      return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
	    }
	  }

	  if (VT.is128BitVector()) {
	    // Avoid the generic expansion with min/max if we don't have pminu/pmaxu.
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
	                                                 *DAG.getContext(), VT);
	    SDLoc DL(Op);
	    if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
	      // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
	      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
	      SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
	      return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
	    }
	    if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
	      // usubsat X, Y --> (X >u Y) ? X - Y : 0
	      SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
	      SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
	      return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
	    }
	    // Use default expansion.
	    return SDValue();
	  }

	  // 512-bit byte/word vectors are split; they are checked first so that they
	  // do not trip the 256-bit assertion below.
	  if (VT == MVT::v32i16 || VT == MVT::v64i8)
	    return splitVectorIntBinary(Op, DAG);

	  assert(Op.getSimpleValueType().is256BitVector() &&
	         Op.getSimpleValueType().isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  return splitVectorIntBinary(Op, DAG);
	}

	// Custom lowering for integer ABS.
	//  - scalar i16/i32/i64: NEG (as X86ISD::SUB 0, X) + CMOV on its flags,
	//  - v2i64/v4i64 with SSE4.1: BLENDV between X and 0-X keyed on X,
	//  - 256-bit without AVX2, and v32i16/v64i8 without BWI: split in half,
	//  - otherwise an empty SDValue is returned to request default expansion.
	static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
	                        SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
	    // Since X86 does not have CMOV for 8-bit integer, we don't convert
	    // 8-bit integer abs to NEG and CMOV.
	    SDLoc DL(Op);
	    SDValue N0 = Op.getOperand(0);
	    // Neg has two results: the negated value and the flags (i32).
	    SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
	                              DAG.getConstant(0, DL, VT), N0);
	    SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
	                     SDValue(Neg.getNode(), 1)};
	    return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
	  }

	  // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
	  if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
	    SDLoc DL(Op);
	    SDValue Src = Op.getOperand(0);
	    SDValue Sub =
	        DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
	    return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
	  }

	  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
	    assert(VT.isInteger() &&
	           "Only handle AVX 256-bit vector integer operation");
	    return splitVectorIntUnary(Op, DAG);
	  }

	  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
	    return splitVectorIntUnary(Op, DAG);

	  // Default to expand.
	  return SDValue();
	}

	// Custom lowering for vector SMIN/SMAX/UMIN/UMAX.
	//  - 256-bit non-i64 vectors and v32i16/v64i8 are split in half,
	//  - v8i16 UMIN/UMAX is done via SMIN/SMAX with the sign bit flipped,
	//  - everything else is expanded to a compare + select.
	static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  // For AVX1 cases, split to use legal ops (everything but v4i64).
	  if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
	    return splitVectorIntBinary(Op, DAG);

	  if (VT == MVT::v32i16 || VT == MVT::v64i8)
	    return splitVectorIntBinary(Op, DAG);

	  SDLoc DL(Op);
	  unsigned Opcode = Op.getOpcode();
	  SDValue N0 = Op.getOperand(0);
	  SDValue N1 = Op.getOperand(1);

	  // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
	  // using the SMIN/SMAX instructions and flipping the signbit back.
	  if (VT == MVT::v8i16) {
	    assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
	           "Unexpected MIN/MAX opcode");
	    SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
	    N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
	    N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
	    Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
	    SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
	    return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
	  }

	  // Else, expand to a compare/select.
	  ISD::CondCode CC;
	  switch (Opcode) {
	  case ISD::SMIN: CC = ISD::CondCode::SETLT;  break;
	  case ISD::SMAX: CC = ISD::CondCode::SETGT;  break;
	  case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
	  case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
	  default: llvm_unreachable("Unknown MINMAX opcode");
	  }

	  // select (N0 CC N1), N0, N1
	  SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
	  return DAG.getSelect(DL, VT, Cond, N0, N1);
	}

	// Custom lowering for vector integer MUL.
	//  - i1 elements: AND,
	//  - 256-bit without AVX2, v32i16/v64i8 without BWI: split in half,
	//  - vXi8: widen to vXi16, multiply, and narrow the low bytes back,
	//  - v4i32 (SSE2 without SSE4.1): two PMULUDQ on even/odd lanes + shuffle,
	//  - vXi64 (without DQI): three 32x32->64 PMULUDQ partial products.
	static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
	                        SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  if (VT.getScalarType() == MVT::i1)
	    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));

	  // Decompose 256-bit ops into 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return splitVectorIntBinary(Op, DAG);

	  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
	    return splitVectorIntBinary(Op, DAG);

	  SDValue A = Op.getOperand(0);
	  SDValue B = Op.getOperand(1);

	  // Lower v16i8/v32i8/v64i8 mul as any-extension to v8i16/v16i16/v32i16
	  // vector pairs, multiply and truncate.
	  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
	    unsigned NumElts = VT.getVectorNumElements();

	    // If the whole vector can be widened in one go, do a single multiply.
	    if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
	        (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
	      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
	      return DAG.getNode(
	          ISD::TRUNCATE, dl, VT,
	          DAG.getNode(ISD::MUL, dl, ExVT,
	                      DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
	                      DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
	    }

	    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

	    // Extract the lo/hi parts to any extend to i16.
	    // Only the low byte of each result element of the pmullw is kept (the
	    // results are masked with 255 below), so it doesn't matter what's in the
	    // high byte of each 16-bit element.
	    SDValue Undef = DAG.getUNDEF(VT);
	    SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
	    SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));

	    SDValue BLo, BHi;
	    if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
	      // If the second operand (B) is a constant, manually unpackl/unpackh:
	      // for every group of 16 bytes, bytes 0-7 go to Lo and 8-15 go to Hi.
	      SmallVector<SDValue, 16> LoOps, HiOps;
	      for (unsigned i = 0; i != NumElts; i += 16) {
	        for (unsigned j = 0; j != 8; ++j) {
	          LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
	                                               MVT::i16));
	          HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
	                                               MVT::i16));
	        }
	      }

	      BLo = DAG.getBuildVector(ExVT, dl, LoOps);
	      BHi = DAG.getBuildVector(ExVT, dl, HiOps);
	    } else {
	      BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
	      BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
	    }

	    // Multiply, mask the lower 8bits of the lo/hi results and pack.
	    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
	    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
	    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
	    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
	    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	  }

	  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
	  if (VT == MVT::v4i32) {
	    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
	           "Should not custom lower when pmulld is available!");

	    // Extract the odd parts.
	    static const int UnpackMask[] = { 1, -1, 3, -1 };
	    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
	    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

	    // Multiply the even parts.
	    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
	                                DAG.getBitcast(MVT::v2i64, A),
	                                DAG.getBitcast(MVT::v2i64, B));
	    // Now multiply odd parts.
	    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
	                               DAG.getBitcast(MVT::v2i64, Aodds),
	                               DAG.getBitcast(MVT::v2i64, Bodds));

	    Evens = DAG.getBitcast(VT, Evens);
	    Odds = DAG.getBitcast(VT, Odds);

	    // Merge the two vectors back together with a shuffle. This expands into 2
	    // shuffles.
	    static const int ShufMask[] = { 0, 4, 2, 6 };
	    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
	  }

	  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
	         "Only know how to lower V2I64/V4I64/V8I64 multiply");
	  assert(!Subtarget.hasDQI() && "DQI should use MULLQ");

	  //  Ahi = psrlqi(a, 32);
	  //  Bhi = psrlqi(b, 32);
	  //
	  //  AloBlo = pmuludq(a, b);
	  //  AloBhi = pmuludq(a, Bhi);
	  //  AhiBlo = pmuludq(Ahi, b);
	  //
	  //  Hi = psllqi(AloBhi + AhiBlo, 32);
	  //  return AloBlo + Hi;
	  KnownBits AKnown = DAG.computeKnownBits(A);
	  KnownBits BKnown = DAG.computeKnownBits(B);

	  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
	  bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
	  bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);

	  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
	  bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
	  bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);

	  SDValue Zero = DAG.getConstant(0, dl, VT);

	  // Only multiply lo/hi halves that aren't known to be zero.
	  SDValue AloBlo = Zero;
	  if (!ALoIsZero && !BLoIsZero)
	    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);

	  SDValue AloBhi = Zero;
	  if (!ALoIsZero && !BHiIsZero) {
	    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
	    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
	  }

	  SDValue AhiBlo = Zero;
	  if (!AHiIsZero && !BLoIsZero) {
	    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
	    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
	  }

	  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
	  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);

	  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
	}

	// Custom lowering for vector MULHS/MULHU (high half of a widening multiply).
	//  - 256-bit without AVX2, v32i16/v64i8 without BWI: split in half,
	//  - vXi32: two PMULDQ/PMULUDQ on even/odd lanes, then shuffle the high
	//    halves together (with a fixup for signed multiplies without SSE4.1),
	//  - vXi8: widen to vXi16, multiply, shift the upper byte down and pack.
	static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  bool IsSigned = Op->getOpcode() == ISD::MULHS;
	  unsigned NumElts = VT.getVectorNumElements();
	  SDValue A = Op.getOperand(0);
	  SDValue B = Op.getOperand(1);

	  // Decompose 256-bit ops into 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return splitVectorIntBinary(Op, DAG);

	  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
	    return splitVectorIntBinary(Op, DAG);

	  if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
	    assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
	           (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
	           (VT == MVT::v16i32 && Subtarget.hasAVX512()));

	    // PMULxD operations multiply each even value (starting at 0) of LHS with
	    // the related value of RHS and produce a widen result.
	    // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
	    // => <2 x i64> <ae|cg>
	    //
	    // In other word, to have all the results, we need to perform two PMULxD:
	    // 1. one with the even values.
	    // 2. one with the odd values.
	    // To achieve #2, we need to place the odd values at an even position.
	    //
	    // Place the odd value at an even position (basically, shift all values 1
	    // step to the left):
	    const int Mask[] = {1, -1,  3, -1,  5, -1,  7, -1,
	                        9, -1, 11, -1, 13, -1, 15, -1};
	    // <a|b|c|d> => <b|undef|d|undef>
	    SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
	                                        makeArrayRef(&Mask[0], NumElts));
	    // <e|f|g|h> => <f|undef|h|undef>
	    SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
	                                        makeArrayRef(&Mask[0], NumElts));

	    // Emit two multiplies, one for the lower 2 ints and one for the higher 2
	    // ints.
	    MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
	    unsigned Opcode =
	        (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
	    // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
	    // => <2 x i64> <ae|cg>
	    SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
	                                                  DAG.getBitcast(MulVT, A),
	                                                  DAG.getBitcast(MulVT, B)));
	    // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
	    // => <2 x i64> <bf|dh>
	    SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
	                                                  DAG.getBitcast(MulVT, Odd0),
	                                                  DAG.getBitcast(MulVT, Odd1)));

	    // Shuffle it back into the right order: result lane i takes the odd
	    // (upper) 32-bit word of the 64-bit product that was computed for lane i,
	    // from Mul1 for even i and from Mul2 for odd i.
	    SmallVector<int, 16> ShufMask(NumElts);
	    for (int i = 0; i != (int)NumElts; ++i)
	      ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;

	    SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);

	    // If we have a signed multiply but no PMULDQ fix up the result of an
	    // unsigned multiply.
	    if (IsSigned && !Subtarget.hasSSE41()) {
	      SDValue Zero = DAG.getConstant(0, dl, VT);
	      // T1 = (A < 0) ? B : 0, T2 = (B < 0) ? A : 0
	      SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
	                               DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
	      SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
	                               DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);

	      SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
	      Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
	    }

	    return Res;
	  }

	  // Only i8 vectors should need custom lowering after this.
	  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
	          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
	         "Unsupported vector type");

	  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
	  // logical shift down the upper half and pack back to i8.

	  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
	  // and then ashr/lshr the upper bits down to the lower bits before multiply.
	  unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

	  // If the whole vector can be widened in one go, do a single multiply.
	  if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
	      (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
	    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
	    SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
	    SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
	    SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
	    Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
	  }

	  // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
	  // to a vXi16 type. Do the multiplies, shift the results and pack the half
	  // lane results back together.

	  MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

	  // Moves the upper 8 bytes of a v16i8 into the lower 8 byte positions.
	  static const int PSHUFDMask[] = { 8,  9, 10, 11, 12, 13, 14, 15,
	                                   -1, -1, -1, -1, -1, -1, -1, -1};

	  // Extract the lo parts and zero/sign extend to i16.
	  // Only use SSE4.1 instructions for signed v16i8 where using unpack requires
	  // shifts to sign extend. Using unpack for unsigned only requires an xor to
	  // create zeros and a copy due to tied registers constraints pre-avx. But using
	  // zero_extend_vector_inreg would require an additional pshufd for the high
	  // part.

	  SDValue ALo, AHi;
	  if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
	    ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);

	    AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
	    AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
	  } else if (IsSigned) {
	    // Unpack with the value in the high byte of each i16, then arithmetic
	    // shift right by 8 to sign extend it into the low byte.
	    ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
	    AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));

	    ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
	    AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
	  } else {
	    // Unpack against zero to zero extend.
	    ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
	                                          DAG.getConstant(0, dl, VT)));
	    AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
	                                          DAG.getConstant(0, dl, VT)));
	  }

	  SDValue BLo, BHi;
	  if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
	    // If the second operand (B) is a constant, manually unpackl/unpackh and
	    // extend: for every group of 16 bytes, bytes 0-7 go to Lo and 8-15 to Hi.
	    SmallVector<SDValue, 16> LoOps, HiOps;
	    for (unsigned i = 0; i != NumElts; i += 16) {
	      for (unsigned j = 0; j != 8; ++j) {
	        SDValue LoOp = B.getOperand(i + j);
	        SDValue HiOp = B.getOperand(i + j + 8);

	        if (IsSigned) {
	          LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
	          HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
	        } else {
	          LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
	          HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
	        }

	        LoOps.push_back(LoOp);
	        HiOps.push_back(HiOp);
	      }
	    }

	    BLo = DAG.getBuildVector(ExVT, dl, LoOps);
	    BHi = DAG.getBuildVector(ExVT, dl, HiOps);
	  } else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
	    BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);

	    BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
	    BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
	  } else if (IsSigned) {
	    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
	    BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));

	    BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
	    BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
	  } else {
	    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
	                                          DAG.getConstant(0, dl, VT)));
	    BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
	                                          DAG.getConstant(0, dl, VT)));
	  }

	  // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
	  // pack back to vXi8.
	  SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
	  SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
	  RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
	  RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);

	  // Bitcast back to VT and then pack all the even elements from Lo and Hi.
	  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	}

	// Lower a 128-bit integer division/remainder on Win64 to a runtime library
	// call. Every 128-bit operand is stored to a 16-byte aligned stack temporary
	// and passed by pointer; the call result is typed as v2i64 and bitcast back
	// to the original 128-bit integer type.
	SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget.isTargetWin64() && "Unexpected target");
	  EVT VT = Op.getValueType();
	  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
	         "Unexpected return type for lowering");

	  // Pick the libcall and its signedness from the opcode.
	  RTLIB::Libcall LC;
	  bool isSigned;
	  switch (Op->getOpcode()) {
	  default: llvm_unreachable("Unexpected request for libcall!");
	  case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
	  case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
	  case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
	  case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
	  case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
	  case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
	  }

	  SDLoc dl(Op);
	  SDValue InChain = DAG.getEntryNode();

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
	    EVT ArgVT = Op->getOperand(i).getValueType();
	    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
	           "Unexpected argument type for lowering");
	    // Spill the operand to a stack slot and chain the store so that it is
	    // ordered before the call.
	    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
	    int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
	    MachinePointerInfo MPI =
	        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
	    Entry.Node = StackPtr;
	    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
	                           MPI, /* Alignment = */ 16);
	    // The argument is passed as a pointer to the 128-bit value.
	    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	    Entry.Ty = PointerType::get(ArgTy,0);
	    Entry.IsSExt = false;
	    Entry.IsZExt = false;
	    Args.push_back(Entry);
	  }

	  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
	                                         getPointerTy(DAG.getDataLayout()));

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(InChain)
	      .setLibCallee(
	          getLibcallCallingConv(LC),
	          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
	          std::move(Args))
	      .setInRegister()
	      .setSExtResult(isSigned)
	      .setZExtResult(!isSigned);

	  // CallInfo.first is the returned value; the output chain is not used.
	  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
	  return DAG.getBitcast(VT, CallInfo.first);
	}

	// Return true if the required (according to Opcode) shift-imm form is natively
	// supported by the Subtarget.
	//
	// Opcode is an ISD shift opcode; ISD::SRA selects the arithmetic-shift
	// answer, every other opcode the logical-shift answer.
	static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
	                                        unsigned Opcode) {
	  // Byte element shifts are never reported as supported here.
	  if (VT.getScalarSizeInBits() < 16)
	    return false;

	  // 512-bit vectors: 32/64-bit elements need AVX512, 16-bit elements
	  // additionally need BWI.
	  if (VT.is512BitVector() && Subtarget.hasAVX512() &&
	      (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
	    return true;

	  bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
	                (VT.is256BitVector() && Subtarget.hasInt256());

	  // 64-bit element arithmetic shifts are only reported with AVX512.
	  bool AShift = LShift && (Subtarget.hasAVX512() ||
	                           (VT != MVT::v2i64 && VT != MVT::v4i64));
	  return (Opcode == ISD::SRA) ? AShift : LShift;
	}

	// Query for shifts whose amount is a run-time value shared by all vector
	// lanes. Those instructions are defined together with the shift-immediate
	// forms, so the answer is whatever the shift-immediate query reports.
	static bool SupportedVectorShiftWithBaseAmnt(MVT VT,
	                                             const X86Subtarget &Subtarget,
	                                             unsigned Opcode) {
	  const bool HasImmForm = SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
	  return HasImmForm;
	}

	// Return true if the required (according to Opcode) variable-shift form is
	// natively supported by the Subtarget.
	//
	// Opcode is an ISD shift opcode; ISD::SRA selects the arithmetic-shift
	// answer, every other opcode the logical-shift answer.
	static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
	                                    unsigned Opcode) {

	  // Per-element variable shifts need at least AVX2, and are never reported
	  // for byte elements.
	  if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
	    return false;

	  // vXi16 supported only on AVX-512, BWI
	  if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
	    return false;

	  if (Subtarget.hasAVX512())
	    return true;

	  // AVX2 only: logical shifts on 128/256-bit vectors, arithmetic shifts on
	  // everything except 64-bit elements.
	  bool LShift = VT.is128BitVector() || VT.is256BitVector();
	  bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
	  return (Opcode == ISD::SRA) ? AShift : LShift;
	}

	// Lower a vector SHL/SRL/SRA whose shift amount is a constant splat (the same
	// constant for every lane). Returns an empty SDValue when the amount is not a
	// constant splat or when no special lowering applies.
	//  - natively supported shift-by-immediate: emitted directly,
	//  - v2i64/v4i64 SRA: built from 32-bit shifts and a shuffle,
	//  - vXi8: shifted as vXi16 and the bits that crossed a byte boundary are
	//    masked off (SRA is derived from SRL with an xor/sub sign fixup).
	static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);
	  SDValue R = Op.getOperand(0);
	  SDValue Amt = Op.getOperand(1);
	  unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);

	  // Emulate a 64-bit element arithmetic shift right by ShiftAmt.
	  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
	    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
	    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
	    SDValue Ex = DAG.getBitcast(ExVT, R);

	    // ashr(R, 63) === cmp_slt(R, 0)
	    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
	      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
	             "Unsupported PCMPGT op");
	      return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
	    }

	    if (ShiftAmt >= 32) {
	      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
	      SDValue Upper =
	          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
	      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
	                                                 ShiftAmt - 32, DAG);
	      if (VT == MVT::v2i64)
	        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
	      if (VT == MVT::v4i64)
	        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
	                                  {9, 1, 11, 3, 13, 5, 15, 7});
	    } else {
	      // SRA upper i32, SRL whole i64 and select lower i32.
	      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
	                                                 ShiftAmt, DAG);
	      SDValue Lower =
	          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
	      Lower = DAG.getBitcast(ExVT, Lower);
	      if (VT == MVT::v2i64)
	        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
	      if (VT == MVT::v4i64)
	        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
	                                  {8, 1, 10, 3, 12, 5, 14, 7});
	    }
	    return DAG.getBitcast(VT, Ex);
	  };

	  // Optimize shl/srl/sra with constant shift amount.
	  APInt APIntShiftAmt;
	  if (!X86::isConstantSplat(Amt, APIntShiftAmt))
	    return SDValue();

	  // If the shift amount is out of range, return undef.
	  if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
	    return DAG.getUNDEF(VT);

	  uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();

	  if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
	    return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

	  // i64 SRA needs to be performed as partial shifts.
	  if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
	       (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
	      Op.getOpcode() == ISD::SRA)
	    return ArithmeticShiftRight64(ShiftAmt);

	  if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
	      (Subtarget.hasBWI() && VT == MVT::v64i8)) {
	    unsigned NumElts = VT.getVectorNumElements();
	    MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

	    // Simple i8 add case
	    if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
	      return DAG.getNode(ISD::ADD, dl, VT, R, R);

	    // ashr(R, 7) === cmp_slt(R, 0)
	    if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
	      SDValue Zeros = DAG.getConstant(0, dl, VT);
	      if (VT.is512BitVector()) {
	        assert(VT == MVT::v64i8 && "Unexpected element type!");
	        SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
	        return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
	      }
	      return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
	    }

	    // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
	    if (VT == MVT::v16i8 && Subtarget.hasXOP())
	      return SDValue();

	    if (Op.getOpcode() == ISD::SHL) {
	      // Make a large shift.
	      SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
	                                               ShiftAmt, DAG);
	      SHL = DAG.getBitcast(VT, SHL);
	      // Zero out the rightmost bits.
	      APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
	      return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
	    }
	    if (Op.getOpcode() == ISD::SRL) {
	      // Make a large shift.
	      SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
	                                               ShiftAmt, DAG);
	      SRL = DAG.getBitcast(VT, SRL);
	      // Zero out the leftmost bits.
	      return DAG.getNode(ISD::AND, dl, VT, SRL,
	                         DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
	    }
	    if (Op.getOpcode() == ISD::SRA) {
	      // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
	      SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);

	      // Mask holds the position the sign bit ends up in after the logical
	      // shift.
	      SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
	      Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
	      Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
	      return Res;
	    }
	    llvm_unreachable("Unknown shift opcode.");
	  }

	  return SDValue();
	}

	/// Try to lower a vector shift whose amount is the same for every element.
	///
	/// Two shapes of "uniform" amount are recognised:
	///  * Amt is a splat of one scalar (DAG.getSplatValue succeeds), and
	///  * for v2i64 only, Amt is a bitcast of a BUILD_VECTOR whose operands
	///    repeat with a period of one 64-bit lane.
	///
	/// Returns the lowered node, or an empty SDValue when neither shape applies
	/// or the subtarget does not report support for the required shift.
	static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	MVT VT = Op.getSimpleValueType();
	SDLoc dl(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);
	unsigned Opcode = Op.getOpcode();
	// Two target opcodes are looked up for the same generic opcode, differing
	// only in the boolean flag passed to the helper.
	// NOTE(review): presumably 'false' selects the immediate/scalar-amount form
	// and 'true' the vector-amount form - confirm against the helper.
	unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
	unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);

	// Case 1: the amount is a splat of a single scalar value.
	if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
	if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
	MVT EltVT = VT.getVectorElementType();
	assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
	// Widen the scalar amount: to i64 for element types wider than i32
	// (other than i64 itself), to i32 for element types narrower than i32.
	// i32 and i64 amounts are passed through unchanged.
	if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
	BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
	else if (EltVT.bitsLT(MVT::i32))
	BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

	return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
	}

	// vXi8 shifts - shift as v8i16 + mask result.
	if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) \|\|
	(VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) \|\|
	VT == MVT::v64i8) &&
	!Subtarget.hasXOP()) {
	unsigned NumElts = VT.getVectorNumElements();
	// Same total width as VT, but viewed as half as many i16 elements.
	MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
	if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
	// The wide shift itself is always logical (SHL stays SHL, SRL and SRA
	// both use SRL); SRA is fixed up afterwards with the xor/sub trick.
	unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
	unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
	BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

	// Create the mask using vXi16 shifts. For shift-rights we need to move
	// the upper byte down before splatting the vXi8 mask.
	SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
	BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
	BaseShAmt, Subtarget, DAG);
	if (Opcode != ISD::SHL)
	BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
	8, DAG);
	BitMask = DAG.getBitcast(VT, BitMask);
	// An all-zero shuffle mask broadcasts byte 0 of BitMask to every byte.
	BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
	SmallVector<int, 64>(NumElts, 0));

	// Shift the data as i16 lanes, then clear the bits that crossed a byte
	// boundary by ANDing with the per-byte mask.
	SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
	DAG.getBitcast(ExtVT, R), BaseShAmt,
	Subtarget, DAG);
	Res = DAG.getBitcast(VT, Res);
	Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);

	if (Opcode == ISD::SRA) {
	// ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
	// SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
	// 0x8080 places a sign bit (0x80) in both bytes of each i16 lane.
	SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
	SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
	BaseShAmt, Subtarget, DAG);
	SignMask = DAG.getBitcast(VT, SignMask);
	Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
	Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
	}
	return Res;
	}
	}
	}

	// Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
	if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
	Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
	// Look through the bitcast at the BUILD_VECTOR of narrower elements.
	Amt = Amt.getOperand(0);
	// Number of BUILD_VECTOR operands that make up one 64-bit lane.
	unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
	// Remember the operands of the first lane...
	std::vector<SDValue> Vals(Ratio);
	for (unsigned i = 0; i != Ratio; ++i)
	Vals[i] = Amt.getOperand(i);
	// ...and bail out unless every following lane repeats them exactly.
	for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
	for (unsigned j = 0; j != Ratio; ++j)
	if (Vals[j] != Amt.getOperand(i + j))
	return SDValue();
	}

	// Note the node is built from the original amount operand
	// (Op.getOperand(1)), not from the looked-through BUILD_VECTOR.
	if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
	return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
	}
	return SDValue();
	}

	// Turn a vector of left shift/rotate amounts into the matching vector of
	// multipliers, i.e. element i becomes (1 << Amt[i]). Handles v8i16 and v4i32
	// always, v16i16 when the subtarget has Int256, and v16i8 when it lacks
	// AVX512. An empty SDValue is returned for every other type, and also when
	// no conversion strategy below applies.
	static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  MVT VT = Amt.getSimpleValueType();

	  // Filter out the vector types this routine does not handle.
	  switch (VT.SimpleTy) {
	  case MVT::v8i16:
	  case MVT::v4i32:
	    break;
	  case MVT::v16i16:
	    if (!Subtarget.hasInt256())
	      return SDValue();
	    break;
	  case MVT::v16i8:
	    if (Subtarget.hasAVX512())
	      return SDValue();
	    break;
	  default:
	    return SDValue();
	  }

	  // Constant amounts: compute every scale element directly.
	  if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
	    MVT EltVT = VT.getVectorElementType();
	    unsigned EltBits = EltVT.getSizeInBits();
	    unsigned EltCount = VT.getVectorNumElements();
	    APInt One(EltBits, 1);
	    SmallVector<SDValue, 8> ScaleElts;

	    for (unsigned Idx = 0; Idx != EltCount; ++Idx) {
	      SDValue AmtElt = Amt->getOperand(Idx);

	      // Undef amounts are forwarded untouched.
	      if (AmtElt->isUndef()) {
	        ScaleElts.push_back(AmtElt);
	        continue;
	      }

	      // Read the amount at element width; an amount that is not smaller
	      // than the element width yields an undef scale element.
	      APInt AmtBits(EltBits, cast<ConstantSDNode>(AmtElt)->getZExtValue());
	      uint64_t ShAmt = AmtBits.getZExtValue();
	      if (ShAmt < EltBits)
	        ScaleElts.push_back(DAG.getConstant(One.shl(ShAmt), dl, EltVT));
	      else
	        ScaleElts.push_back(DAG.getUNDEF(EltVT));
	    }
	    return DAG.getBuildVector(VT, dl, ScaleElts);
	  }

	  // Non-constant v4i32: move the amount into bit 23 and up (the float
	  // exponent field), add the bit pattern of 1.0f, reinterpret the lanes as
	  // floats and convert back to integers to obtain the power of two.
	  if (VT == MVT::v4i32) {
	    SDValue ExpShift = DAG.getConstant(23, dl, VT);
	    Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, ExpShift);
	    SDValue OneAsFloatBits = DAG.getConstant(0x3f800000U, dl, VT);
	    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, OneAsFloatBits);
	    Amt = DAG.getBitcast(MVT::v4f32, Amt);
	    return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
	  }

	  // Non-constant v8i16 without AVX2 (AVX2 can more effectively perform this
	  // as a zext/trunc to/from v8i32): interleave with zero to form two v4i32
	  // halves, convert each half recursively, then narrow back to v8i16.
	  if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
	    SDValue ZeroVec = DAG.getConstant(0, dl, VT);
	    SDValue LoHalf =
	        DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, ZeroVec));
	    SDValue HiHalf =
	        DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, ZeroVec));
	    LoHalf = convertShiftLeftToScale(LoHalf, dl, Subtarget, DAG);
	    HiHalf = convertShiftLeftToScale(HiHalf, dl, Subtarget, DAG);

	    if (!Subtarget.hasSSE41()) {
	      // Without SSE4.1, pick the even i16 elements of both halves.
	      return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LoHalf),
	                                  DAG.getBitcast(VT, HiHalf),
	                                  {0, 2, 4, 6, 8, 10, 12, 14});
	    }
	    return DAG.getNode(X86ISD::PACKUS, dl, VT, LoHalf, HiHalf);
	  }

	  return SDValue();
	}

	/// Custom-lower a vector shift node (the code below distinguishes ISD::SHL,
	/// ISD::SRL and ISD::SRA). Asserts that the value type is a vector and that
	/// the subtarget has SSE2.
	///
	/// A sequence of strategies is tried in order and the first one that applies
	/// returns: uniform-amount lowerings, natively supported variable shifts
	/// (Op is returned unchanged), XOP shifts, per-type emulations (v2i64, i64
	/// arithmetic shift, constant-amount blends, multiply-based shifts, v4i32
	/// per-lane shifts, widening, vXi8/vXi16 bit-by-bit selects) and finally
	/// splitting wide vectors. An empty SDValue is returned if nothing applies.
	static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = Op.getSimpleValueType();
	SDLoc dl(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);
	unsigned EltSizeInBits = VT.getScalarSizeInBits();
	bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

	unsigned Opc = Op.getOpcode();
	unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
	unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);

	assert(VT.isVector() && "Custom lowering only for vector shifts!");
	assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");

	// Try the uniform-amount lowerings first.
	if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
	return V;

	if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
	return V;

	// Nothing to do if the helper reports this variable shift as supported.
	if (SupportedVectorVarShift(VT, Subtarget, Opc))
	return Op;

	// XOP has 128-bit variable logical/arithmetic shifts.
	// +ve/-ve Amt = shift left/right.
	if (Subtarget.hasXOP() && (VT == MVT::v2i64 \|\| VT == MVT::v4i32 \|\|
	VT == MVT::v8i16 \|\| VT == MVT::v16i8)) {
	// Right shifts are expressed as a shift by the negated amount.
	if (Opc == ISD::SRL \|\| Opc == ISD::SRA) {
	SDValue Zero = DAG.getConstant(0, dl, VT);
	Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
	}
	if (Opc == ISD::SHL \|\| Opc == ISD::SRL)
	return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
	if (Opc == ISD::SRA)
	return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
	}

	// 2i64 vector logical shifts can efficiently avoid scalarization - do the
	// shifts per-lane and then shuffle the partial results back together.
	if (VT == MVT::v2i64 && Opc != ISD::SRA) {
	// Splat the shift amounts so the scalar shifts above will catch it.
	SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
	SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
	SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
	SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
	// Element 0 comes from R0, element 1 from R1 (index 3 = R1[1]).
	return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
	}

	// i64 vector arithmetic shift can be emulated with the transform:
	// M = lshr(SIGN_MASK, Amt)
	// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
	if ((VT == MVT::v2i64 \|\| (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
	Opc == ISD::SRA) {
	SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
	SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
	R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
	R = DAG.getNode(ISD::XOR, dl, VT, R, M);
	R = DAG.getNode(ISD::SUB, dl, VT, R, M);
	return R;
	}

	// If possible, lower this shift as a sequence of two shifts by
	// constant plus a BLENDing shuffle instead of scalarizing it.
	// Example:
	// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
	//
	// Could be rewritten as:
	// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
	//
	// The advantage is that the two shifts from the example would be
	// lowered as X86ISD::VSRLI nodes in parallel before blending.
	if (ConstantAmt && (VT == MVT::v8i16 \|\| VT == MVT::v4i32 \|\|
	(VT == MVT::v16i16 && Subtarget.hasInt256()))) {
	SDValue Amt1, Amt2;
	unsigned NumElts = VT.getVectorNumElements();
	SmallVector<int, 8> ShuffleMask;
	// Classify every element as using the first distinct amount (Amt1), the
	// second distinct amount (Amt2) or undef. Stop early on a third distinct
	// amount, which leaves ShuffleMask shorter than NumElts.
	for (unsigned i = 0; i != NumElts; ++i) {
	SDValue A = Amt->getOperand(i);
	if (A.isUndef()) {
	ShuffleMask.push_back(SM_SentinelUndef);
	continue;
	}
	if (!Amt1 \|\| Amt1 == A) {
	ShuffleMask.push_back(i);
	Amt1 = A;
	continue;
	}
	if (!Amt2 \|\| Amt2 == A) {
	ShuffleMask.push_back(i + NumElts);
	Amt2 = A;
	continue;
	}
	break;
	}

	// Only perform this blend if we can perform it without loading a mask.
	if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
	(VT != MVT::v16i16 \|\|
	is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
	(VT == MVT::v4i32 \|\| Subtarget.hasSSE41() \|\| Opc != ISD::SHL \|\|
	canWidenShuffleElements(ShuffleMask))) {
	auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
	auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
	// Both amounts must be in range for the element width.
	if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
	Cst2->getAPIntValue().ult(EltSizeInBits)) {
	SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
	Cst1->getZExtValue(), DAG);
	SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
	Cst2->getZExtValue(), DAG);
	return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
	}
	}
	}

	// If possible, lower this packed shift into a vector multiply instead of
	// expanding it into a sequence of scalar shifts.
	if (Opc == ISD::SHL)
	if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
	return DAG.getNode(ISD::MUL, dl, VT, R, Scale);

	// Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
	// can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
	if (Opc == ISD::SRL && ConstantAmt &&
	(VT == MVT::v8i16 \|\| (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
	SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
	SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
	if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
	// Lanes whose amount is zero take the unshifted input R instead of the
	// MULHU result.
	SDValue Zero = DAG.getConstant(0, dl, VT);
	SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
	SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
	return DAG.getSelect(dl, VT, ZAmt, R, Res);
	}
	}

	// Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
	// can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
	// TODO: Special case handling for shift by 0/1, really we can afford either
	// of these cases in pre-SSE41/XOP/AVX512 but not both.
	if (Opc == ISD::SRA && ConstantAmt &&
	(VT == MVT::v8i16 \|\| (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
	((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
	!Subtarget.hasAVX512()) \|\|
	DAG.isKnownNeverZero(Amt))) {
	SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
	SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
	if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
	// Amounts of 0 and 1 are patched up with selects: amount 0 takes R,
	// amount 1 takes an explicit arithmetic shift by one.
	// NOTE(review): presumably needed because the scale for amount 1
	// (1 << 15) is negative as a signed i16 - confirm.
	SDValue Amt0 =
	DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
	SDValue Amt1 =
	DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
	SDValue Sra1 =
	getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
	SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
	Res = DAG.getSelect(dl, VT, Amt0, R, Res);
	return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
	}
	}

	// v4i32 Non Uniform Shifts.
	// If the shift amount is constant we can shift each lane using the SSE2
	// immediate shifts, else we need to zero-extend each lane to the lower i64
	// and shift using the SSE2 variable shifts.
	// The separate results can then be blended together.
	if (VT == MVT::v4i32) {
	SDValue Amt0, Amt1, Amt2, Amt3;
	if (ConstantAmt) {
	// Splat each of the four constant amounts across a full vector.
	Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
	Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
	Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
	Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
	} else {
	// The SSE2 shifts use the lower i64 as the same shift amount for
	// all lanes and the upper i64 is ignored. On AVX we're better off
	// just zero-extending, but for SSE just duplicating the top 16-bits is
	// cheaper and has the same effect for out of range values.
	if (Subtarget.hasAVX()) {
	// Pair amount element i with a zero element (index 4..7 selects from Z)
	// to form a zero-extended 64-bit amount in the low half.
	SDValue Z = DAG.getConstant(0, dl, VT);
	Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
	Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
	Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
	Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
	} else {
	// Work on i16 elements: Amt23 moves the upper two amounts into the low
	// half so the same {lo16, hi16, hi16, hi16} pattern can be reused.
	SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
	SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
	{4, 5, 6, 7, -1, -1, -1, -1});
	Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
	{0, 1, 1, 1, -1, -1, -1, -1});
	Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
	{2, 3, 3, 3, -1, -1, -1, -1});
	Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
	{0, 1, 1, 1, -1, -1, -1, -1});
	Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
	{2, 3, 3, 3, -1, -1, -1, -1});
	}
	}

	// Constant amounts keep the generic opcode; variable amounts use the
	// target opcode looked up with the 'true' flag above.
	unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
	SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
	SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
	SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
	SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));

	// Merge the shifted lane results optimally with/without PBLENDW.
	// TODO - ideally shuffle combining would handle this.
	if (Subtarget.hasSSE41()) {
	SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
	SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
	return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
	}
	SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
	SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
	return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
	}

	// It's worth extending once and using the vXi16/vXi32 shifts for smaller
	// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
	// make the existing SSE solution better.
	// NOTE: We honor preferred vector width before promoting to 512-bits.
	if ((Subtarget.hasInt256() && VT == MVT::v8i16) \|\|
	(Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) \|\|
	(Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) \|\|
	(Subtarget.canExtendTo512BW() && VT == MVT::v32i8) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
	assert((!Subtarget.hasBWI() \|\| VT == MVT::v32i8 \|\| VT == MVT::v16i8) &&
	"Unexpected vector type");
	// Widen elements to i16 with BWI, otherwise to i32.
	MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
	MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
	// Arithmetic shifts need the sign replicated into the widened bits; the
	// amount is always zero-extended.
	unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	R = DAG.getNode(ExtOpc, dl, ExtVT, R);
	Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getNode(Opc, dl, ExtVT, R, Amt));
	}

	// Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
	// extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
	if (ConstantAmt && (Opc == ISD::SRA \|\| Opc == ISD::SRL) &&
	(VT == MVT::v16i8 \|\| (VT == MVT::v32i8 && Subtarget.hasInt256()) \|\|
	(VT == MVT::v64i8 && Subtarget.hasBWI())) &&
	!Subtarget.hasXOP()) {
	int NumElts = VT.getVectorNumElements();
	SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);

	// Extend constant shift amount to vXi16 (it doesn't matter if the type
	// isn't legal).
	MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
	Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
	// Amt becomes the i16 multiplier 1 << (8 - Amt); the high byte of the
	// 16-bit product then holds the shifted value.
	Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
	Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
	assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
	"Constant build vector expected");

	// v16i8 with Int256: extend the whole vector to v16i16, multiply, take
	// the high byte of each product and truncate back.
	if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
	R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
	: DAG.getZExtOrTrunc(R, dl, ExVT);
	R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
	R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
	return DAG.getZExtOrTrunc(R, dl, VT);
	}

	// Otherwise split the multipliers into the groups matching the low/high
	// unpacks: within each run of 16 elements, the first 8 go to LoAmt and
	// the next 8 to HiAmt.
	SmallVector<SDValue, 16> LoAmt, HiAmt;
	for (int i = 0; i != NumElts; i += 16) {
	for (int j = 0; j != 8; ++j) {
	LoAmt.push_back(Amt.getOperand(i + j));
	HiAmt.push_back(Amt.getOperand(i + j + 8));
	}
	}

	MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
	SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
	SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);

	// Unpacking R with itself duplicates each byte into both halves of an
	// i16; shifting right by 8 with the requested opcode then leaves the
	// byte sign- or zero-extended to 16 bits.
	SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
	SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
	LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
	HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
	LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
	HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
	LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
	HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
	return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
	}

	// Generic vXi8 variable shift: conditionally apply shifts by 4, 2 and 1,
	// each selected per byte by one bit of the amount.
	if (VT == MVT::v16i8 \|\|
	(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) \|\|
	(VT == MVT::v64i8 && Subtarget.hasBWI())) {
	MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);

	// Select V0 where the sign bit of Sel is set, V1 otherwise. The result
	// has type SelVT.
	auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
	if (VT.is512BitVector()) {
	// On AVX512BW targets we make use of the fact that VSELECT lowers
	// to a masked blend which selects bytes based just on the sign bit
	// extracted to a mask.
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	V0 = DAG.getBitcast(VT, V0);
	V1 = DAG.getBitcast(VT, V1);
	Sel = DAG.getBitcast(VT, Sel);
	Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
	ISD::SETGT);
	return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
	} else if (Subtarget.hasSSE41()) {
	// On SSE41 targets we can use PBLENDVB which selects bytes based just
	// on the sign bit.
	V0 = DAG.getBitcast(VT, V0);
	V1 = DAG.getBitcast(VT, V1);
	Sel = DAG.getBitcast(VT, Sel);
	return DAG.getBitcast(SelVT,
	DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
	}
	// On pre-SSE41 targets we test for the sign bit by comparing to
	// zero - a negative value will set all bits of the lanes to true
	// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
	SDValue Z = DAG.getConstant(0, dl, SelVT);
	SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
	return DAG.getSelect(dl, SelVT, C, V0, V1);
	};

	// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
	// We can safely do this using i16 shifts as we're only interested in
	// the 3 lower bits of each byte.
	Amt = DAG.getBitcast(ExtVT, Amt);
	Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
	Amt = DAG.getBitcast(VT, Amt);

	if (Opc == ISD::SHL \|\| Opc == ISD::SRL) {
	// r = VSELECT(r, shift(r, 4), a);
	SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	// (doubling moves the next lower amount bit into the sign position)
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 2), a);
	M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// return VSELECT(r, shift(r, 1), a);
	M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);
	return R;
	}

	if (Opc == ISD::SRA) {
	// For SRA we need to unpack each byte to the higher byte of a i16 vector
	// so we can correctly sign extend. We don't care what happens to the
	// lower byte.
	SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
	SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
	SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
	SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
	ALo = DAG.getBitcast(ExtVT, ALo);
	AHi = DAG.getBitcast(ExtVT, AHi);
	RLo = DAG.getBitcast(ExtVT, RLo);
	RHi = DAG.getBitcast(ExtVT, RHi);

	// r = VSELECT(r, shift(r, 4), a);
	SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
	SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// a += a
	ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
	AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

	// r = VSELECT(r, shift(r, 2), a);
	MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
	MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// a += a
	ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
	AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

	// r = VSELECT(r, shift(r, 1), a);
	MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
	MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// Logical shift the result back to the lower byte, leaving a zero upper
	// byte meaning that we can safely pack with PACKUSWB.
	RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
	RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
	return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	}
	}

	// v16i16 with Int256 and no XOP: perform the shift on v8i32. The amount is
	// unpacked with zero above it; the value is unpacked into the upper 16 bits
	// of each i32 (zero below), so after the 32-bit shift the result sits in
	// the upper half and is moved down by 16 before packing.
	if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
	MVT ExtVT = MVT::v8i32;
	SDValue Z = DAG.getConstant(0, dl, VT);
	SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
	SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
	SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
	SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
	ALo = DAG.getBitcast(ExtVT, ALo);
	AHi = DAG.getBitcast(ExtVT, AHi);
	RLo = DAG.getBitcast(ExtVT, RLo);
	RHi = DAG.getBitcast(ExtVT, RHi);
	SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
	SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
	Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
	Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
	return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
	}

	// Generic v8i16 variable shift: conditionally apply shifts by 8, 4, 2 and
	// 1, each selected per element by one bit of the amount.
	if (VT == MVT::v8i16) {
	// If we have a constant shift amount, the non-SSE41 path is best as
	// avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
	bool UseSSE41 = Subtarget.hasSSE41() &&
	!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

	// Select V0 where the sign bit of Sel is set, V1 otherwise.
	auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
	// On SSE41 targets we can use PBLENDVB which selects bytes based just on
	// the sign bit.
	if (UseSSE41) {
	MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
	V0 = DAG.getBitcast(ExtVT, V0);
	V1 = DAG.getBitcast(ExtVT, V1);
	Sel = DAG.getBitcast(ExtVT, Sel);
	return DAG.getBitcast(
	VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
	}
	// On pre-SSE41 targets we splat the sign bit - a negative value will
	// set all bits of the lanes to true and VSELECT uses that in
	// its OR(AND(V0,C),AND(V1,~C)) lowering.
	SDValue C =
	getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
	return DAG.getSelect(dl, VT, C, V0, V1);
	};

	// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
	if (UseSSE41) {
	// On SSE41 targets we need to replicate the shift mask in both
	// bytes for PBLENDVB.
	Amt = DAG.getNode(
	ISD::OR, dl, VT,
	getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
	getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
	} else {
	Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
	}

	// r = VSELECT(r, shift(r, 8), a);
	SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 4), a);
	M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 2), a);
	M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// return VSELECT(r, shift(r, 1), a);
	M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
	R = SignBitSelect(Amt, M, R);
	return R;
	}

	// Decompose 256-bit shifts into 128-bit shifts.
	if (VT.is256BitVector())
	return splitVectorIntBinary(Op, DAG);

	// Remaining v32i16/v64i8 shifts are likewise split into smaller pieces.
	if (VT == MVT::v32i16 \|\| VT == MVT::v64i8)
	return splitVectorIntBinary(Op, DAG);

	return SDValue();
	}

	/// Custom-lower a vector rotate node. Asserts that the value type is a
	/// vector.
	///
	/// Handled in order:
	///  * a constant splat amount that is a multiple of the element width
	///    returns the input unchanged;
	///  * AVX512 with elements of at least 32 bits: rotate-by-immediate for
	///    constant splats, otherwise Op is returned as is (both ROTL and ROTR
	///    are accepted on this path);
	///  * everything after that asserts ISD::ROTL: XOP, splitting of 256-bit
	///    vectors, a bit-by-bit select sequence for i8 elements, a shift/or
	///    expansion, and finally multiply-based rotates for i16 and v4i32.
	///
	/// Returns an empty SDValue where the code below chooses not to custom
	/// lower (uniform constant amounts and constant i8 amounts on the generic
	/// path).
	static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = Op.getSimpleValueType();
	assert(VT.isVector() && "Custom lowering only for vector rotates!");

	SDLoc DL(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);
	unsigned Opcode = Op.getOpcode();
	unsigned EltSizeInBits = VT.getScalarSizeInBits();
	int NumElts = VT.getVectorNumElements();

	// Check for constant splat rotation amount.
	APInt CstSplatValue;
	bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);

	// Check for splat rotate by zero.
	// (any multiple of the element width counts as zero)
	if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
	return R;

	// AVX512 implicitly uses modulo rotation amounts.
	if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
	// Attempt to rotate by immediate.
	if (IsCstSplat) {
	unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
	uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
	return DAG.getNode(RotOpc, DL, VT, R,
	DAG.getTargetConstant(RotAmt, DL, MVT::i8));
	}

	// Else, fall-back on VPROLV/VPRORV.
	return Op;
	}

	// From here on only left rotates are handled.
	assert((Opcode == ISD::ROTL) && "Only ROTL supported");

	// XOP has 128-bit vector variable + immediate rotates.
	// +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
	// XOP implicitly uses modulo rotation amounts.
	if (Subtarget.hasXOP()) {
	if (VT.is256BitVector())
	return splitVectorIntBinary(Op, DAG);
	assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

	// Attempt to rotate by immediate.
	if (IsCstSplat) {
	uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
	return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
	DAG.getTargetConstant(RotAmt, DL, MVT::i8));
	}

	// Use general rotate by variable (per-element).
	return Op;
	}

	// Split 256-bit integers on pre-AVX2 targets.
	if (VT.is256BitVector() && !Subtarget.hasAVX2())
	return splitVectorIntBinary(Op, DAG);

	assert((VT == MVT::v4i32 \|\| VT == MVT::v8i16 \|\| VT == MVT::v16i8 \|\|
	((VT == MVT::v8i32 \|\| VT == MVT::v16i16 \|\| VT == MVT::v32i8) &&
	Subtarget.hasAVX2())) &&
	"Only vXi32/vXi16/vXi8 vector rotates supported");

	// Rotate by a uniform constant - expand back to shifts.
	if (IsCstSplat)
	return SDValue();

	bool IsSplatAmt = DAG.isSplatValue(Amt);

	// v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
	// the amount bit.
	if (EltSizeInBits == 8 && !IsSplatAmt) {
	// Constant (non-splat) i8 amounts are not custom lowered here.
	if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
	return SDValue();

	// We don't need ModuloAmt here as we just peek at individual bits.
	MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

	// Select V0 where the sign bit of Sel is set, V1 otherwise. The result
	// has type SelVT.
	auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
	if (Subtarget.hasSSE41()) {
	// On SSE41 targets we can use PBLENDVB which selects bytes based just
	// on the sign bit.
	V0 = DAG.getBitcast(VT, V0);
	V1 = DAG.getBitcast(VT, V1);
	Sel = DAG.getBitcast(VT, Sel);
	return DAG.getBitcast(SelVT,
	DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
	}
	// On pre-SSE41 targets we test for the sign bit by comparing to
	// zero - a negative value will set all bits of the lanes to true
	// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
	SDValue Z = DAG.getConstant(0, DL, SelVT);
	SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
	return DAG.getSelect(DL, SelVT, C, V0, V1);
	};

	// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
	// We can safely do this using i16 shifts as we're only interested in
	// the 3 lower bits of each byte.
	Amt = DAG.getBitcast(ExtVT, Amt);
	Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
	Amt = DAG.getBitcast(VT, Amt);

	// r = VSELECT(r, rot(r, 4), a);
	// Each rot(r, k) is built as (r << k) | (r >> (8 - k)).
	SDValue M;
	M = DAG.getNode(
	ISD::OR, DL, VT,
	DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
	DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	// (doubling moves the next lower amount bit into the sign position)
	Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);

	// r = VSELECT(r, rot(r, 2), a);
	M = DAG.getNode(
	ISD::OR, DL, VT,
	DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
	DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);

	// return VSELECT(r, rot(r, 1), a);
	M = DAG.getNode(
	ISD::OR, DL, VT,
	DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
	DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
	return SignBitSelect(VT, Amt, M, R);
	}

	// ISD::ROT* uses modulo rotate amounts.
	// Masking with (EltSizeInBits - 1) reduces the amount modulo the element
	// width.
	Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
	DAG.getConstant(EltSizeInBits - 1, DL, VT));

	bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
	bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
	SupportedVectorVarShift(VT, Subtarget, ISD::SRL);

	// Fallback for splats + all supported variable shifts.
	// Fallback for non-constants AVX2 vXi16 as well.
	if (IsSplatAmt \|\| LegalVarShifts \|\| (Subtarget.hasAVX2() && !ConstantAmt)) {
	// rotl(R, Amt) = (R << Amt) | (R >> (EltSizeInBits - Amt))
	SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
	AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
	SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
	SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
	return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
	}

	// As with shifts, convert the rotation amount to a multiplication factor.
	SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
	assert(Scale && "Failed to convert ROTL amount to scale");

	// v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
	if (EltSizeInBits == 16) {
	SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
	SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
	return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
	}

	// v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
	// to v2i64 results at a time. The upper 32-bits contain the wrapped bits
	// that can then be OR'd with the lower 32-bits.
	assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
	// Move the odd elements (1 and 3) into the even positions so a second
	// PMULUDQ can process them.
	static const int OddMask[] = {1, -1, 3, -1};
	SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
	SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);

	SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
	DAG.getBitcast(MVT::v2i64, R),
	DAG.getBitcast(MVT::v2i64, Scale));
	SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
	DAG.getBitcast(MVT::v2i64, R13),
	DAG.getBitcast(MVT::v2i64, Scale13));
	Res02 = DAG.getBitcast(VT, Res02);
	Res13 = DAG.getBitcast(VT, Res13);

	// The first shuffle gathers the low 32-bit halves of the four products in
	// element order, the second gathers the high halves; OR them together.
	return DAG.getNode(ISD::OR, DL, VT,
	DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
	DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
	}

	/// Reports whether \p MemType must be handled with a double-width
	/// compare-exchange: a 64-bit type on a non-64-bit subtarget that has
	/// cmpxchg8b, or a 128-bit type on a subtarget that has cmpxchg16b. Any
	/// other width yields false. Atomic expansion uses this to choose between
	/// cmpxchg8b/16b and leaving the operation alone (in which case it becomes
	/// a __sync_fetch_and_... call).
	bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
	  unsigned WidthInBits = MemType->getPrimitiveSizeInBits();

	  switch (WidthInBits) {
	  case 64:
	    // Only relevant when 64 bits is wider than the native register width.
	    return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
	  case 128:
	    return Subtarget.hasCmpxchg16b();
	  default:
	    return false;
	  }
	}

	/// Decides whether the atomic store \p SI has to be expanded in IR.
	///
	/// A 64-bit store on a 32-bit subtarget is kept as-is (returns false) when
	/// soft-float is off, the enclosing function is not marked noimplicitfloat,
	/// and SSE1 or x87 is available. In every other case the store is expanded
	/// exactly when needsCmpXchgNb() accepts the stored type.
	bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
	  Type *MemType = SI->getValueOperand()->getType();

	  bool NoImplicitFloatOps =
	      SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
	  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
	      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
	      (Subtarget.hasSSE1() || Subtarget.hasX87()))
	    return false;

	  return needsCmpXchgNb(MemType);
	}

	/// Decides how the atomic load \p LI should be expanded in IR.
	///
	/// Note: this turns large loads into lock cmpxchg8b/16b.
	///
	/// \returns AtomicExpansionKind::None when the load can stay as it is, and
	/// AtomicExpansionKind::CmpXChg when needsCmpXchgNb() accepts the loaded
	/// type.
	TargetLowering::AtomicExpansionKind
	X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
	  Type *MemType = LI->getType();

	  // A 64-bit atomic load on a 32-bit target does not need cmpxchg8b when
	  // floating point code may be used implicitly (no soft-float, no
	  // noimplicitfloat attribute) and either SSE1 or x87 is available. With x87
	  // the value can be loaded into an 80-bit register and stored to a stack
	  // temporary.
	  bool NoImplicitFloatOps =
	      LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
	  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
	      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
	      (Subtarget.hasSSE1() || Subtarget.hasX87()))
	    return AtomicExpansionKind::None;

	  return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
	                                 : AtomicExpansionKind::None;
	}

	/// Chooses the IR-level expansion strategy for the atomicrmw \p AI.
	///
	/// Operands wider than the native register width (64 bits in 64-bit mode,
	/// otherwise 32) are expanded to a cmpxchg loop when needsCmpXchgNb() accepts
	/// the type and are left alone otherwise. For native-width operands the
	/// decision depends on the operation and, for or/and/xor, on whether the
	/// result of the atomicrmw is used.
	TargetLowering::AtomicExpansionKind
	X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
	  Type *ValTy = AI->getType();
	  const unsigned NativeBits = Subtarget.is64Bit() ? 64 : 32;

	  // Too wide for a single register: use cmpxchg8b/16b if we can, otherwise
	  // leave the operation untouched so it ends up as a library call.
	  if (ValTy->getPrimitiveSizeInBits() > NativeBits) {
	    if (needsCmpXchgNb(ValTy))
	      return AtomicExpansionKind::CmpXChg;
	    return AtomicExpansionKind::None;
	  }

	  switch (AI->getOperation()) {
	  case AtomicRMWInst::Xchg:
	  case AtomicRMWInst::Add:
	  case AtomicRMWInst::Sub:
	    // These are always better served by a dedicated locked instruction.
	    return AtomicExpansionKind::None;

	  case AtomicRMWInst::Or:
	  case AtomicRMWInst::And:
	  case AtomicRMWInst::Xor:
	    // With an unused result a plain "lock"-prefixed instruction is enough;
	    // a used result needs the cmpxchg loop.
	    if (AI->use_empty())
	      return AtomicExpansionKind::None;
	    return AtomicExpansionKind::CmpXChg;

	  case AtomicRMWInst::Nand:
	  case AtomicRMWInst::Max:
	  case AtomicRMWInst::Min:
	  case AtomicRMWInst::UMax:
	  case AtomicRMWInst::UMin:
	  case AtomicRMWInst::FAdd:
	  case AtomicRMWInst::FSub:
	    // No single x86 instruction performs these; always use a cmpxchg loop.
	    return AtomicExpansionKind::CmpXChg;

	  default:
	    llvm_unreachable("Unknown atomic operation");
	  }
	}

	/// Tries to replace the idempotent atomicrmw \p AI with an mfence followed by
	/// an atomic load of the same location.
	///
	/// \returns the new load (after \p AI has been RAUW'd and erased), or nullptr
	/// when the rewrite is not performed: the access is wider than the native
	/// width, it is the canonical unused `or 0` form handled in lowerAtomicArith,
	/// the sync scope is single-thread, or the subtarget has no mfence.
	LoadInst *
	X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
	  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
	  Type *MemType = AI->getType();
	  const unsigned SizeInBits = MemType->getPrimitiveSizeInBits();
	  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
	  // there is no benefit in turning such RMWs into loads, and it is actually
	  // harmful as it introduces a mfence.
	  if (SizeInBits > NativeWidth)
	    return nullptr;

	  // If this is a canonical idempotent atomicrmw w/no uses, we have a better
	  // lowering available in lowerAtomicArith.
	  // TODO: push more cases through this path.
	  if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
	    if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
	        AI->use_empty())
	      return nullptr;

	  IRBuilder<> Builder(AI);
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  auto SSID = AI->getSyncScopeID();
	  // We must restrict the ordering to avoid generating loads with Release or
	  // AcquireRelease orderings.
	  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());

	  // Before the load we need a fence. Here is an example lifted from
	  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
	  // is required:
	  // Thread 0:
	  //   x.store(1, relaxed);
	  //   r1 = y.fetch_add(0, release);
	  // Thread 1:
	  //   y.fetch_add(42, acquire);
	  //   r2 = x.load(relaxed);
	  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
	  // lowered to just a load without a fence. A mfence flushes the store buffer,
	  // making the optimization clearly correct.
	  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
	  // otherwise, we might be able to be more aggressive on relaxed idempotent
	  // rmw. In practice, they do not look useful, so we don't try to be
	  // especially clever.
	  if (SSID == SyncScope::SingleThread) {
	    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
	    // the IR level, so we must wrap it in an intrinsic.
	    return nullptr;
	  }

	  if (!Subtarget.hasMFence()) {
	    // FIXME: it might make sense to use a locked operation here but on a
	    // different cache-line to prevent cache-line bouncing. In practice it
	    // is probably a small win, and x86 processors without mfence are rare
	    // enough that we do not bother.
	    return nullptr;
	  }

	  Function *MFence =
	      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
	  Builder.CreateCall(MFence, {});

	  // Finally we can emit the atomic load. Align is expressed in bytes, so the
	  // natural alignment of the access is its size in bits divided by 8; passing
	  // the bit count would claim an alignment eight times too strict.
	  LoadInst *Loaded = Builder.CreateAlignedLoad(
	      MemType, AI->getPointerOperand(), Align(SizeInBits / 8));
	  Loaded->setAtomic(Order, SSID);
	  AI->replaceAllUsesWith(Loaded);
	  AI->eraseFromParent();
	  return Loaded;
	}

	/// Only unordered atomic stores may be lowered as plain store nodes, and
	/// only when the ExperimentalUnorderedISEL flag is set.
	bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
	  if (SI.isUnordered())
	    return ExperimentalUnorderedISEL;
	  return false;
	}
	/// Only unordered atomic loads may be lowered as plain load nodes, and only
	/// when the ExperimentalUnorderedISEL flag is set.
	bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
	  if (LI.isUnordered())
	    return ExperimentalUnorderedISEL;
	  return false;
	}


	/// Emits a LOCK-prefixed `or $0` against a stack slot. The memory contents
	/// are not changed; only the ordering effect of the locked instruction is
	/// wanted. The slot is picked so that it is (a) very likely touched by a
	/// single thread only, keeping cache traffic low, and (b) certainly
	/// dereferenceable.
	///
	/// \returns the chain result of the new machine node.
	static SDValue emitLockedStackOp(SelectionDAG &DAG,
	                                 const X86Subtarget &Subtarget,
	                                 SDValue Chain, SDLoc DL) {
	  // Design notes:
	  // 1) A LOCK prefix acts as a full read/write reordering barrier for the
	  //    memory operations of the issuing processor, so which address is
	  //    touched has no bearing on the ordering guarantee.
	  //    See: Intel 64 and IA-32 Architectures Software Developer's Manual,
	  //    8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
	  // 2) An immediate operand looks like the best encoding because no extra
	  //    register is needed.
	  // 3) OR seems to be marginally faster than ADD (possibly just measurement
	  //    noise).
	  // 4) The displacement is chosen with these points in mind:
	  //    a) Without a red zone we fall back to the top of stack. (A cache line
	  //       aligned stack object could improve this case.)
	  //    b) Staying slightly away from TOS lowers the chance of creating a
	  //       false dependence.
	  //    c) Frames are sometimes shared across threads, e.g. the idiomatic
	  //       MyThreadPool.run([&StackVars]() {...}) pattern, so the slot should
	  //       sit in a different cache line than the TOS frame.
	  //
	  // Background and benchmark results:
	  // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/

	  auto &MF = DAG.getMachineFunction();
	  auto &TFL = *Subtarget.getFrameLowering();
	  const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;

	  // The 64-bit and 32-bit forms differ only in the stack pointer register and
	  // the value type used for the base and index registers.
	  const bool Is64Bit = Subtarget.is64Bit();
	  const MVT PtrVT = Is64Bit ? MVT::i64 : MVT::i32;
	  const unsigned StackPtrReg = Is64Bit ? X86::RSP : X86::ESP;

	  SDValue ImmZero = DAG.getTargetConstant(0, DL, MVT::i32);
	  SDValue LockedOrOps[] = {
	      DAG.getRegister(StackPtrReg, PtrVT),           // Base
	      DAG.getTargetConstant(1, DL, MVT::i8),         // Scale
	      DAG.getRegister(0, PtrVT),                     // Index
	      DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
	      DAG.getRegister(0, MVT::i16),                  // Segment.
	      ImmZero,
	      Chain};
	  SDNode *LockedOr = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
	                                        MVT::Other, LockedOrOps);
	  // Result 0 is the i32 value; result 1 is the chain.
	  return SDValue(LockedOr, 1);
	}

	/// Lowers an atomic fence node. Operand 0 is the chain, operand 1 holds the
	/// AtomicOrdering and operand 2 the SyncScope::ID, both as constants.
	///
	/// A sequentially consistent, system-scope fence becomes MFENCE when the
	/// subtarget has it and a locked stack operation otherwise. Every other
	/// fence becomes X86ISD::MEMBARRIER.
	static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  const auto Ordering =
	      static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
	  const auto Scope = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));

	  const bool NeedsInstruction =
	      Ordering == AtomicOrdering::SequentiallyConsistent &&
	      Scope == SyncScope::System;

	  // Anything weaker than a cross-thread seq_cst fence only has to constrain
	  // the compiler. MEMBARRIER is a compiler barrier; it codegens to a no-op.
	  if (!NeedsInstruction)
	    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));

	  if (Subtarget.hasMFence())
	    return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

	  // No MFENCE available: fall back to a locked operation on the stack.
	  SDValue InChain = Op.getOperand(0);
	  return emitLockedStackOp(DAG, Subtarget, InChain, dl);
	}

	/// Lowers a compare-and-swap node to X86ISD::LCMPXCHG_DAG.
	///
	/// Operand 2 is copied into the accumulator register matching the value type
	/// (AL/AX/EAX/RAX), the locked cmpxchg node is built from operands 1 and 3
	/// plus the access size in bytes, and afterwards both the accumulator and
	/// EFLAGS are read back. The merged result is (accumulator value, COND_E
	/// setcc on EFLAGS, chain).
	static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  const MVT T = Op.getSimpleValueType();

	  // Pick the accumulator register and the access size for this type.
	  unsigned AccReg = 0;
	  unsigned SizeInBytes = 0;
	  switch (T.SimpleTy) {
	  case MVT::i8:
	    AccReg = X86::AL;
	    SizeInBytes = 1;
	    break;
	  case MVT::i16:
	    AccReg = X86::AX;
	    SizeInBytes = 2;
	    break;
	  case MVT::i32:
	    AccReg = X86::EAX;
	    SizeInBytes = 4;
	    break;
	  case MVT::i64:
	    assert(Subtarget.is64Bit() && "Node not type legal!");
	    AccReg = X86::RAX;
	    SizeInBytes = 8;
	    break;
	  default:
	    llvm_unreachable("Invalid value type!");
	  }

	  // Glue the copy into the accumulator to the cmpxchg node.
	  SDValue CopyIn = DAG.getCopyToReg(Op.getOperand(0), DL, AccReg,
	                                    Op.getOperand(2), SDValue());
	  SDValue CmpXchgOps[] = {CopyIn.getValue(0), Op.getOperand(1),
	                          Op.getOperand(3),
	                          DAG.getTargetConstant(SizeInBytes, DL, MVT::i8),
	                          CopyIn.getValue(1)};
	  SDVTList ChainAndGlue = DAG.getVTList(MVT::Other, MVT::Glue);
	  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
	  SDValue CmpXchg = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL,
	                                            ChainAndGlue, CmpXchgOps, T, MMO);

	  // Read back the accumulator, then EFLAGS, each glued to its predecessor.
	  SDValue CopyOut = DAG.getCopyFromReg(CmpXchg.getValue(0), DL, AccReg, T,
	                                       CmpXchg.getValue(1));
	  SDValue Flags = DAG.getCopyFromReg(CopyOut.getValue(1), DL, X86::EFLAGS,
	                                     MVT::i32, CopyOut.getValue(2));
	  SDValue Succeeded = getSETCC(X86::COND_E, Flags, DL, DAG);

	  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), CopyOut,
	                     Succeeded, Flags.getValue(1));
	}

	/// Builds a MOVMSK of the byte vector \p V, splitting the input where a
	/// single node is not used:
	///  - v64i8 is split in half, each half is handled recursively, and the two
	///    masks are combined into an i64 (high half shifted left by 32).
	///  - v32i8 without Int256 support is split into two MOVMSK nodes whose
	///    results are combined into an i32 (high half shifted left by 16).
	///  - everything else becomes a single i32 MOVMSK.
	static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  const MVT SrcVT = V.getSimpleValueType();

	  if (SrcVT == MVT::v64i8) {
	    const auto Halves = DAG.SplitVector(V, DL);
	    SDValue LoMask = getPMOVMSKB(DL, Halves.first, DAG, Subtarget);
	    SDValue HiMask = getPMOVMSKB(DL, Halves.second, DAG, Subtarget);
	    LoMask = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, LoMask);
	    // The bits above the high mask are shifted out or ignored, so an
	    // any-extend is sufficient here.
	    HiMask = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, HiMask);
	    HiMask = DAG.getNode(ISD::SHL, DL, MVT::i64, HiMask,
	                         DAG.getConstant(32, DL, MVT::i8));
	    return DAG.getNode(ISD::OR, DL, MVT::i64, LoMask, HiMask);
	  }

	  if (SrcVT == MVT::v32i8 && !Subtarget.hasInt256()) {
	    const auto Halves = DAG.SplitVector(V, DL);
	    SDValue LoMask = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Halves.first);
	    SDValue HiMask = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Halves.second);
	    HiMask = DAG.getNode(ISD::SHL, DL, MVT::i32, HiMask,
	                         DAG.getConstant(16, DL, MVT::i8));
	    return DAG.getNode(ISD::OR, DL, MVT::i32, LoMask, HiMask);
	  }

	  return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
	}

	/// Custom lowering for bitcasts that need help on x86.
	///
	/// Three groups of casts are handled:
	///  - i64 -> v64i1 (32-bit mode with BWI): split into two i32 halves, cast
	///    each to v32i1 and concatenate.
	///  - v16i1/v32i1 -> scalar integer (no AVX512): sign-extend to a byte vector
	///    and use MOVMSK.
	///  - v2i32/v4i16/v8i8/i64 -> f64 or x86mmx via a 128-bit vector.
	///
	/// \returns the lowered value, or an empty SDValue when the conversion should
	/// be expanded by the caller.
	static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  SDValue Src = Op.getOperand(0);
	  MVT SrcVT = Src.getSimpleValueType();
	  MVT DstVT = Op.getSimpleValueType();

	  // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
	  // half to v32i1 and concatenating the result.
	  if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
	    assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
	    assert(Subtarget.hasBWI() && "Expected BWI target");
	    SDLoc dl(Op);
	    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
	                             DAG.getIntPtrConstant(0, dl));
	    Lo = DAG.getBitcast(MVT::v32i1, Lo);
	    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
	                             DAG.getIntPtrConstant(1, dl));
	    Hi = DAG.getBitcast(MVT::v32i1, Hi);
	    return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
	  }

	  // Use MOVMSK for vector to scalar conversion to prevent scalarization.
	  if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
	    assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
	    MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
	    SDLoc DL(Op);
	    SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
	    V = getPMOVMSKB(DL, V, DAG, Subtarget);
	    return DAG.getZExtOrTrunc(V, DL, DstVT);
	  }

	  assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
	          SrcVT == MVT::i64) && "Unexpected VT!");

	  assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	  if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
	      !(DstVT == MVT::x86mmx && SrcVT.isVector()))
	    // This conversion needs to be expanded.
	    return SDValue();

	  SDLoc dl(Op);
	  if (SrcVT.isVector()) {
	    // Widen the input vector to 128 bits by doubling the element count and
	    // filling the upper half with undef.
	    // Example: from MVT::v2i32 to MVT::v4i32.
	    MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
	                                 SrcVT.getVectorNumElements() * 2);
	    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
	                      DAG.getUNDEF(SrcVT));
	  } else {
	    assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
	           "Unexpected source type in LowerBITCAST");
	    Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
	  }

	  // Reinterpret the 128-bit value as two 64-bit lanes of the right kind.
	  MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
	  Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);

	  if (DstVT == MVT::x86mmx)
	    return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);

	  // f64 destination: the result is lane 0.
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Compute the horizontal sum of bytes in V for the elements of VT.
	///
	/// Requires V to be a byte vector and VT to be an integer vector type with
	/// wider elements than V's type. The width of the elements of VT determines
	/// how many bytes of V are summed horizontally to produce each element of the
	/// result.
	///
	/// V and VT must have the same total size in bits (asserted below). The
	/// Subtarget parameter is not referenced in the body.
	///
	/// \returns a value of type VT holding the per-element byte sums.
	static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDLoc DL(V);
	MVT ByteVecVT = V.getSimpleValueType();
	MVT EltVT = VT.getVectorElementType();
	assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
	"Expected value to have byte element type.");
	assert(EltVT != MVT::i8 &&
	"Horizontal byte sum only makes sense for wider elements!");
	unsigned VecSize = VT.getSizeInBits();
	assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");

	// PSADBW instruction horizontally add all bytes and leave the result in i64
	// chunks, thus directly computes the pop count for v2i64 and v4i64.
	if (EltVT == MVT::i64) {
	// Summing absolute differences against zero is a plain sum of the bytes.
	SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
	MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
	V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
	return DAG.getBitcast(VT, V);
	}

	if (EltVT == MVT::i32) {
	// We unpack the low half and high half into i32s interleaved with zeros so
	// that we can use PSADBW to horizontally sum them. The most useful part of
	// this is that it lines up the results of two PSADBW instructions to be
	// two v2i64 vectors which concatenated are the 4 population counts. We can
	// then use PACKUSWB to shrink and concatenate them into a v4i32 again.
	SDValue Zeros = DAG.getConstant(0, DL, VT);
	SDValue V32 = DAG.getBitcast(VT, V);
	SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
	SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);

	// Do the horizontal sums into two v2i64s.
	// Zeros is rebuilt here with the byte vector type that PSADBW operates on.
	Zeros = DAG.getConstant(0, DL, ByteVecVT);
	MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
	Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
	DAG.getBitcast(ByteVecVT, Low), Zeros);
	High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
	DAG.getBitcast(ByteVecVT, High), Zeros);

	// Merge them together.
	MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
	V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
	DAG.getBitcast(ShortVecVT, Low),
	DAG.getBitcast(ShortVecVT, High));

	return DAG.getBitcast(VT, V);
	}

	// The only element type left is i16.
	assert(EltVT == MVT::i16 && "Unknown how to handle type");

	// To obtain pop count for each i16 element starting from the pop count for
	// i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
	// right by 8. It is important to shift as i16s as i8 vector shift isn't
	// directly supported.
	SDValue ShifterV = DAG.getConstant(8, DL, VT);
	// After the shift the low byte of each i16 sits in the high byte position,
	// so the byte-wise add below leaves (low + high) in the high byte.
	SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
	V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
	DAG.getBitcast(ByteVecVT, V));
	// The logical right shift moves that sum down and clears the high byte.
	return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
	}

	/// Lowers a vXi8 population count with two PSHUFB lookups into a 16-entry
	/// in-register table of nibble pop counts.
	///
	/// \param Op the byte vector whose per-byte pop count is wanted.
	/// \returns a vector of the same type holding the pop count of each byte.
	static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  int NumElts = VT.getVectorNumElements();
	  (void)EltVT;
	  assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");

	  // Implement a lookup table in register by using an algorithm based on:
	  // http://wm.ite.pl/articles/sse-popcount.html
	  //
	  // The general idea is that every lower byte nibble in the input vector is an
	  // index into a in-register pre-computed pop count table. We then split up the
	  // input vector in two new ones: (1) a vector with only the shifted-right
	  // higher nibbles for each byte and (2) a vector with the lower nibbles (and
	  // masked out higher ones) for each byte. PSHUFB is used separately with both
	  // to index the in-register table. Next, both are added and the result is a
	  // i8 vector where each element contains the pop count for input byte.
	  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
	                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
	                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
	                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};

	  // Repeat the 16-entry table across the whole vector width.
	  SmallVector<SDValue, 64> LUTVec;
	  for (int i = 0; i < NumElts; ++i)
	    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
	  SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
	  SDValue M0F = DAG.getConstant(0x0F, DL, VT);

	  // High nibbles
	  SDValue FourV = DAG.getConstant(4, DL, VT);
	  SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);

	  // Low nibbles
	  SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);

	  // The input vector is used as the shuffle mask that index elements into the
	  // LUT. After counting low and high nibbles, add the vector to obtain the
	  // final pop count per i8 element.
	  SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
	  SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
	  return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
	}

	// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
	// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
	//
	/// Lowers a 128/256/512-bit vector population count.
	///
	/// Strategy, in order:
	///  1. With VPOPCNTDQ, small element counts are zero-extended to i32
	///     elements, counted there and truncated back.
	///  2. 256-bit vectors without Int256 and 512-bit vectors without BWI are
	///     split in half.
	///  3. Element types wider than i8 are counted per byte and summed with
	///     LowerHorizontalByteSum.
	///  4. vXi8 uses the in-register LUT when SSSE3 is available.
	///
	/// \returns the lowered value, or an empty SDValue to fall back on
	/// LegalizeDAG.
	static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
	         "Unknown CTPOP type to handle");
	  SDLoc DL(Op.getNode());
	  SDValue Op0 = Op.getOperand(0);

	  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
	  if (Subtarget.hasVPOPCNTDQ()) {
	    unsigned NumElems = VT.getVectorNumElements();
	    assert((VT.getVectorElementType() == MVT::i8 ||
	            VT.getVectorElementType() == MVT::i16) && "Unexpected type");
	    if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
	      MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
	      Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
	      Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
	      return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
	    }
	  }

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return splitVectorIntUnary(Op, DAG);

	  // Decompose 512-bit ops into smaller 256-bit ops.
	  if (VT.is512BitVector() && !Subtarget.hasBWI())
	    return splitVectorIntUnary(Op, DAG);

	  // For element types greater than i8, do vXi8 pop counts and a bytesum.
	  if (VT.getScalarType() != MVT::i8) {
	    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
	    SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
	    SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
	    return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
	  }

	  // We can't use the fast LUT approach, so fall back on LegalizeDAG.
	  if (!Subtarget.hasSSSE3())
	    return SDValue();

	  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
	}

	/// Lowers a population-count node by forwarding it to LowerVectorCTPOP.
	/// Only vector value types are expected here; a scalar type trips the assert.
	static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(Op.getSimpleValueType().isVector() &&
	"We only do custom lowering for vector population count.");
	return LowerVectorCTPOP(Op, Subtarget, DAG);
	}

	/// Lowers BITREVERSE with the XOP VPPERM instruction.
	///
	/// Scalars are moved into a 128-bit vector, reversed there and extracted
	/// again. 256-bit vectors are split in half. 128-bit vectors use a single
	/// VPPERM whose mask both reverses the bits of every byte and reverses the
	/// byte order within each element.
	static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  SDLoc DL(Op);

	  // For scalars, its still beneficial to transfer to/from the SIMD unit to
	  // perform the BITREVERSE.
	  if (!VT.isVector()) {
	    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
	    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
	    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  int NumElts = VT.getVectorNumElements();
	  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector())
	    return splitVectorIntUnary(Op, DAG);

	  assert(VT.is128BitVector() &&
	         "Only 128-bit vector bitreverse lowering supported.");

	  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
	  // perform the BSWAP in the shuffle.
	  // Its best to shuffle using the second operand as this will implicitly allow
	  // memory folding for multiple vectors.
	  SmallVector<SDValue, 16> MaskElts;
	  for (int i = 0; i != NumElts; ++i) {
	    // Walk the bytes of element i from most to least significant so that the
	    // byte order inside the element is reversed as well.
	    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
	      // Byte indices 16..31 select from the second source operand.
	      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
	      int PermuteByte = SourceByte | (2 << 5);
	      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
	    }
	  }

	  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
	  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
	  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
	                    Res, Mask);
	  return DAG.getBitcast(VT, Res);
	}

	/// Lowers BITREVERSE.
	///
	/// XOP targets use LowerBITREVERSE_XOP for anything that is not a 512-bit
	/// vector. Otherwise (SSSE3 required) byte vectors are reversed with two
	/// PSHUFB nibble lookups; v64i8 without BWI and 256-bit vectors without
	/// Int256 are split in half first.
	static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  if (Subtarget.hasXOP() && !VT.is512BitVector())
	    return LowerBITREVERSE_XOP(Op, DAG);

	  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

	  SDValue In = Op.getOperand(0);
	  SDLoc DL(Op);

	  // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
	  if (VT == MVT::v64i8 && !Subtarget.hasBWI())
	    return splitVectorIntUnary(Op, DAG);

	  unsigned NumElts = VT.getVectorNumElements();
	  assert(VT.getScalarType() == MVT::i8 &&
	         "Only byte vector BITREVERSE supported");

	  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return splitVectorIntUnary(Op, DAG);

	  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
	  // two nibbles and a PSHUFB lookup to find the bitreverse of each
	  // 0-15 value (moved to the other nibble).
	  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
	  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
	  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

	  // LoLUT[n] is the bit-reversed low nibble n, placed in the high nibble.
	  const int LoLUT[16] = {
	      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
	      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
	      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
	      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
	  // HiLUT[n] is the bit-reversed high nibble n, placed in the low nibble.
	  const int HiLUT[16] = {
	      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
	      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
	      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
	      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};

	  // Repeat both 16-entry tables across the whole vector width.
	  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
	  for (unsigned i = 0; i < NumElts; ++i) {
	    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
	    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
	  }

	  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
	  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
	  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
	  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
	  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
	}

	/// Rewrites an ATOMIC_LOAD_{ADD,SUB,OR,XOR,AND} node as the matching
	/// X86ISD::L{ADD,SUB,OR,XOR,AND} memory intrinsic node.
	///
	/// The new node reuses the first three operands and the memory operand of
	/// \p N and produces (i32, chain). Any other opcode is unreachable.
	static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  unsigned NewOpc = 0;
	  switch (N->getOpcode()) {
	  case ISD::ATOMIC_LOAD_ADD:
	    NewOpc = X86ISD::LADD;
	    break;
	  case ISD::ATOMIC_LOAD_SUB:
	    NewOpc = X86ISD::LSUB;
	    break;
	  case ISD::ATOMIC_LOAD_OR:
	    NewOpc = X86ISD::LOR;
	    break;
	  case ISD::ATOMIC_LOAD_XOR:
	    NewOpc = X86ISD::LXOR;
	    break;
	  case ISD::ATOMIC_LOAD_AND:
	    NewOpc = X86ISD::LAND;
	    break;
	  default:
	    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
	  }

	  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();

	  return DAG.getMemIntrinsicNode(
	      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
	      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
	      /*MemVT=*/N->getSimpleValueType(0), MMO);
	}

	/// Lowers atomic_load_<op> nodes to LOCK-prefixed operations.
	///
	/// When result 0 is used, only add survives as-is; sub is rewritten as an add
	/// of the negated operand, and every other opcode is expected to have been
	/// expanded earlier. When result 0 is unused, the node is replaced by
	/// MERGE_VALUES(undef, new chain), where the chain comes from a barrier (for
	/// the idempotent `or 0` form) or from the LOCK-prefixed operation.
	static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
	                                const X86Subtarget &Subtarget) {
	  AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
	  SDLoc DL(N);
	  const unsigned Opc = N->getOpcode();
	  const MVT VT = N->getSimpleValueType(0);
	  SDValue Chain = N->getOperand(0);
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);

	  // atomic_load_add can become LXADD. Any other atomicrmw op can only be
	  // lowered here when its result is unused; used ones should already have
	  // been turned into a cmpxchg loop in AtomicExpand.
	  if (N->hasAnyUseOfValue(0)) {
	    if (Opc != ISD::ATOMIC_LOAD_SUB) {
	      assert(Opc == ISD::ATOMIC_LOAD_ADD &&
	             "Used AtomicRMW ops other than Add should have been expanded!");
	      return N;
	    }
	    // Treat (atomic_load_sub p, v) as (atomic_load_add p, -v) so LXADD can be
	    // selected when LOCK_SUB cannot.
	    SDValue Negated =
	        DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
	    return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, Negated,
	                         AN->getMemOperand());
	  }

	  SDValue NewChain;
	  if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
	    // Canonical idempotent atomicrmw: memory does not change, so only the
	    // ordering effect has to be lowered, and we are free to pick a cheaper
	    // operation and location.
	    //
	    // On X86 the only ordering that needs an instruction is a seq_cst that is
	    // not SingleThread; everything else merely has to be preserved during
	    // codegen and then dropped. We expect (but don't assume) that orderings
	    // other than seq_cst and acq_rel were canonicalized to a store or load.
	    const bool NeedsInstruction =
	        AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
	        AN->getSyncScopeID() == SyncScope::System;
	    if (NeedsInstruction) {
	      // A locked operation on the stack keeps cache traffic low, assuming
	      // stack locations are mostly touched by the owning thread only.
	      NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
	    } else {
	      // MEMBARRIER is a compiler barrier; it codegens to a no-op.
	      NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
	    }
	  } else {
	    // Only the chain of the LOCK-prefixed op matters; its value is unused.
	    NewChain = lowerAtomicArithWithLOCK(N, DAG, Subtarget).getValue(1);
	  }

	  assert(!N->hasAnyUseOfValue(0));
	  // NOTE: The getUNDEF is needed to give something for the unused result 0.
	  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), DAG.getUNDEF(VT),
	                     NewChain);
	}

	/// Lowers an atomic store node.
	///
	///  - A non-seq_cst store of a legal type is returned unchanged.
	///  - An illegal i64 store is done through an SSE register
	///    (VEXTRACT_STORE) or the x87 stack (FILD + FIST) when floating point
	///    code may be used implicitly; a seq_cst store additionally gets a
	///    locked stack operation as barrier.
	///  - Everything else becomes an ATOMIC_SWAP whose chain is returned.
	static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
	                                 const X86Subtarget &Subtarget) {
	  auto *Node = cast<AtomicSDNode>(Op.getNode());
	  SDLoc dl(Node);
	  EVT VT = Node->getMemoryVT();

	  bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
	  bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);

	  // If this store is not sequentially consistent and the type is legal
	  // we can just keep it.
	  if (!IsSeqCst && IsTypeLegal)
	    return Op;

	  if (VT == MVT::i64 && !IsTypeLegal) {
	    // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
	    // is enabled.
	    bool NoImplicitFloatOps =
	        DAG.getMachineFunction().getFunction().hasFnAttribute(
	            Attribute::NoImplicitFloat);
	    if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
	      // Stays empty when neither SSE1 nor x87 is available.
	      SDValue Chain;
	      if (Subtarget.hasSSE1()) {
	        SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
	                                       Node->getOperand(2));
	        MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
	        SclToVec = DAG.getBitcast(StVT, SclToVec);
	        SDVTList Tys = DAG.getVTList(MVT::Other);
	        SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
	        Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
	                                        MVT::i64, Node->getMemOperand());
	      } else if (Subtarget.hasX87()) {
	        // First load this into an 80-bit X87 register using a stack temporary.
	        // This will put the whole integer into the significand.
	        SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
	        int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
	        MachinePointerInfo MPI =
	            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
	        Chain =
	            DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
	                         MPI, /*Align*/ 0, MachineMemOperand::MOStore);
	        SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
	        SDValue LdOps[] = {Chain, StackPtr};
	        SDValue Value =
	            DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
	                                    /*Align*/ None, MachineMemOperand::MOLoad);
	        Chain = Value.getValue(1);

	        // Now use an FIST to do the atomic store.
	        SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
	        Chain =
	            DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
	                                    StoreOps, MVT::i64, Node->getMemOperand());
	      }

	      if (Chain) {
	        // If this is a sequentially consistent store, also emit an appropriate
	        // barrier.
	        if (IsSeqCst)
	          Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);

	        return Chain;
	      }
	    }
	  }

	  // Convert seq_cst store -> xchg
	  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
	  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
	  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
	                               Node->getMemoryVT(),
	                               Node->getOperand(0),
	                               Node->getOperand(1), Node->getOperand(2),
	                               Node->getMemOperand());
	  // Only the chain of the swap is needed; the swapped-out value is dropped.
	  return Swap.getValue(1);
	}

	/// Lower ISD::ADDCARRY / ISD::SUBCARRY to X86ISD::ADC / X86ISD::SBB.
	///
	/// Returns a MERGE_VALUES of the arithmetic result and the outgoing carry, or
	/// a null SDValue when the result type is not legal yet (the legalizer then
	/// expands the node itself).
	static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
	  SDNode *Node = Op.getNode();
	  MVT ResultVT = Node->getSimpleValueType(0);

	  // Illegal result types are left for the legalizer to expand.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isTypeLegal(ResultVT))
	    return SDValue();

	  SDVTList ResultAndFlags = DAG.getVTList(ResultVT, MVT::i32);
	  SDLoc DL(Node);

	  // Materialise the incoming carry in the flags result: adding all-ones to
	  // the carry value produces a carry-out exactly when that value is nonzero.
	  SDValue CarryIn = Op.getOperand(2);
	  EVT CarryVT = CarryIn.getValueType();
	  SDValue FlagSetter =
	      DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), CarryIn,
	                  DAG.getAllOnesConstant(DL, CarryVT));

	  // Pick the flag-consuming opcode matching the generic node.
	  unsigned ArithOpc = X86ISD::SBB;
	  if (Op.getOpcode() == ISD::ADDCARRY)
	    ArithOpc = X86ISD::ADC;

	  SDValue Sum = DAG.getNode(ArithOpc, DL, ResultAndFlags, Op.getOperand(0),
	                            Op.getOperand(1), FlagSetter.getValue(1));

	  // Turn the resulting flags back into a carry value of the expected type.
	  SDValue CarryOut = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
	  if (Node->getValueType(1) == MVT::i1)
	    CarryOut = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CarryOut);

	  return DAG.getNode(ISD::MERGE_VALUES, DL, Node->getVTList(), Sum, CarryOut);
	}

	/// Lower ISD::FSINCOS to a single call of the SINCOS_STRET libcall.
	///
	/// Only valid on 64-bit Darwin (asserted). For f64 the call result is
	/// returned directly; for f32 the two values are extracted from elements 0
	/// and 1 of the returned vector and merged into a two-result node.
	static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

	  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
	  // which returns the values as { float, float } (in XMM0) or
	  // { double, double } (which is returned in XMM0, XMM1).
	  SDLoc dl(Op);
	  SDValue Arg = Op.getOperand(0);
	  EVT ArgVT = Arg.getValueType();
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;

	  Entry.Node = Arg;
	  Entry.Ty = ArgTy;
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  bool isF64 = ArgVT == MVT::f64;
	  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
	  // the small struct {f32, f32} is returned in (eax, edx). For f64,
	  // the results are returned via SRet in memory.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
	  const char *LibcallName = TLI.getLibcallName(LC);
	  SDValue Callee =
	      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

	  // f64 returns a two-element struct; f32 is modelled as a 4-element vector
	  // whose first two elements hold the results.
	  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
	                      : (Type *)FixedVectorType::get(ArgTy, 4);

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(DAG.getEntryNode())
	      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));

	  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

	  if (isF64)
	    // Returned in xmm0 and xmm1.
	    return CallResult.first;

	  // Returned in bits 0:31 and 32:63 of xmm0.
	  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
	                               CallResult.first, DAG.getIntPtrConstant(0, dl));
	  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
	                               CallResult.first, DAG.getIntPtrConstant(1, dl));
	  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
	  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
	}

	/// Widen the vector \p InOp to the wider vector type \p NVT.
	///
	/// The input must have the same element type as \p NVT, and the element
	/// count of \p NVT must be a larger multiple of the input's (both asserted).
	///
	/// \param InOp            vector value to widen.
	/// \param NVT             destination vector type.
	/// \param DAG             DAG used to create the new nodes.
	/// \param FillWithZeroes  when true the added elements are zero, otherwise
	///                        they are undef.
	/// \returns \p InOp unchanged if it already has type \p NVT, an undef of
	///          \p NVT if \p InOp is undef, otherwise a BUILD_VECTOR or
	///          INSERT_SUBVECTOR producing the widened value.
	static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
	                            bool FillWithZeroes = false) {
	  // Check if InOp already has the right width.
	  MVT InVT = InOp.getSimpleValueType();
	  if (InVT == NVT)
	    return InOp;

	  if (InOp.isUndef())
	    return DAG.getUNDEF(NVT);

	  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
	         "input and widen element type must match");

	  unsigned InNumElts = InVT.getVectorNumElements();
	  unsigned WidenNumElts = NVT.getVectorNumElements();
	  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
	         "Unexpected request for vector widening");

	  SDLoc dl(InOp);

	  // If the input is a two-operand concat whose upper half is already the
	  // requested filler (undef, or zero when zero-filling), widen the lower
	  // half directly instead.
	  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
	      InOp.getNumOperands() == 2) {
	    SDValue N1 = InOp.getOperand(1);
	    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
	        N1.isUndef()) {
	      InOp = InOp.getOperand(0);
	      InVT = InOp.getSimpleValueType();
	      InNumElts = InVT.getVectorNumElements();
	    }
	  }

	  // Constant build vectors are rebuilt at the wider type with the filler
	  // appended element by element.
	  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
	      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
	    SmallVector<SDValue, 16> Ops;
	    for (unsigned i = 0; i < InNumElts; ++i)
	      Ops.push_back(InOp.getOperand(i));

	    EVT EltVT = InOp.getOperand(0).getValueType();

	    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
	                                       DAG.getUNDEF(EltVT);
	    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
	      Ops.push_back(FillVal);
	    return DAG.getBuildVector(NVT, dl, Ops);
	  }

	  // General case: insert the input at element 0 of a filler vector.
	  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
	                                     DAG.getUNDEF(NVT);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
	                     InOp, DAG.getIntPtrConstant(0, dl));
	}

	/// Lower ISD::MSCATTER to X86ISD::MSCATTER.
	///
	/// Requires AVX-512 (asserted). Returns a null SDValue for the cases that
	/// are left to the default handling: v2f32/v2i32 data without a v2i64 index
	/// plus VLX, and any v2i32 index. Without VLX, data, index and mask are
	/// widened until one of data/index is 512 bits wide.
	static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX512() &&
	         "MGATHER/MSCATTER are supported on AVX-512 arch only");

	  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
	  SDValue Src = N->getValue();
	  MVT VT = Src.getSimpleValueType();
	  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
	  SDLoc dl(Op);

	  SDValue Scale = N->getScale();
	  SDValue Index = N->getIndex();
	  SDValue Mask = N->getMask();
	  SDValue Chain = N->getChain();
	  SDValue BasePtr = N->getBasePtr();

	  if (VT == MVT::v2f32 || VT == MVT::v2i32) {
	    assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
	    // If the index is v2i64 and we have VLX we can use xmm for data and index.
	    if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
	      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	      EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
	      // Pad the data with an undef upper half to reach the wider type.
	      Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
	      SDVTList VTs = DAG.getVTList(MVT::Other);
	      SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
	      return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
	                                     N->getMemoryVT(), N->getMemOperand());
	    }
	    return SDValue();
	  }

	  MVT IndexVT = Index.getSimpleValueType();

	  // If the index is v2i32, we're being called by type legalization and we
	  // should just let the default handling take care of it.
	  if (IndexVT == MVT::v2i32)
	    return SDValue();

	  // If we don't have VLX and neither the data nor the index is 512-bits, we
	  // need to widen until one is.
	  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
	      !IndexVT.is512BitVector()) {
	    // Determine how much we need to widen by to get a 512-bit type.
	    unsigned Factor = std::min(512/VT.getSizeInBits(),
	                               512/IndexVT.getSizeInBits());
	    unsigned NumElts = VT.getVectorNumElements() * Factor;

	    VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
	    IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
	    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);

	    // Data and index are padded with undef; the mask is padded with zeroes.
	    Src = ExtendToType(Src, VT, DAG);
	    Index = ExtendToType(Index, IndexVT, DAG);
	    Mask = ExtendToType(Mask, MaskVT, DAG, true);
	  }

	  SDVTList VTs = DAG.getVTList(MVT::Other);
	  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
	  return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
	                                 N->getMemoryVT(), N->getMemOperand());
	}

	/// Custom-lower ISD::MLOAD.
	///
	/// Two cases are handled:
	///  * Masks whose element type is not i1: the node is returned unchanged if
	///    the pass-through is undef or all zeroes; otherwise it is rewritten as
	///    a zero pass-through masked load followed by a VSELECT against the
	///    original pass-through.
	///  * i1 masks on AVX-512 without VLX and a non-512-bit type (asserted):
	///    pass-through and mask are widened to 512 bits, the load is issued at
	///    the wide type, and the original-width result is extracted.
	///
	/// \returns the lowered node; the rewritten forms are a merge of the loaded
	///          value and the new load's chain.
	static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
	                          SelectionDAG &DAG) {

	  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
	  MVT VT = Op.getSimpleValueType();
	  MVT ScalarVT = VT.getScalarType();
	  SDValue Mask = N->getMask();
	  MVT MaskVT = Mask.getSimpleValueType();
	  SDValue PassThru = N->getPassThru();
	  SDLoc dl(Op);

	  // Handle AVX masked loads which don't support passthru other than 0.
	  if (MaskVT.getVectorElementType() != MVT::i1) {
	    // We also allow undef in the isel pattern.
	    if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
	      return Op;

	    SDValue NewLoad = DAG.getMaskedLoad(
	        VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
	        getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
	        N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
	        N->isExpandingLoad());
	    // Emit a blend.
	    SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
	    return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
	  }

	  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
	         "Expanding masked load is supported on AVX-512 target only!");

	  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
	         "Expanding masked load is supported for 32 and 64-bit types only!");

	  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
	         "Cannot lower masked load op.");

	  assert((ScalarVT.getSizeInBits() >= 32 ||
	          (Subtarget.hasBWI() &&
	           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
	         "Unsupported masked load op.");

	  // This operation is legal for targets with VLX, but without
	  // VLX the vector should be widened to 512 bit
	  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
	  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
	  PassThru = ExtendToType(PassThru, WideDataVT, DAG);

	  // Mask element has to be i1.
	  assert(MaskVT.getScalarType() == MVT::i1 && "Unexpected mask type");

	  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);

	  // The added mask elements are zero.
	  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
	  SDValue NewLoad = DAG.getMaskedLoad(
	      WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
	      PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
	      N->getExtensionType(), N->isExpandingLoad());

	  // Narrow the wide result back to the type the caller asked for.
	  SDValue Extract =
	      DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
	                  DAG.getIntPtrConstant(0, dl));
	  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
	  return DAG.getMergeValues(RetOps, dl);
	}

	/// Custom-lower ISD::MSTORE for AVX-512 targets without VLX.
	///
	/// The stored value and the i1 mask are widened to 512 bits (data padded
	/// with undef, mask padded with zeroes) and the masked store is re-issued at
	/// the wide type with the original memory type, memory operand, addressing
	/// mode and truncating/compressing flags.
	static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
	  SDValue DataToStore = N->getValue();
	  MVT VT = DataToStore.getSimpleValueType();
	  MVT ScalarVT = VT.getScalarType();
	  SDValue Mask = N->getMask();
	  SDLoc dl(Op);

	  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
	         "Compressing masked store is supported on AVX-512 target only!");

	  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
	         "Compressing masked store is supported for 32 and 64-bit types only!");

	  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
	         "Cannot lower masked store op.");

	  assert((ScalarVT.getSizeInBits() >= 32 ||
	          (Subtarget.hasBWI() &&
	           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
	         "Unsupported masked store op.");

	  // This operation is legal for targets with VLX, but without
	  // VLX the vector should be widened to 512 bit
	  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
	  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

	  // Mask element has to be i1.
	  assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
	         "Unexpected mask type");

	  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);

	  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
	  // The added mask elements are zero.
	  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
	  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
	                            N->getOffset(), Mask, N->getMemoryVT(),
	                            N->getMemOperand(), N->getAddressingMode(),
	                            N->isTruncatingStore(), N->isCompressingStore());
	}

	/// Lower ISD::MGATHER to X86ISD::MGATHER.
	///
	/// Requires AVX2 (asserted). A v2i32 index yields a null SDValue. With
	/// AVX-512 but no VLX, pass-through, index and mask are widened until one of
	/// data/index is 512 bits wide; the result is always extracted back to the
	/// original type and merged with the new node's chain.
	static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX2() &&
	         "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");

	  MaskedGatherSDNode *Gather = cast<MaskedGatherSDNode>(Op.getNode());
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDValue Index = Gather->getIndex();
	  SDValue Mask = Gather->getMask();
	  SDValue PassThru = Gather->getPassThru();
	  MVT IndexVT = Index.getSimpleValueType();

	  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

	  // A v2i32 index means the request came from type legalization; decline.
	  if (IndexVT == MVT::v2i32)
	    return SDValue();

	  // Remember the requested result type before any widening below.
	  const MVT ResultVT = VT;

	  // Without VLX, at least one of data/index has to be 512 bits wide.
	  const bool MustWiden = Subtarget.hasAVX512() && !Subtarget.hasVLX() &&
	                         !VT.is512BitVector() && !IndexVT.is512BitVector();
	  if (MustWiden) {
	    // Scale by the smaller of the two ratios so that neither type grows
	    // past 512 bits.
	    unsigned Factor = std::min(512/VT.getSizeInBits(),
	                               512/IndexVT.getSizeInBits());
	    unsigned WideNumElts = VT.getVectorNumElements() * Factor;

	    VT = MVT::getVectorVT(VT.getVectorElementType(), WideNumElts);
	    IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), WideNumElts);
	    MVT WideMaskVT = MVT::getVectorVT(MVT::i1, WideNumElts);

	    // Pass-through and index get undef padding, the mask gets zero padding.
	    PassThru = ExtendToType(PassThru, VT, DAG);
	    Index = ExtendToType(Index, IndexVT, DAG);
	    Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
	  }

	  SDValue GatherOps[] = { Gather->getChain(), PassThru, Mask,
	                          Gather->getBasePtr(), Index, Gather->getScale() };
	  SDVTList GatherVTs = DAG.getVTList(VT, MVT::Other);
	  SDValue WideGather = DAG.getMemIntrinsicNode(
	      X86ISD::MGATHER, dl, GatherVTs, GatherOps, Gather->getMemoryVT(),
	      Gather->getMemOperand());

	  // Take the low subvector of the (possibly widened) gather as the result.
	  SDValue Narrowed = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT,
	                                 WideGather, DAG.getIntPtrConstant(0, dl));
	  return DAG.getMergeValues({Narrowed, WideGather.getValue(1)}, dl);
	}

	/// Lower ISD::ADDRSPACECAST to an integer extension or truncation.
	///
	/// An i64 destination is produced by zero-extending sources in
	/// X86AS::PTR32_UPTR and sign-extending every other source; an i32
	/// destination is produced by truncation. Any other destination type is a
	/// fatal error.
	static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  SDValue Src = Op.getOperand(0);
	  MVT DstVT = Op.getSimpleValueType();

	  AddrSpaceCastSDNode *Cast = cast<AddrSpaceCastSDNode>(Op.getNode());
	  unsigned SrcAS = Cast->getSrcAddressSpace();

	  assert(SrcAS != Cast->getDestAddressSpace() &&
	         "addrspacecast must be between different address spaces");

	  // Widening: the source address space selects the kind of extension.
	  if (DstVT == MVT::i64) {
	    if (SrcAS == X86AS::PTR32_UPTR)
	      return DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
	    return DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
	  }

	  // Narrowing.
	  if (DstVT == MVT::i32)
	    return DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);

	  report_fatal_error("Bad address space in addrspacecast");
	  return Op;
	}

	/// Lower GC_TRANSITION_START / GC_TRANSITION_END to an X86::NOOP machine
	/// node.
	///
	/// The new node takes the original chain (operand 0) and, when the original
	/// node has a glued node, its last operand. It produces (Other, Glue).
	SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  // TODO: Eventually, the lowering of these nodes should be informed by or
	  // deferred to the GC strategy for the function in which they appear. For
	  // now, however, they must be lowered to something. Since they are logically
	  // no-ops in the case of a null GC strategy (or a GC strategy which does not
	  // require special handling for these nodes), lower them as literal NOOPs for
	  // the time being.
	  SmallVector<SDValue, 2> Ops;

	  Ops.push_back(Op.getOperand(0));
	  if (Op->getGluedNode())
	    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

	  // Reuse a single source location instead of building a second temporary.
	  SDLoc OpDL(Op);
	  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);

	  return NOOP;
	}

	/// Lower an f128 operation to the runtime library call \p Call.
	///
	/// For strict FP opcodes operand 0 is passed on as the chain and the result
	/// is a merge of the call's value and output chain; otherwise only the
	/// call's value is returned.
	SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
	                                         RTLIB::Libcall Call) const {

	  const bool Strict = Op->isStrictFPOpcode();

	  // Strict nodes carry their chain first; skip it when collecting the call
	  // arguments.
	  SmallVector<SDValue, 2> CallArgs(Op->op_begin() + (Strict ? 1 : 0),
	                                   Op->op_end());

	  SDLoc dl(Op);
	  SDValue InChain;
	  if (Strict)
	    InChain = Op.getOperand(0);

	  MakeLibCallOptions Options;
	  std::pair<SDValue, SDValue> CallResult =
	      makeLibCall(DAG, Call, MVT::f128, CallArgs, Options, dl, InChain);

	  if (!Strict)
	    return CallResult.first;

	  return DAG.getMergeValues({ CallResult.first, CallResult.second }, dl);
	}

	// Custom-split a wide X86ISD::CVTPS2PH: convert the two halves of operand 0
	// separately, forwarding operand 1 unchanged to both, then concatenate the
	// half-width results into the original result type.
	static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  EVT FullVT = Op.getValueType();

	  // Split the source operand and work out the matching result half types.
	  std::pair<SDValue, SDValue> SrcHalves =
	      DAG.SplitVectorOperand(Op.getNode(), 0);
	  std::pair<EVT, EVT> HalfVTs = DAG.GetSplitDestVTs(FullVT);

	  SDValue SecondOp = Op.getOperand(1);
	  SDValue LoCvt = DAG.getNode(X86ISD::CVTPS2PH, dl, HalfVTs.first,
	                              SrcHalves.first, SecondOp);
	  SDValue HiCvt = DAG.getNode(X86ISD::CVTPS2PH, dl, HalfVTs.second,
	                              SrcHalves.second, SecondOp);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, FullVT, LoCvt, HiCvt);
	}

	/// Provide custom lowering hooks for some operations.
	///
	/// Dispatches on the opcode of \p Op to the matching lowering routine and
	/// returns whatever that routine returns. Some routines in this file return
	/// a null SDValue for cases they do not handle (for example LowerMSCATTER
	/// with a v2i32 index). An opcode without a case below is treated as a
	/// programming error via llvm_unreachable.
	SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
	switch (Op.getOpcode()) {
	default: llvm_unreachable("Should not custom lower this!");
	case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
	return LowerCMP_SWAP(Op, Subtarget, DAG);
	case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
	// All atomic read-modify-write arithmetic shares one handler.
	case ISD::ATOMIC_LOAD_ADD:
	case ISD::ATOMIC_LOAD_SUB:
	case ISD::ATOMIC_LOAD_OR:
	case ISD::ATOMIC_LOAD_XOR:
	case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
	case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
	case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
	case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
	case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
	case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
	case ISD::VSELECT: return LowerVSELECT(Op, DAG);
	case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
	case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
	case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
	case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
	case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
	case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
	case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
	case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
	case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
	case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
	case ISD::SHL_PARTS:
	case ISD::SRA_PARTS:
	case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
	case ISD::FSHL:
	case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
	// Strict FP variants share the handler of their non-strict counterparts.
	case ISD::STRICT_SINT_TO_FP:
	case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
	case ISD::STRICT_UINT_TO_FP:
	case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
	case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
	case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
	case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
	case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
	case ISD::ZERO_EXTEND_VECTOR_INREG:
	case ISD::SIGN_EXTEND_VECTOR_INREG:
	return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
	// Signed and unsigned FP-to-integer conversions use a single handler.
	case ISD::FP_TO_SINT:
	case ISD::STRICT_FP_TO_SINT:
	case ISD::FP_TO_UINT:
	case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
	case ISD::FP_EXTEND:
	case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
	case ISD::FP_ROUND:
	case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
	case ISD::FP16_TO_FP:
	case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
	case ISD::FP_TO_FP16:
	case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
	case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
	case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
	case ISD::FADD:
	case ISD::FSUB: return lowerFaddFsub(Op, DAG);
	case ISD::FROUND: return LowerFROUND(Op, DAG);
	case ISD::FABS:
	case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
	case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
	case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
	case ISD::LRINT:
	case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
	case ISD::SETCC:
	case ISD::STRICT_FSETCC:
	case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
	case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
	case ISD::SELECT: return LowerSELECT(Op, DAG);
	case ISD::BRCOND: return LowerBRCOND(Op, DAG);
	case ISD::JumpTable: return LowerJumpTable(Op, DAG);
	case ISD::VASTART: return LowerVASTART(Op, DAG);
	case ISD::VAARG: return LowerVAARG(Op, DAG);
	case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
	case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
	case ISD::INTRINSIC_VOID:
	case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
	case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
	case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
	case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
	case ISD::FRAME_TO_ARGS_OFFSET:
	return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
	case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
	case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
	case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
	case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
	case ISD::EH_SJLJ_SETUP_DISPATCH:
	return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
	case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
	case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
	case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
	// The *_ZERO_UNDEF forms go through the same handler as the plain forms.
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
	case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
	case ISD::MULHS:
	case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
	case ISD::ROTL:
	case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
	case ISD::SRA:
	case ISD::SRL:
	case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
	// Arithmetic-with-overflow nodes.
	case ISD::SADDO:
	case ISD::UADDO:
	case ISD::SSUBO:
	case ISD::USUBO:
	case ISD::SMULO:
	case ISD::UMULO: return LowerXALUO(Op, DAG);
	case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
	case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
	case ISD::ADDCARRY:
	case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
	case ISD::ADD:
	case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
	case ISD::UADDSAT:
	case ISD::SADDSAT:
	case ISD::USUBSAT:
	case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
	case ISD::SMAX:
	case ISD::SMIN:
	case ISD::UMAX:
	case ISD::UMIN: return LowerMINMAX(Op, DAG);
	case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
	case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
	// Masked memory operations.
	case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
	case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
	case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
	case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
	case ISD::GC_TRANSITION_START:
	case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
	case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
	// The only target-specific (X86ISD) opcode dispatched from this switch.
	case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
	}
	}

	/// Custom-lower \p N through LowerOperation and append the replacement
	/// values to \p Results.
	///
	/// The values appended must match the original node's results in number and
	/// type. \p Results is left untouched when LowerOperation returns a null
	/// value, which signals that the node is not to be custom lowered after all.
	void X86TargetLowering::LowerOperationWrapper(SDNode *N,
	                                              SmallVectorImpl<SDValue> &Results,
	                                              SelectionDAG &DAG) const {
	  SDValue Lowered = LowerOperation(SDValue(N, 0), DAG);

	  // Nothing produced: the caller falls back to its default handling.
	  if (!Lowered.getNode())
	    return;

	  const unsigned NumResults = N->getNumValues();

	  // For a single-result node, forward the returned value exactly as given;
	  // it is not necessarily result number 0 of its node.
	  if (NumResults == 1) {
	    Results.push_back(Lowered);
	    return;
	  }

	  // For a multi-result node, the replacement node must provide the same
	  // number of results.
	  assert((NumResults == Lowered->getNumValues()) &&
	         "Lowering returned the wrong number of results!");

	  // Hand the results over in N's result order.
	  for (unsigned Idx = 0; Idx < NumResults; ++Idx)
	    Results.push_back(Lowered.getValue(Idx));
	}

	/// Replace a node with an illegal result type with a new node built out of
	/// custom code.
	void X86TargetLowering::ReplaceNodeResults(SDNode *N,
	SmallVectorImpl<SDValue>&Results,
	SelectionDAG &DAG) const {
	SDLoc dl(N);
	switch (N->getOpcode()) {
	default:
	#ifndef NDEBUG
	dbgs() << "ReplaceNodeResults: ";
	N->dump(&DAG);
	#endif
	llvm_unreachable("Do not know how to custom type legalize this operation!");
	case X86ISD::CVTPH2PS: {
	EVT VT = N->getValueType(0);
	SDValue Lo, Hi;
	std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
	EVT LoVT, HiVT;
	std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
	Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
	Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	Results.push_back(Res);
	return;
	}
	case X86ISD::STRICT_CVTPH2PS: {
	EVT VT = N->getValueType(0);
	SDValue Lo, Hi;
	std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
	EVT LoVT, HiVT;
	std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
	Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
	{N->getOperand(0), Lo});
	Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
	{N->getOperand(0), Hi});
	SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	Lo.getValue(1), Hi.getValue(1));
	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	Results.push_back(Res);
	Results.push_back(Chain);
	return;
	}
	case ISD::CTPOP: {
	assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
	// Use a v2i64 if possible.
	bool NoImplicitFloatOps =
	DAG.getMachineFunction().getFunction().hasFnAttribute(
	Attribute::NoImplicitFloat);
	if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
	SDValue Wide =
	DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
	Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
	// Bit count should fit in 32-bits, extract it as that and then zero
	// extend to i64. Otherwise we end up extracting bits 63:32 separately.
	Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
	Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
	DAG.getIntPtrConstant(0, dl));
	Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
	Results.push_back(Wide);
	}
	return;
	}
	case ISD::MUL: {
	EVT VT = N->getValueType(0);
	assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
	VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
	// Pre-promote these to vXi16 to avoid op legalization thinking all 16
	// elements are needed.
	MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
	SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
	SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
	SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
	Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	unsigned NumConcats = 16 / VT.getVectorNumElements();
	SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
	ConcatOps[0] = Res;
	Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
	Results.push_back(Res);
	return;
	}
	case X86ISD::VPMADDWD:
	case X86ISD::AVG: {
	// Legalize types for X86ISD::AVG/VPMADDWD by widening.
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");

	EVT VT = N->getValueType(0);
	EVT InVT = N->getOperand(0).getValueType();
	assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
	"Expected a VT that divides into 128 bits.");
	assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
	"Unexpected type action!");
	unsigned NumConcat = 128 / InVT.getSizeInBits();

	EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
	InVT.getVectorElementType(),
	NumConcat * InVT.getVectorNumElements());
	EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
	VT.getVectorElementType(),
	NumConcat * VT.getVectorNumElements());

	SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
	Ops[0] = N->getOperand(0);
	SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
	Ops[0] = N->getOperand(1);
	SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);

	SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
	Results.push_back(Res);
	return;
	}
	case ISD::ABS: {
	assert(N->getValueType(0) == MVT::i64 &&
	"Unexpected type (!= i64) on ABS.");
	MVT HalfT = MVT::i32;
	SDValue Lo, Hi, Tmp;
	SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);

	Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
	DAG.getConstant(0, dl, HalfT));
	Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
	DAG.getConstant(1, dl, HalfT));
	Tmp = DAG.getNode(
	ISD::SRA, dl, HalfT, Hi,
	DAG.getShiftAmountConstant(HalfT.getSizeInBits() - 1, HalfT, dl));
	Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
	Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
	SDValue(Lo.getNode(), 1));
	Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
	Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
	Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi));
	return;
	}
	// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
	case X86ISD::FMINC:
	case X86ISD::FMIN:
	case X86ISD::FMAXC:
	case X86ISD::FMAX: {
	EVT VT = N->getValueType(0);
	assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
	SDValue UNDEF = DAG.getUNDEF(VT);
	SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	N->getOperand(0), UNDEF);
	SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	N->getOperand(1), UNDEF);
	Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
	return;
	}
	case ISD::SDIV:
	case ISD::UDIV:
	case ISD::SREM:
	case ISD::UREM: {
	EVT VT = N->getValueType(0);
	if (VT.isVector()) {
	assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
	"Unexpected type action!");
	// If this RHS is a constant splat vector we can widen this and let
	// division/remainder by constant optimize it.
	// TODO: Can we do something for non-splat?
	APInt SplatVal;
	if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
	unsigned NumConcats = 128 / VT.getSizeInBits();
	SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
	Ops0[0] = N->getOperand(0);
	EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
	SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
	SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
	SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
	Results.push_back(Res);
	}
	return;
	}

	LLVM_FALLTHROUGH;
	}
	case ISD::SDIVREM:
	case ISD::UDIVREM: {
	SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
	Results.push_back(V);
	return;
	}
	case ISD::TRUNCATE: {
	MVT VT = N->getSimpleValueType(0);
	if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
	return;

	// The generic legalizer will try to widen the input type to the same
	// number of elements as the widened result type. But this isn't always
	// the best thing so do some custom legalization to avoid some cases.
	MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
	SDValue In = N->getOperand(0);
	EVT InVT = In.getValueType();

	unsigned InBits = InVT.getSizeInBits();
	if (128 % InBits == 0) {
	// 128 bit and smaller inputs should avoid truncate all together and
	// just use a build_vector that will become a shuffle.
	// TODO: Widen and use a shuffle directly?
	MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
	EVT EltVT = VT.getVectorElementType();
	unsigned WidenNumElts = WidenVT.getVectorNumElements();
	SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
	// Use the original element count so we don't do more scalar opts than
	// necessary.
	unsigned MinElts = VT.getVectorNumElements();
	for (unsigned i=0; i < MinElts; ++i) {
	SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
	DAG.getIntPtrConstant(i, dl));
	Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
	}
	Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
	return;
	}
	// With AVX512 there are some cases that can use a target specific
	// truncate node to go from 256/512 to less than 128 with zeros in the
	// upper elements of the 128 bit result.
	if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
	// We can use VTRUNC directly if for 256 bits with VLX or for any 512.
	if ((InBits == 256 && Subtarget.hasVLX()) \|\| InBits == 512) {
	Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
	return;
	}
	// There's one case we can widen to 512 bits and use VTRUNC.
	if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
	In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
	DAG.getUNDEF(MVT::v4i64));
	Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
	return;
	}
	}
	if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
	getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
	isTypeLegal(MVT::v4i64)) {
	// Input needs to be split and output needs to widened. Let's use two
	// VTRUNCs, and shuffle their results together into the wider type.
	SDValue Lo, Hi;
	std::tie(Lo, Hi) = DAG.SplitVector(In, dl);

	Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
	Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
	SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
	{ 0, 1, 2, 3, 16, 17, 18, 19,
	-1, -1, -1, -1, -1, -1, -1, -1 });
	Results.push_back(Res);
	return;
	}

	return;
	}
	case ISD::ANY_EXTEND:
	// Right now, only MVT::v8i8 has Custom action for an illegal type.
	// It's intended to custom handle the input type.
	assert(N->getValueType(0) == MVT::v8i8 &&
	"Do not know how to legalize this Node");
	return;
	case ISD::SIGN_EXTEND:
	case ISD::ZERO_EXTEND: {
	EVT VT = N->getValueType(0);
	SDValue In = N->getOperand(0);
	EVT InVT = In.getValueType();
	if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
	(InVT == MVT::v4i16 \|\| InVT == MVT::v4i8)){
	assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
	"Unexpected type action!");
	assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
	// Custom split this so we can extend i8/i16->i32 invec. This is better
	// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
	// sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
	// we allow the sra from the extend to i32 to be shared by the split.
	In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);

	// Fill a vector with sign bits for each element.
	SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
	SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);

	// Create an unpackl and unpackh to interleave the sign bits then bitcast
	// to v2i64.
	SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
	{0, 4, 1, 5});
	Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
	SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
	{2, 6, 3, 7});
	Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);

	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	Results.push_back(Res);
	return;
	}

	if (VT == MVT::v16i32 \|\| VT == MVT::v8i64) {
	if (!InVT.is128BitVector()) {
	// Not a 128 bit vector, but maybe type legalization will promote
	// it to 128 bits.
	if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
	return;
	InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
	if (!InVT.is128BitVector())
	return;

	// Promote the input to 128 bits. Type legalization will turn this into
	// zext_inreg/sext_inreg.
	In = DAG.getNode(N->getOpcode(), dl, InVT, In);
	}

	// Perform custom splitting instead of the two stage extend we would get
	// by default.
	EVT LoVT, HiVT;
	std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
	assert(isTypeLegal(LoVT) && "Split VT not legal?");

	SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);

	// We need to shift the input over by half the number of elements.
	unsigned NumElts = InVT.getVectorNumElements();
	unsigned HalfNumElts = NumElts / 2;
	SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
	for (unsigned i = 0; i != HalfNumElts; ++i)
	ShufMask[i] = i + HalfNumElts;

	SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
	Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);

	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	Results.push_back(Res);
	}
	return;
	}
	case ISD::FP_TO_SINT:
	case ISD::STRICT_FP_TO_SINT:
	case ISD::FP_TO_UINT:
	case ISD::STRICT_FP_TO_UINT: {
	bool IsStrict = N->isStrictFPOpcode();
	bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT \|\|
	N->getOpcode() == ISD::STRICT_FP_TO_SINT;
	EVT VT = N->getValueType(0);
	SDValue Src = N->getOperand(IsStrict ? 1 : 0);
	EVT SrcVT = Src.getValueType();

	if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
	assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
	"Unexpected type action!");

	// Try to create a 128 bit vector, but don't exceed a 32 bit element.
	unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
	MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
	VT.getVectorNumElements());
	SDValue Res;
	SDValue Chain;
	if (IsStrict) {
	Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
	{N->getOperand(0), Src});
	Chain = Res.getValue(1);
	} else
	Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);

	// Preserve what we know about the size of the original result. Except
	// when the result is v2i32 since we can't widen the assert.
	if (PromoteVT != MVT::v2i32)
	Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,
	dl, PromoteVT, Res,
	DAG.getValueType(VT.getVectorElementType()));

	// Truncate back to the original width.
	Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);

	// Now widen to 128 bits.
	unsigned NumConcats = 128 / VT.getSizeInBits();
	MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
	VT.getVectorNumElements() * NumConcats);
	SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
	ConcatOps[0] = Res;
	Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
	Results.push_back(Res);
	if (IsStrict)
	Results.push_back(Chain);
	return;
	}


	if (VT == MVT::v2i32) {
	assert((IsSigned \|\| Subtarget.hasAVX512()) &&
	"Can only handle signed conversion without AVX512");
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
	"Unexpected type action!");
	if (Src.getValueType() == MVT::v2f64) {
	unsigned Opc;
	if (IsStrict)
	Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
	else
	Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;

	// If we have VLX we can emit a target specific FP_TO_UINT node.
	if (!IsSigned && !Subtarget.hasVLX()) {
	// Otherwise we can defer to the generic legalizer which will widen
	// the input as well. This will be further widened during op
	// legalization to v8i32<-v8f64.
	// For strict nodes we'll need to widen ourselves.
	// FIXME: Fix the type legalizer to safely widen strict nodes?
	if (!IsStrict)
	return;
	Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
	DAG.getConstantFP(0.0, dl, MVT::v2f64));
	Opc = N->getOpcode();
	}
	SDValue Res;
	SDValue Chain;
	if (IsStrict) {
	Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
	{N->getOperand(0), Src});
	Chain = Res.getValue(1);
	} else {
	Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
	}
	Results.push_back(Res);
	if (IsStrict)
	Results.push_back(Chain);
	return;
	}

	// Custom widen strict v2f32->v2i32 by padding with zeros.
	// FIXME: Should generic type legalizer do this?
	if (Src.getValueType() == MVT::v2f32 && IsStrict) {
	Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
	DAG.getConstantFP(0.0, dl, MVT::v2f32));
	SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
	{N->getOperand(0), Src});
	Results.push_back(Res);
	Results.push_back(Res.getValue(1));
	return;
	}

	// The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
	// so early out here.
	return;
	}

	assert(!VT.isVector() && "Vectors should have been handled above!");

	if (Subtarget.hasDQI() && VT == MVT::i64 &&
	(SrcVT == MVT::f32 \|\| SrcVT == MVT::f64)) {
	assert(!Subtarget.is64Bit() && "i64 should be legal");
	unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
	// If we use a 128-bit result we might need to use a target specific node.
	unsigned SrcElts =
	std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
	MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
	MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
	unsigned Opc = N->getOpcode();
	if (NumElts != SrcElts) {
	if (IsStrict)
	Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
	else
	Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
	}

	SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
	SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
	DAG.getConstantFP(0.0, dl, VecInVT), Src,
	ZeroIdx);
	SDValue Chain;
	if (IsStrict) {
	SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
	Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
	Chain = Res.getValue(1);
	} else
	Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
	Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
	Results.push_back(Res);
	if (IsStrict)
	Results.push_back(Chain);
	return;
	}

	SDValue Chain;
	if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
	Results.push_back(V);
	if (IsStrict)
	Results.push_back(Chain);
	}
	return;
	}
	case ISD::LRINT:
	case ISD::LLRINT: {
	if (SDValue V = LRINT_LLRINTHelper(N, DAG))
	Results.push_back(V);
	return;
	}

	case ISD::SINT_TO_FP:
	case ISD::STRICT_SINT_TO_FP:
	case ISD::UINT_TO_FP:
	case ISD::STRICT_UINT_TO_FP: {
	bool IsStrict = N->isStrictFPOpcode();
	bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP \|\|
	N->getOpcode() == ISD::STRICT_SINT_TO_FP;
	EVT VT = N->getValueType(0);
	if (VT != MVT::v2f32)
	return;
	SDValue Src = N->getOperand(IsStrict ? 1 : 0);
	EVT SrcVT = Src.getValueType();
	if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
	if (IsStrict) {
	unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
	: X86ISD::STRICT_CVTUI2P;
	SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
	{N->getOperand(0), Src});
	Results.push_back(Res);
	Results.push_back(Res.getValue(1));
	} else {
	unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
	Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
	}
	return;
	}
	if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
	Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
	SDValue Zero = DAG.getConstant(0, dl, SrcVT);
	SDValue One = DAG.getConstant(1, dl, SrcVT);
	SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
	DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
	DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
	SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
	SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
	SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
	for (int i = 0; i != 2; ++i) {
	SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
	SignSrc, DAG.getIntPtrConstant(i, dl));
	if (IsStrict)
	SignCvts[i] =
	DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
	{N->getOperand(0), Elt});
	else
	SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
	};
	SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
	SDValue Slow, Chain;
	if (IsStrict) {
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	SignCvts[0].getValue(1), SignCvts[1].getValue(1));
	Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
	{Chain, SignCvt, SignCvt});
	Chain = Slow.getValue(1);
	} else {
	Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
	}
	IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
	IsNeg =
	DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
	SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
	Results.push_back(Cvt);
	if (IsStrict)
	Results.push_back(Chain);
	return;
	}

	if (SrcVT != MVT::v2i32)
	return;

	if (IsSigned \|\| Subtarget.hasAVX512()) {
	if (!IsStrict)
	return;

	// Custom widen strict v2i32->v2f32 to avoid scalarization.
	// FIXME: Should generic type legalizer do this?
	Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
	DAG.getConstant(0, dl, MVT::v2i32));
	SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
	{N->getOperand(0), Src});
	Results.push_back(Res);
	Results.push_back(Res.getValue(1));
	return;
	}

	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
	SDValue VBias =
	DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
	SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
	DAG.getBitcast(MVT::v2i64, VBias));
	Or = DAG.getBitcast(MVT::v2f64, Or);
	if (IsStrict) {
	SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
	{N->getOperand(0), Or, VBias});
	SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
	{MVT::v4f32, MVT::Other},
	{Sub.getValue(1), Sub});
	Results.push_back(Res);
	Results.push_back(Res.getValue(1));
	} else {
	// TODO: Are there any fast-math-flags to propagate here?
	SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
	Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
	}
	return;
	}
	case ISD::STRICT_FP_ROUND:
	case ISD::FP_ROUND: {
	bool IsStrict = N->isStrictFPOpcode();
	SDValue Src = N->getOperand(IsStrict ? 1 : 0);
	if (!isTypeLegal(Src.getValueType()))
	return;
	SDValue V;
	if (IsStrict)
	V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
	{N->getOperand(0), N->getOperand(1)});
	else
	V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
	Results.push_back(V);
	if (IsStrict)
	Results.push_back(V.getValue(1));
	return;
	}
	case ISD::FP_EXTEND:
	case ISD::STRICT_FP_EXTEND: {
	// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
	// No other ValueType for FP_EXTEND should reach this point.
	assert(N->getValueType(0) == MVT::v2f32 &&
	"Do not know how to legalize this Node");
	return;
	}
	case ISD::INTRINSIC_W_CHAIN: {
	unsigned IntNo = N->getConstantOperandVal(1);
	switch (IntNo) {
	default : llvm_unreachable("Do not know how to custom type "
	"legalize this intrinsic operation!");
	case Intrinsic::x86_rdtsc:
	return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
	Results);
	case Intrinsic::x86_rdtscp:
	return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
	Results);
	case Intrinsic::x86_rdpmc:
	expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
	Results);
	return;
	case Intrinsic::x86_xgetbv:
	expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
	Results);
	return;
	}
	}
	case ISD::READCYCLECOUNTER: {
	return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
	}
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
	EVT T = N->getValueType(0);
	assert((T == MVT::i64 \|\| T == MVT::i128) && "can only expand cmpxchg pair");
	bool Regs64bit = T == MVT::i128;
	assert((!Regs64bit \|\| Subtarget.hasCmpxchg16b()) &&
	"64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
	MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
	SDValue cpInL, cpInH;
	cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
	DAG.getConstant(0, dl, HalfT));
	cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
	DAG.getConstant(1, dl, HalfT));
	cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
	Regs64bit ? X86::RAX : X86::EAX,
	cpInL, SDValue());
	cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
	Regs64bit ? X86::RDX : X86::EDX,
	cpInH, cpInL.getValue(1));
	SDValue swapInL, swapInH;
	swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
	DAG.getConstant(0, dl, HalfT));
	swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
	DAG.getConstant(1, dl, HalfT));
	swapInH =
	DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
	swapInH, cpInH.getValue(1));
	// If the current function needs the base pointer, RBX,
	// we shouldn't use cmpxchg directly.
	// Indeed the lowering of that instruction will clobber
	// that register and since RBX will be a reserved register
	// the register allocator will not make sure its value will
	// be properly saved and restored around this live-range.
	const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	SDValue Result;
	SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
	Register BasePtr = TRI->getBaseRegister();
	MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
	if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
	(BasePtr == X86::RBX \|\| BasePtr == X86::EBX)) {
	// ISel prefers the LCMPXCHG64 variant.
	// If that assert breaks, that means it is not the case anymore,
	// and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
	// not just EBX. This is a matter of accepting i64 input for that
	// pseudo, and restoring into the register of the right width
	// in expand pseudo. Everything else should just work.
	assert(((Regs64bit == (BasePtr == X86::RBX)) \|\| BasePtr == X86::EBX) &&
	"Saving only half of the RBX");
	unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
	: X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
	SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
	Regs64bit ? X86::RBX : X86::EBX,
	HalfT, swapInH.getValue(1));
	SDValue Ops[] = {/Chain/ RBXSave.getValue(1), N->getOperand(1), swapInL,
	RBXSave,
	/Glue/ RBXSave.getValue(2)};
	Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
	} else {
	unsigned Opcode =
	Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
	swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
	Regs64bit ? X86::RBX : X86::EBX, swapInL,
	swapInH.getValue(1));
	SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
	swapInL.getValue(1)};
	Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
	}
	SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
	Regs64bit ? X86::RAX : X86::EAX,
	HalfT, Result.getValue(1));
	SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
	Regs64bit ? X86::RDX : X86::EDX,
	HalfT, cpOutL.getValue(2));
	SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};

	SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
	MVT::i32, cpOutH.getValue(2));
	SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
	Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

	Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
	Results.push_back(Success);
	Results.push_back(EFLAGS.getValue(1));
	return;
	}
	case ISD::ATOMIC_LOAD: {
	assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
	bool NoImplicitFloatOps =
	DAG.getMachineFunction().getFunction().hasFnAttribute(
	Attribute::NoImplicitFloat);
	if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
	auto *Node = cast<AtomicSDNode>(N);
	if (Subtarget.hasSSE1()) {
	// Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
	// Then extract the lower 64-bits.
	MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
	SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
	SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
	SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
	MVT::i64, Node->getMemOperand());
	if (Subtarget.hasSSE2()) {
	SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(Res);
	Results.push_back(Ld.getValue(1));
	return;
	}
	// We use an alternative sequence for SSE1 that extracts as v2f32 and
	// then casts to i64. This avoids a 128-bit stack temporary being
	// created by type legalization if we were to cast v4f32->v2i64.
	SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
	DAG.getIntPtrConstant(0, dl));
	Res = DAG.getBitcast(MVT::i64, Res);
	Results.push_back(Res);
	Results.push_back(Ld.getValue(1));
	return;
	}
	if (Subtarget.hasX87()) {
	// First load this into an 80-bit X87 register. This will put the whole
	// integer into the significand.
	SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
	SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
	SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
	dl, Tys, Ops, MVT::i64,
	Node->getMemOperand());
	SDValue Chain = Result.getValue(1);

	// Now store the X87 register to a stack temporary and convert to i64.
	// This store is not atomic and doesn't need to be.
	// FIXME: We don't need a stack temporary if the result of the load
	// is already being stored. We could just directly store there.
	SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
	int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
	MachinePointerInfo MPI =
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
	SDValue StoreOps[] = { Chain, Result, StackPtr };
	Chain = DAG.getMemIntrinsicNode(
	X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
	MPI, None /Align/, MachineMemOperand::MOStore);

	// Finally load the value back from the stack temporary and return it.
	// This load is not atomic and doesn't need to be.
	// This load will be further type legalized.
	Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
	Results.push_back(Result);
	Results.push_back(Result.getValue(1));
	return;
	}
	}
	// TODO: Use MOVLPS when SSE1 is available?
	// Delegate to generic TypeLegalization. Situations we can really handle
	// should have already been dealt with by AtomicExpandPass.cpp.
	break;
	}
	case ISD::ATOMIC_SWAP:
	case ISD::ATOMIC_LOAD_ADD:
	case ISD::ATOMIC_LOAD_SUB:
	case ISD::ATOMIC_LOAD_AND:
	case ISD::ATOMIC_LOAD_OR:
	case ISD::ATOMIC_LOAD_XOR:
	case ISD::ATOMIC_LOAD_NAND:
	case ISD::ATOMIC_LOAD_MIN:
	case ISD::ATOMIC_LOAD_MAX:
	case ISD::ATOMIC_LOAD_UMIN:
	case ISD::ATOMIC_LOAD_UMAX:
	// Delegate to generic TypeLegalization. Situations we can really handle
	// should have already been dealt with by AtomicExpandPass.cpp.
	break;

	case ISD::BITCAST: {
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	EVT DstVT = N->getValueType(0);
	EVT SrcVT = N->getOperand(0).getValueType();

	// If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
	// we can split using the k-register rather than memory.
	if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
	assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
	SDValue Lo, Hi;
	std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
	Lo = DAG.getBitcast(MVT::i32, Lo);
	Hi = DAG.getBitcast(MVT::i32, Hi);
	SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
	Results.push_back(Res);
	return;
	}

	if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
	// FIXME: Use v4f32 for SSE1?
	assert(Subtarget.hasSSE2() && "Requires SSE2");
	assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
	"Unexpected type action!");
	EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
	SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
	N->getOperand(0));
	Res = DAG.getBitcast(WideVT, Res);
	Results.push_back(Res);
	return;
	}

	return;
	}
	case ISD::MGATHER: {
	EVT VT = N->getValueType(0);
	if ((VT == MVT::v2f32 \|\| VT == MVT::v2i32) &&
	(Subtarget.hasVLX() \|\| !Subtarget.hasAVX512())) {
	auto *Gather = cast<MaskedGatherSDNode>(N);
	SDValue Index = Gather->getIndex();
	if (Index.getValueType() != MVT::v2i64)
	return;
	assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
	"Unexpected type action!");
	EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
	SDValue Mask = Gather->getMask();
	assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
	SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
	Gather->getPassThru(),
	DAG.getUNDEF(VT));
	if (!Subtarget.hasVLX()) {
	// We need to widen the mask, but the instruction will only use 2
	// of its elements. So we can use undef.
	Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
	DAG.getUNDEF(MVT::v2i1));
	Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
	}
	SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
	Gather->getBasePtr(), Index, Gather->getScale() };
	SDValue Res = DAG.getMemIntrinsicNode(
	X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
	Gather->getMemoryVT(), Gather->getMemOperand());
	Results.push_back(Res);
	Results.push_back(Res.getValue(1));
	return;
	}
	return;
	}
	case ISD::LOAD: {
	// Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
	// avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
	// cast since type legalization will try to use an i64 load.
	MVT VT = N->getSimpleValueType(0);
	assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
	assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
	"Unexpected type action!");
	if (!ISD::isNON_EXTLoad(N))
	return;
	auto *Ld = cast<LoadSDNode>(N);
	if (Subtarget.hasSSE2()) {
	MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
	SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
	Ld->getPointerInfo(), Ld->getOriginalAlign(),
	Ld->getMemOperand()->getFlags());
	SDValue Chain = Res.getValue(1);
	MVT VecVT = MVT::getVectorVT(LdVT, 2);
	Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
	EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
	Res = DAG.getBitcast(WideVT, Res);
	Results.push_back(Res);
	Results.push_back(Chain);
	return;
	}
	assert(Subtarget.hasSSE1() && "Expected SSE");
	SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
	SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
	SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
	MVT::i64, Ld->getMemOperand());
	Results.push_back(Res);
	Results.push_back(Res.getValue(1));
	return;
	}
	case ISD::ADDRSPACECAST: {
	SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
	Results.push_back(V);
	return;
	}
	}
	}

	const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
	switch ((X86ISD::NodeType)Opcode) {
	case X86ISD::FIRST_NUMBER: break;
	#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
	NODE_NAME_CASE(BSF)
	NODE_NAME_CASE(BSR)
	NODE_NAME_CASE(FSHL)
	NODE_NAME_CASE(FSHR)
	NODE_NAME_CASE(FAND)
	NODE_NAME_CASE(FANDN)
	NODE_NAME_CASE(FOR)
	NODE_NAME_CASE(FXOR)
	NODE_NAME_CASE(FILD)
	NODE_NAME_CASE(FIST)
	NODE_NAME_CASE(FP_TO_INT_IN_MEM)
	NODE_NAME_CASE(FLD)
	NODE_NAME_CASE(FST)
	NODE_NAME_CASE(CALL)
	NODE_NAME_CASE(BT)
	NODE_NAME_CASE(CMP)
	NODE_NAME_CASE(FCMP)
	NODE_NAME_CASE(STRICT_FCMP)
	NODE_NAME_CASE(STRICT_FCMPS)
	NODE_NAME_CASE(COMI)
	NODE_NAME_CASE(UCOMI)
	NODE_NAME_CASE(CMPM)
	NODE_NAME_CASE(STRICT_CMPM)
	NODE_NAME_CASE(CMPM_SAE)
	NODE_NAME_CASE(SETCC)
	NODE_NAME_CASE(SETCC_CARRY)
	NODE_NAME_CASE(FSETCC)
	NODE_NAME_CASE(FSETCCM)
	NODE_NAME_CASE(FSETCCM_SAE)
	NODE_NAME_CASE(CMOV)
	NODE_NAME_CASE(BRCOND)
	NODE_NAME_CASE(RET_FLAG)
	NODE_NAME_CASE(IRET)
	NODE_NAME_CASE(REP_STOS)
	NODE_NAME_CASE(REP_MOVS)
	NODE_NAME_CASE(GlobalBaseReg)
	NODE_NAME_CASE(Wrapper)
	NODE_NAME_CASE(WrapperRIP)
	NODE_NAME_CASE(MOVQ2DQ)
	NODE_NAME_CASE(MOVDQ2Q)
	NODE_NAME_CASE(MMX_MOVD2W)
	NODE_NAME_CASE(MMX_MOVW2D)
	NODE_NAME_CASE(PEXTRB)
	NODE_NAME_CASE(PEXTRW)
	NODE_NAME_CASE(INSERTPS)
	NODE_NAME_CASE(PINSRB)
	NODE_NAME_CASE(PINSRW)
	NODE_NAME_CASE(PSHUFB)
	NODE_NAME_CASE(ANDNP)
	NODE_NAME_CASE(BLENDI)
	NODE_NAME_CASE(BLENDV)
	NODE_NAME_CASE(HADD)
	NODE_NAME_CASE(HSUB)
	NODE_NAME_CASE(FHADD)
	NODE_NAME_CASE(FHSUB)
	NODE_NAME_CASE(CONFLICT)
	NODE_NAME_CASE(FMAX)
	NODE_NAME_CASE(FMAXS)
	NODE_NAME_CASE(FMAX_SAE)
	NODE_NAME_CASE(FMAXS_SAE)
	NODE_NAME_CASE(FMIN)
	NODE_NAME_CASE(FMINS)
	NODE_NAME_CASE(FMIN_SAE)
	NODE_NAME_CASE(FMINS_SAE)
	NODE_NAME_CASE(FMAXC)
	NODE_NAME_CASE(FMINC)
	NODE_NAME_CASE(FRSQRT)
	NODE_NAME_CASE(FRCP)
	NODE_NAME_CASE(EXTRQI)
	NODE_NAME_CASE(INSERTQI)
	NODE_NAME_CASE(TLSADDR)
	NODE_NAME_CASE(TLSBASEADDR)
	NODE_NAME_CASE(TLSCALL)
	NODE_NAME_CASE(EH_SJLJ_SETJMP)
	NODE_NAME_CASE(EH_SJLJ_LONGJMP)
	NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
	NODE_NAME_CASE(EH_RETURN)
	NODE_NAME_CASE(TC_RETURN)
	NODE_NAME_CASE(FNSTCW16m)
	NODE_NAME_CASE(LCMPXCHG_DAG)
	NODE_NAME_CASE(LCMPXCHG8_DAG)
	NODE_NAME_CASE(LCMPXCHG16_DAG)
	NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG)
	NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
	NODE_NAME_CASE(LADD)
	NODE_NAME_CASE(LSUB)
	NODE_NAME_CASE(LOR)
	NODE_NAME_CASE(LXOR)
	NODE_NAME_CASE(LAND)
	NODE_NAME_CASE(VZEXT_MOVL)
	NODE_NAME_CASE(VZEXT_LOAD)
	NODE_NAME_CASE(VEXTRACT_STORE)
	NODE_NAME_CASE(VTRUNC)
	NODE_NAME_CASE(VTRUNCS)
	NODE_NAME_CASE(VTRUNCUS)
	NODE_NAME_CASE(VMTRUNC)
	NODE_NAME_CASE(VMTRUNCS)
	NODE_NAME_CASE(VMTRUNCUS)
	NODE_NAME_CASE(VTRUNCSTORES)
	NODE_NAME_CASE(VTRUNCSTOREUS)
	NODE_NAME_CASE(VMTRUNCSTORES)
	NODE_NAME_CASE(VMTRUNCSTOREUS)
	NODE_NAME_CASE(VFPEXT)
	NODE_NAME_CASE(STRICT_VFPEXT)
	NODE_NAME_CASE(VFPEXT_SAE)
	NODE_NAME_CASE(VFPEXTS)
	NODE_NAME_CASE(VFPEXTS_SAE)
	NODE_NAME_CASE(VFPROUND)
	NODE_NAME_CASE(STRICT_VFPROUND)
	NODE_NAME_CASE(VMFPROUND)
	NODE_NAME_CASE(VFPROUND_RND)
	NODE_NAME_CASE(VFPROUNDS)
	NODE_NAME_CASE(VFPROUNDS_RND)
	NODE_NAME_CASE(VSHLDQ)
	NODE_NAME_CASE(VSRLDQ)
	NODE_NAME_CASE(VSHL)
	NODE_NAME_CASE(VSRL)
	NODE_NAME_CASE(VSRA)
	NODE_NAME_CASE(VSHLI)
	NODE_NAME_CASE(VSRLI)
	NODE_NAME_CASE(VSRAI)
	NODE_NAME_CASE(VSHLV)
	NODE_NAME_CASE(VSRLV)
	NODE_NAME_CASE(VSRAV)
	NODE_NAME_CASE(VROTLI)
	NODE_NAME_CASE(VROTRI)
	NODE_NAME_CASE(VPPERM)
	NODE_NAME_CASE(CMPP)
	NODE_NAME_CASE(STRICT_CMPP)
	NODE_NAME_CASE(PCMPEQ)
	NODE_NAME_CASE(PCMPGT)
	NODE_NAME_CASE(PHMINPOS)
	NODE_NAME_CASE(ADD)
	NODE_NAME_CASE(SUB)
	NODE_NAME_CASE(ADC)
	NODE_NAME_CASE(SBB)
	NODE_NAME_CASE(SMUL)
	NODE_NAME_CASE(UMUL)
	NODE_NAME_CASE(OR)
	NODE_NAME_CASE(XOR)
	NODE_NAME_CASE(AND)
	NODE_NAME_CASE(BEXTR)
	NODE_NAME_CASE(BZHI)
	NODE_NAME_CASE(PDEP)
	NODE_NAME_CASE(PEXT)
	NODE_NAME_CASE(MUL_IMM)
	NODE_NAME_CASE(MOVMSK)
	NODE_NAME_CASE(PTEST)
	NODE_NAME_CASE(TESTP)
	NODE_NAME_CASE(KORTEST)
	NODE_NAME_CASE(KTEST)
	NODE_NAME_CASE(KADD)
	NODE_NAME_CASE(KSHIFTL)
	NODE_NAME_CASE(KSHIFTR)
	NODE_NAME_CASE(PACKSS)
	NODE_NAME_CASE(PACKUS)
	NODE_NAME_CASE(PALIGNR)
	NODE_NAME_CASE(VALIGN)
	NODE_NAME_CASE(VSHLD)
	NODE_NAME_CASE(VSHRD)
	NODE_NAME_CASE(VSHLDV)
	NODE_NAME_CASE(VSHRDV)
	NODE_NAME_CASE(PSHUFD)
	NODE_NAME_CASE(PSHUFHW)
	NODE_NAME_CASE(PSHUFLW)
	NODE_NAME_CASE(SHUFP)
	NODE_NAME_CASE(SHUF128)
	NODE_NAME_CASE(MOVLHPS)
	NODE_NAME_CASE(MOVHLPS)
	NODE_NAME_CASE(MOVDDUP)
	NODE_NAME_CASE(MOVSHDUP)
	NODE_NAME_CASE(MOVSLDUP)
	NODE_NAME_CASE(MOVSD)
	NODE_NAME_CASE(MOVSS)
	NODE_NAME_CASE(UNPCKL)
	NODE_NAME_CASE(UNPCKH)
	NODE_NAME_CASE(VBROADCAST)
	NODE_NAME_CASE(VBROADCAST_LOAD)
	NODE_NAME_CASE(VBROADCASTM)
	NODE_NAME_CASE(SUBV_BROADCAST)
	NODE_NAME_CASE(VPERMILPV)
	NODE_NAME_CASE(VPERMILPI)
	NODE_NAME_CASE(VPERM2X128)
	NODE_NAME_CASE(VPERMV)
	NODE_NAME_CASE(VPERMV3)
	NODE_NAME_CASE(VPERMI)
	NODE_NAME_CASE(VPTERNLOG)
	NODE_NAME_CASE(VFIXUPIMM)
	NODE_NAME_CASE(VFIXUPIMM_SAE)
	NODE_NAME_CASE(VFIXUPIMMS)
	NODE_NAME_CASE(VFIXUPIMMS_SAE)
	NODE_NAME_CASE(VRANGE)
	NODE_NAME_CASE(VRANGE_SAE)
	NODE_NAME_CASE(VRANGES)
	NODE_NAME_CASE(VRANGES_SAE)
	NODE_NAME_CASE(PMULUDQ)
	NODE_NAME_CASE(PMULDQ)
	NODE_NAME_CASE(PSADBW)
	NODE_NAME_CASE(DBPSADBW)
	NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
	NODE_NAME_CASE(VAARG_64)
	NODE_NAME_CASE(WIN_ALLOCA)
	NODE_NAME_CASE(MEMBARRIER)
	NODE_NAME_CASE(MFENCE)
	NODE_NAME_CASE(SEG_ALLOCA)
	NODE_NAME_CASE(PROBED_ALLOCA)
	NODE_NAME_CASE(RDRAND)
	NODE_NAME_CASE(RDSEED)
	NODE_NAME_CASE(RDPKRU)
	NODE_NAME_CASE(WRPKRU)
	NODE_NAME_CASE(VPMADDUBSW)
	NODE_NAME_CASE(VPMADDWD)
	NODE_NAME_CASE(VPSHA)
	NODE_NAME_CASE(VPSHL)
	NODE_NAME_CASE(VPCOM)
	NODE_NAME_CASE(VPCOMU)
	NODE_NAME_CASE(VPERMIL2)
	NODE_NAME_CASE(FMSUB)
	NODE_NAME_CASE(STRICT_FMSUB)
	NODE_NAME_CASE(FNMADD)
	NODE_NAME_CASE(STRICT_FNMADD)
	NODE_NAME_CASE(FNMSUB)
	NODE_NAME_CASE(STRICT_FNMSUB)
	NODE_NAME_CASE(FMADDSUB)
	NODE_NAME_CASE(FMSUBADD)
	NODE_NAME_CASE(FMADD_RND)
	NODE_NAME_CASE(FNMADD_RND)
	NODE_NAME_CASE(FMSUB_RND)
	NODE_NAME_CASE(FNMSUB_RND)
	NODE_NAME_CASE(FMADDSUB_RND)
	NODE_NAME_CASE(FMSUBADD_RND)
	NODE_NAME_CASE(VPMADD52H)
	NODE_NAME_CASE(VPMADD52L)
	NODE_NAME_CASE(VRNDSCALE)
	NODE_NAME_CASE(STRICT_VRNDSCALE)
	NODE_NAME_CASE(VRNDSCALE_SAE)
	NODE_NAME_CASE(VRNDSCALES)
	NODE_NAME_CASE(VRNDSCALES_SAE)
	NODE_NAME_CASE(VREDUCE)
	NODE_NAME_CASE(VREDUCE_SAE)
	NODE_NAME_CASE(VREDUCES)
	NODE_NAME_CASE(VREDUCES_SAE)
	NODE_NAME_CASE(VGETMANT)
	NODE_NAME_CASE(VGETMANT_SAE)
	NODE_NAME_CASE(VGETMANTS)
	NODE_NAME_CASE(VGETMANTS_SAE)
	NODE_NAME_CASE(PCMPESTR)
	NODE_NAME_CASE(PCMPISTR)
	NODE_NAME_CASE(XTEST)
	NODE_NAME_CASE(COMPRESS)
	NODE_NAME_CASE(EXPAND)
	NODE_NAME_CASE(SELECTS)
	NODE_NAME_CASE(ADDSUB)
	NODE_NAME_CASE(RCP14)
	NODE_NAME_CASE(RCP14S)
	NODE_NAME_CASE(RCP28)
	NODE_NAME_CASE(RCP28_SAE)
	NODE_NAME_CASE(RCP28S)
	NODE_NAME_CASE(RCP28S_SAE)
	NODE_NAME_CASE(EXP2)
	NODE_NAME_CASE(EXP2_SAE)
	NODE_NAME_CASE(RSQRT14)
	NODE_NAME_CASE(RSQRT14S)
	NODE_NAME_CASE(RSQRT28)
	NODE_NAME_CASE(RSQRT28_SAE)
	NODE_NAME_CASE(RSQRT28S)
	NODE_NAME_CASE(RSQRT28S_SAE)
	NODE_NAME_CASE(FADD_RND)
	NODE_NAME_CASE(FADDS)
	NODE_NAME_CASE(FADDS_RND)
	NODE_NAME_CASE(FSUB_RND)
	NODE_NAME_CASE(FSUBS)
	NODE_NAME_CASE(FSUBS_RND)
	NODE_NAME_CASE(FMUL_RND)
	NODE_NAME_CASE(FMULS)
	NODE_NAME_CASE(FMULS_RND)
	NODE_NAME_CASE(FDIV_RND)
	NODE_NAME_CASE(FDIVS)
	NODE_NAME_CASE(FDIVS_RND)
	NODE_NAME_CASE(FSQRT_RND)
	NODE_NAME_CASE(FSQRTS)
	NODE_NAME_CASE(FSQRTS_RND)
	NODE_NAME_CASE(FGETEXP)
	NODE_NAME_CASE(FGETEXP_SAE)
	NODE_NAME_CASE(FGETEXPS)
	NODE_NAME_CASE(FGETEXPS_SAE)
	NODE_NAME_CASE(SCALEF)
	NODE_NAME_CASE(SCALEF_RND)
	NODE_NAME_CASE(SCALEFS)
	NODE_NAME_CASE(SCALEFS_RND)
	NODE_NAME_CASE(AVG)
	NODE_NAME_CASE(MULHRS)
	NODE_NAME_CASE(SINT_TO_FP_RND)
	NODE_NAME_CASE(UINT_TO_FP_RND)
	NODE_NAME_CASE(CVTTP2SI)
	NODE_NAME_CASE(CVTTP2UI)
	NODE_NAME_CASE(STRICT_CVTTP2SI)
	NODE_NAME_CASE(STRICT_CVTTP2UI)
	NODE_NAME_CASE(MCVTTP2SI)
	NODE_NAME_CASE(MCVTTP2UI)
	NODE_NAME_CASE(CVTTP2SI_SAE)
	NODE_NAME_CASE(CVTTP2UI_SAE)
	NODE_NAME_CASE(CVTTS2SI)
	NODE_NAME_CASE(CVTTS2UI)
	NODE_NAME_CASE(CVTTS2SI_SAE)
	NODE_NAME_CASE(CVTTS2UI_SAE)
	NODE_NAME_CASE(CVTSI2P)
	NODE_NAME_CASE(CVTUI2P)
	NODE_NAME_CASE(STRICT_CVTSI2P)
	NODE_NAME_CASE(STRICT_CVTUI2P)
	NODE_NAME_CASE(MCVTSI2P)
	NODE_NAME_CASE(MCVTUI2P)
	NODE_NAME_CASE(VFPCLASS)
	NODE_NAME_CASE(VFPCLASSS)
	NODE_NAME_CASE(MULTISHIFT)
	NODE_NAME_CASE(SCALAR_SINT_TO_FP)
	NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
	NODE_NAME_CASE(SCALAR_UINT_TO_FP)
	NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
	NODE_NAME_CASE(CVTPS2PH)
	NODE_NAME_CASE(STRICT_CVTPS2PH)
	NODE_NAME_CASE(MCVTPS2PH)
	NODE_NAME_CASE(CVTPH2PS)
	NODE_NAME_CASE(STRICT_CVTPH2PS)
	NODE_NAME_CASE(CVTPH2PS_SAE)
	NODE_NAME_CASE(CVTP2SI)
	NODE_NAME_CASE(CVTP2UI)
	NODE_NAME_CASE(MCVTP2SI)
	NODE_NAME_CASE(MCVTP2UI)
	NODE_NAME_CASE(CVTP2SI_RND)
	NODE_NAME_CASE(CVTP2UI_RND)
	NODE_NAME_CASE(CVTS2SI)
	NODE_NAME_CASE(CVTS2UI)
	NODE_NAME_CASE(CVTS2SI_RND)
	NODE_NAME_CASE(CVTS2UI_RND)
	NODE_NAME_CASE(CVTNE2PS2BF16)
	NODE_NAME_CASE(CVTNEPS2BF16)
	NODE_NAME_CASE(MCVTNEPS2BF16)
	NODE_NAME_CASE(DPBF16PS)
	NODE_NAME_CASE(LWPINS)
	NODE_NAME_CASE(MGATHER)
	NODE_NAME_CASE(MSCATTER)
	NODE_NAME_CASE(VPDPBUSD)
	NODE_NAME_CASE(VPDPBUSDS)
	NODE_NAME_CASE(VPDPWSSD)
	NODE_NAME_CASE(VPDPWSSDS)
	NODE_NAME_CASE(VPSHUFBITQMB)
	NODE_NAME_CASE(GF2P8MULB)
	NODE_NAME_CASE(GF2P8AFFINEQB)
	NODE_NAME_CASE(GF2P8AFFINEINVQB)
	NODE_NAME_CASE(NT_CALL)
	NODE_NAME_CASE(NT_BRIND)
	NODE_NAME_CASE(UMWAIT)
	NODE_NAME_CASE(TPAUSE)
	NODE_NAME_CASE(ENQCMD)
	NODE_NAME_CASE(ENQCMDS)
	NODE_NAME_CASE(VP2INTERSECT)
	}
	return nullptr;
	#undef NODE_NAME_CASE
	}

	/// Return true if the addressing mode represented by AM is legal for this
	/// target, for a load/store of the specified type.
	///
	/// The mode is rejected when the displacement does not suit the code model,
	/// when a global base cannot be folded (stub reference, PIC-base conflict, or
	/// rip-relative restrictions), or when the scale is not one of the accepted
	/// values.
	bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
	                                              const AddrMode &AM, Type *Ty,
	                                              unsigned AS,
	                                              Instruction *I) const {
	  // X86 supports extremely general addressing modes.
	  CodeModel::Model M = getTargetMachine().getCodeModel();

	  // X86 allows a sign-extended 32-bit immediate field as a displacement.
	  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
	    return false;

	  if (AM.BaseGV) {
	    unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);

	    // If a reference to this global requires an extra load, we can't fold it.
	    if (isGlobalStubReference(GVFlags))
	      return false;

	    // If BaseGV requires a register for the PIC base, we cannot also have a
	    // BaseReg specified.
	    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
	      return false;

	    // If lower 4G is not available, then we must use rip-relative addressing.
	    if ((M != CodeModel::Small || isPositionIndependent()) &&
	        Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
	      return false;
	  }

	  switch (AM.Scale) {
	  case 0:
	  case 1:
	  case 2:
	  case 4:
	  case 8:
	    // These scales always work.
	    break;
	  case 3:
	  case 5:
	  case 9:
	    // These scales are formed with basereg+scalereg. Only accept if there is
	    // no basereg yet.
	    if (AM.HasBaseReg)
	      return false;
	    break;
	  default: // Other stuff never works.
	    return false;
	  }

	  return true;
	}

	/// Return true if shifting a vector of type \p Ty by a uniform (scalar)
	/// amount is significantly cheaper than shifting by a fully general vector.
	bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
	  unsigned Bits = Ty->getScalarSizeInBits();

	  // 8-bit shifts are always expensive, but versions with a scalar amount aren't
	  // particularly cheaper than those without.
	  if (Bits == 8)
	    return false;

	  // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
	  // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
	  // (The 8-bit case has already been rejected above, so it is not re-tested.)
	  if (Subtarget.hasXOP() && (Bits == 16 || Bits == 32 || Bits == 64))
	    return false;

	  // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
	  // shifts just as cheap as scalar ones.
	  if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
	    return false;

	  // AVX512BW has shifts such as vpsllvw.
	  if (Subtarget.hasBWI() && Bits == 16)
	    return false;

	  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
	  // fully general vector.
	  return true;
	}

	/// Report whether \p Opcode is a binary operation. The X86-specific opcodes
	/// listed here are the non-commutative ones; everything else is answered by
	/// the generic TargetLoweringBase query.
	bool X86TargetLowering::isBinOp(unsigned Opcode) const {
	  switch (Opcode) {
	  default:
	    // Not one of the X86ISD opcodes handled here: ask the base class.
	    return TargetLoweringBase::isBinOp(Opcode);

	  // Non-commutative X86ISD binops.
	  // TODO: Add more X86ISD opcodes once we have test coverage.
	  case X86ISD::FANDN:
	  case X86ISD::FMIN:
	  case X86ISD::FMAX:
	  case X86ISD::PCMPGT:
	  case X86ISD::ANDNP:
	    return true;
	  }
	}

	/// Report whether \p Opcode is a commutative binary operation. A handful of
	/// X86ISD opcodes are recognised directly; all other opcodes are forwarded to
	/// the generic TargetLoweringBase query.
	bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
	  switch (Opcode) {
	  default:
	    // Not one of the X86ISD opcodes handled here: ask the base class.
	    return TargetLoweringBase::isCommutativeBinOp(Opcode);

	  // TODO: Add more X86ISD opcodes once we have test coverage.
	  case X86ISD::FXOR:
	  case X86ISD::FOR:
	  case X86ISD::FAND:
	  case X86ISD::FMINC:
	  case X86ISD::FMAXC:
	  case X86ISD::PMULUDQ:
	  case X86ISD::PMULDQ:
	  case X86ISD::PCMPEQ:
	    return true;
	  }
	}

	bool X86TargetLowering::isTruncateFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 > NumBits2;
	}

	bool X86TargetLowering::allowTruncateForTailCall(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;

	if (!isTypeLegal(EVT::getEVT(Ty1)))
	return false;

	assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

	// Assuming the caller doesn't have a zeroext or signext return parameter,
	// truncation all the way down to i1 is valid.
	return true;
	}

	/// An integer-compare immediate is legal when it fits in a signed 32-bit
	/// field.
	bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
	  const bool FitsInSImm32 = isInt<32>(Imm);
	  return FitsInSImm32;
	}

	/// An add immediate is legal when it fits in a signed 32-bit field. Negated
	/// immediates can be handled by using sub instead.
	bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
	  const bool FitsInSImm32 = isInt<32>(Imm);
	  return FitsInSImm32;
	}

	/// A store immediate is legal when it fits in a signed 32-bit field.
	bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
	  const bool FitsInSImm32 = isInt<32>(Imm);
	  return FitsInSImm32;
	}

	/// Return true if truncating from \p VT1 to \p VT2 is free, i.e. both are
	/// scalar integer value types and \p VT1 is strictly wider.
	bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
	  if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
	    return false;
	  unsigned NumBits1 = VT1.getSizeInBits();
	  unsigned NumBits2 = VT2.getSizeInBits();
	  return NumBits1 > NumBits2;
	}

	bool X86TargetLowering::isZExtFree(Type Ty1, Type Ty2) const {
	// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
	return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
	}

	/// Return true if zero-extending \p VT1 to \p VT2 is free. Only the
	/// i32 -> i64 case on a 64-bit subtarget qualifies.
	bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
	  // Only the i32 -> i64 widening is considered here.
	  if (!(VT1 == MVT::i32) || !(VT2 == MVT::i64))
	    return false;

	  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
	  return Subtarget.is64Bit();
	}

	/// Return true if zero-extending the value \p Val to \p VT2 is free. Besides
	/// the type-only rule, an i8/i16/i32 load counts as free because it can be
	/// performed as a zero-extending load.
	bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
	  EVT VT1 = Val.getValueType();
	  if (isZExtFree(VT1, VT2))
	    return true;

	  // Beyond the type-only rule, only loads can be extended for free.
	  if (Val.getOpcode() != ISD::LOAD)
	    return false;

	  if (!VT1.isSimple() || !VT1.isInteger() ||
	      !VT2.isSimple() || !VT2.isInteger())
	    return false;

	  switch (VT1.getSimpleVT().SimpleTy) {
	  default: break;
	  case MVT::i8:
	  case MVT::i16:
	  case MVT::i32:
	    // X86 has 8, 16, and 32-bit zero-extending loads.
	    return true;
	  }

	  return false;
	}

	/// Decide whether the shift-amount operand of \p I should be sunk next to
	/// \p I. When the operand is a splat shuffle and scalar-amount vector shifts
	/// are cheap for the result type, its use is appended to \p Ops and true is
	/// returned; otherwise \p Ops is left untouched and false is returned.
	bool X86TargetLowering::shouldSinkOperands(Instruction *I,
	                                           SmallVectorImpl<Use *> &Ops) const {
	  // A uniform shift amount in a vector shift or funnel shift may be much
	  // cheaper than a generic variable vector shift, so make that pattern visible
	  // to SDAG by sinking the shuffle instruction next to the shift.
	  int ShiftAmountOpNum = -1;
	  if (I->isShift())
	    ShiftAmountOpNum = 1;
	  else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
	    if (II->getIntrinsicID() == Intrinsic::fshl ||
	        II->getIntrinsicID() == Intrinsic::fshr)
	      ShiftAmountOpNum = 2;
	  }

	  // Neither a shift nor a funnel-shift intrinsic: nothing to sink.
	  if (ShiftAmountOpNum == -1)
	    return false;

	  auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
	  if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
	      isVectorShiftByScalarCheap(I->getType())) {
	    Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
	    return true;
	  }

	  return false;
	}

	bool X86TargetLowering::shouldConvertPhiType(Type From, Type To) const {
	if (!Subtarget.is64Bit())
	return false;
	return TargetLowering::shouldConvertPhiType(From, To);
	}

	/// Return true if folding the extension \p ExtVal into its (load) operand is
	/// desirable. Masked loads and vXi1 sources are rejected.
	bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
	  // Extensions of masked loads are never folded.
	  if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
	    return false;

	  // There is no extending load for vXi1.
	  const EVT SourceEltVT = ExtVal.getOperand(0).getValueType().getScalarType();
	  return !(SourceEltVT == MVT::i1);
	}

	/// Return true if a fused multiply-add is faster than a separate multiply
	/// and add for \p VT. This holds only when the subtarget has some form of
	/// FMA and the scalar element type is f32 or f64.
	bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
	                                                   EVT VT) const {
	  if (!Subtarget.hasAnyFMA())
	    return false;

	  // Only the element type matters, so look through vector types.
	  VT = VT.getScalarType();
	  if (!VT.isSimple())
	    return false;

	  const auto EltTy = VT.getSimpleVT().SimpleTy;
	  return EltTy == MVT::f32 || EltTy == MVT::f64;
	}

	/// Return true if narrowing an operation from \p VT1 to \p VT2 is
	/// profitable. Every narrowing is accepted except i32 -> i16.
	bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
	  // i16 instructions are longer (0x66 prefix) and potentially slower.
	  if (VT1 == MVT::i32 && VT2 == MVT::i16)
	    return false;
	  return true;
	}

	/// Targets can use this hook to indicate that they only support some
	/// VECTOR_SHUFFLE operations, namely those with specific masks. By default,
	/// if a target supports the VECTOR_SHUFFLE node, all mask values are assumed
	/// to be legal.
	///
	/// Here the mask itself is not inspected: the answer depends only on \p VT.
	bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
	  if (!VT.isSimple())
	    return false;

	  const MVT SimpleVT = VT.getSimpleVT();

	  // Not for i1 vectors, and very little shuffling can be done for 64-bit
	  // vectors right now.
	  if (SimpleVT.getScalarType() == MVT::i1 || SimpleVT.getSizeInBits() == 64)
	    return false;

	  // We only care that the types being shuffled are legal. The lowering can
	  // handle any possible shuffle mask that results.
	  return isTypeLegal(SimpleVT);
	}

	/// Return true if the clear-mask shuffle \p Mask on type \p VT is legal.
	/// v32i8 and v16i16 are rejected without AVX2; everything else follows the
	/// generic shuffle-mask legality.
	bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
	                                               EVT VT) const {
	  // Don't convert an 'and' into a shuffle that we don't directly support.
	  // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
	  if (!Subtarget.hasAVX2())
	    if (VT == MVT::v32i8 || VT == MVT::v16i16)
	      return false;

	  // Just delegate to the generic legality, clear masks aren't special.
	  return isShuffleMaskLegal(Mask, VT);
	}

	/// Return true if jump tables may be generated for \p Fn.
	bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
	  // When the subtarget is using thunks we must not generate jump tables;
	  // otherwise the generic logic decides.
	  return !Subtarget.useIndirectThunkBranches() &&
	         TargetLowering::areJTsAllowed(Fn);
	}

	//===----------------------------------------------------------------------===//
	// X86 Scheduler Hooks
	//===----------------------------------------------------------------------===//

	// Returns true if EFLAGS is consumed after this iterator in the rest of the
	// basic block, or is live into any successor of the basic block.
	static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
	                              MachineBasicBlock *BB) {
	  // Walk the instructions that follow Itr. The first one that touches EFLAGS
	  // settles the question: a read means live, a (non-reading) def means dead.
	  MachineBasicBlock::iterator Cursor = std::next(Itr);
	  const MachineBasicBlock::iterator BlockEnd = BB->end();
	  while (Cursor != BlockEnd) {
	    const MachineInstr &Inst = *Cursor;
	    if (Inst.readsRegister(X86::EFLAGS))
	      return true;
	    if (Inst.definesRegister(X86::EFLAGS))
	      return false;
	    ++Cursor;
	  }

	  // Reached the end of the block without an answer: EFLAGS is live exactly
	  // when some successor lists it as a live-in.
	  for (MachineBasicBlock *Successor : BB->successors())
	    if (Successor->isLiveIn(X86::EFLAGS))
	      return true;

	  return false;
	}

	/// Utility function to emit xbegin specifying the start of an RTM region.
	///
	/// Expands the pseudo \p MI in \p MBB into a four-block diamond and returns
	/// the sink block, which receives everything that followed \p MI together
	/// with MBB's former successor edges. \p MI is erased.
	static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
	                                     const TargetInstrInfo *TII) {
	  DebugLoc DL = MI.getDebugLoc();

	  const BasicBlock *BB = MBB->getBasicBlock();
	  MachineFunction::iterator I = ++MBB->getIterator();

	  // For the v = xbegin(), we generate
	  //
	  // thisMBB:
	  //  xbegin fallMBB
	  //
	  // mainMBB:
	  //  s0 = -1
	  //
	  // fallMBB:
	  //  eax = # XABORT_DEF
	  //  s1 = eax
	  //
	  // sinkMBB:
	  //  v = phi(s0/mainMBB, s1/fallMBB)

	  MachineBasicBlock *thisMBB = MBB;
	  MachineFunction *MF = MBB->getParent();
	  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
	  MF->insert(I, mainMBB);
	  MF->insert(I, fallMBB);
	  MF->insert(I, sinkMBB);

	  // If EFLAGS is still needed after MI, it has to be live through every
	  // block placed between MI and its former continuation.
	  if (isEFLAGSLiveAfter(MI, MBB)) {
	    mainMBB->addLiveIn(X86::EFLAGS);
	    fallMBB->addLiveIn(X86::EFLAGS);
	    sinkMBB->addLiveIn(X86::EFLAGS);
	  }

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), MBB,
	                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  Register DstReg = MI.getOperand(0).getReg();
	  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
	  Register mainDstReg = MRI.createVirtualRegister(RC);
	  Register fallDstReg = MRI.createVirtualRegister(RC);

	  // thisMBB:
	  //  xbegin fallMBB
	  //  # fallthrough to mainMBB
	  //  # abortion to fallMBB
	  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
	  thisMBB->addSuccessor(mainMBB);
	  thisMBB->addSuccessor(fallMBB);

	  // mainMBB:
	  //  mainDstReg := -1
	  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
	  BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
	  mainMBB->addSuccessor(sinkMBB);

	  // fallMBB:
	  //  ; pseudo instruction to model hardware's definition from XABORT
	  //  EAX := XABORT_DEF
	  //  fallDstReg := EAX
	  BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
	  BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
	      .addReg(X86::EAX);
	  fallMBB->addSuccessor(sinkMBB);

	  // sinkMBB:
	  //  DstReg := phi(mainDstReg/mainMBB, fallDstReg/fallMBB)
	  BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
	      .addReg(mainDstReg).addMBB(mainMBB)
	      .addReg(fallDstReg).addMBB(fallMBB);

	  MI.eraseFromParent();
	  return sinkMBB;
	}



	/// Custom inserter for the VAARG_64 pseudo instruction.
	///
	/// Expands \p MI into code that computes the address of the next variadic
	/// argument from the va_list addressed by operands 1-5 and updates the
	/// va_list accordingly. Depending on ArgMode the argument is fetched either
	/// only from the overflow area (no control flow is added), or from the
	/// register save area when the relevant offset is still below its bound, with
	/// the overflow area as the fallback (a diamond of new blocks is created).
	///
	/// \param MI  The VAARG_64 pseudo; it is erased before returning.
	/// \param MBB The block containing \p MI.
	/// \returns The block that holds the instructions which followed \p MI: the
	///          new end block when a branch was emitted, otherwise \p MBB itself.
	MachineBasicBlock *
	X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
	MachineBasicBlock *MBB) const {
	// Emit va_arg instruction on X86-64.

	// Operands to this pseudo-instruction:
	// 0 ) Output : destination address (reg)
	// 1-5) Input : va_list address (addr, i64mem)
	// 6 ) ArgSize : Size (in bytes) of vararg type
	// 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
	// 8 ) Align : Alignment of type
	// 9 ) EFLAGS (implicit-def)

	assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
	static_assert(X86::AddrNumOperands == 5,
	"VAARG_64 assumes 5 address operands");

	Register DestReg = MI.getOperand(0).getReg();
	MachineOperand &Base = MI.getOperand(1);
	MachineOperand &Scale = MI.getOperand(2);
	MachineOperand &Index = MI.getOperand(3);
	MachineOperand &Disp = MI.getOperand(4);
	MachineOperand &Segment = MI.getOperand(5);
	unsigned ArgSize = MI.getOperand(6).getImm();
	unsigned ArgMode = MI.getOperand(7).getImm();
	Align Alignment = Align(MI.getOperand(8).getImm());

	MachineFunction *MF = MBB->getParent();

	// Memory Reference
	assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");

	MachineMemOperand *OldMMO = MI.memoperands().front();

	// Clone the MMO into two separate MMOs for loading and storing: each clone
	// keeps the original flags with the opposite access kind masked out.
	MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
	OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
	MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
	OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);

	// Machine Information
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
	const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
	const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
	DebugLoc DL = MI.getDebugLoc();

	// struct va_list {
	//   i32 gp_offset
	//   i32 fp_offset
	//   i64 overflow_area (address)
	//   i64 reg_save_area (address)
	// }
	// sizeof(va_list) = 24
	// alignment(va_list) = 8
	//
	// The displacements used below follow from this layout: +0 gp_offset,
	// +4 fp_offset, +8 overflow_area, +16 reg_save_area.

	unsigned TotalNumIntRegs = 6;
	unsigned TotalNumXMMRegs = 8;
	bool UseGPOffset = (ArgMode == 1);
	bool UseFPOffset = (ArgMode == 2);
	// Upper bound for the offset: 6 * 8 bytes for the integer registers, plus
	// 8 * 16 bytes for the XMM registers when fp_offset is the one in use.
	unsigned MaxOffset = TotalNumIntRegs * 8 +
	(UseFPOffset ? TotalNumXMMRegs * 16 : 0);

	/* Align ArgSize to a multiple of 8 */
	unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
	// Extra alignment code is only emitted for alignments above 8.
	bool NeedsAlign = (Alignment > 8);

	MachineBasicBlock *thisMBB = MBB;
	MachineBasicBlock *overflowMBB;
	MachineBasicBlock *offsetMBB;
	MachineBasicBlock *endMBB;

	// NOTE(review): these three are declared 'unsigned' but are assigned
	// Register values below (DestReg / createVirtualRegister results).
	unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
	unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
	unsigned OffsetReg = 0;

	if (!UseGPOffset && !UseFPOffset) {
	// If we only pull from the overflow region, we don't create a branch.
	// We don't need to alter control flow. The overflow code is emitted at the
	// end of thisMBB and writes DestReg directly, so no PHI is needed.
	OffsetDestReg = 0; // unused
	OverflowDestReg = DestReg;

	offsetMBB = nullptr;
	overflowMBB = thisMBB;
	endMBB = thisMBB;
	} else {
	// First emit code to check if gp_offset (or fp_offset) is below the bound.
	// If so, pull the argument from reg_save_area. (branch to offsetMBB)
	// If not, pull from overflow_area. (branch to overflowMBB)
	//
	//            thisMBB
	//            |     .
	//            |        .
	//     offsetMBB   overflowMBB
	//            |        .
	//            |     .
	//            endMBB

	// Registers for the PHI in endMBB
	OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
	OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

	const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

	MachineFunction::iterator MBBIter = ++MBB->getIterator();

	// Insert the new basic blocks. Layout order after thisMBB is
	// offsetMBB, overflowMBB, endMBB.
	MF->insert(MBBIter, offsetMBB);
	MF->insert(MBBIter, overflowMBB);
	MF->insert(MBBIter, endMBB);

	// Transfer the remainder of MBB and its successor edges to endMBB.
	endMBB->splice(endMBB->begin(), thisMBB,
	std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
	endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

	// Make offsetMBB and overflowMBB successors of thisMBB
	thisMBB->addSuccessor(offsetMBB);
	thisMBB->addSuccessor(overflowMBB);

	// endMBB is a successor of both offsetMBB and overflowMBB
	offsetMBB->addSuccessor(endMBB);
	overflowMBB->addSuccessor(endMBB);

	// Load the offset value into a register: fp_offset lives at displacement 4,
	// gp_offset at displacement 0.
	OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
	BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, UseFPOffset ? 4 : 0)
	.add(Segment)
	.setMemRefs(LoadOnlyMMO);

	// Check if there is enough room left to pull this argument. The offset is
	// compared against MaxOffset + 8 - ArgSizeA8.
	BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
	.addReg(OffsetReg)
	.addImm(MaxOffset + 8 - ArgSizeA8);

	// Branch to "overflowMBB" if offset >= max
	// Fall through to "offsetMBB" otherwise
	BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
	.addMBB(overflowMBB).addImm(X86::COND_AE);
	}

	// In offsetMBB, emit code to use the reg_save_area.
	if (offsetMBB) {
	assert(OffsetReg != 0);

	// Read the reg_save_area address.
	Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 16)
	.add(Segment)
	.setMemRefs(LoadOnlyMMO);

	// Zero-extend the offset
	Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
	.addImm(0)
	.addReg(OffsetReg)
	.addImm(X86::sub_32bit);

	// Add the offset to the reg_save_area to get the final address.
	BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
	.addReg(OffsetReg64)
	.addReg(RegSaveReg);

	// Compute the offset for the next argument: advance by 16 when using
	// fp_offset and by 8 when using gp_offset.
	Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
	.addReg(OffsetReg)
	.addImm(UseFPOffset ? 16 : 8);

	// Store it back into the va_list.
	BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, UseFPOffset ? 4 : 0)
	.add(Segment)
	.addReg(NextOffsetReg)
	.setMemRefs(StoreOnlyMMO);

	// Jump to endMBB
	BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
	.addMBB(endMBB);
	}

	//
	// Emit code to use overflow area
	//
	// This part is emitted in both modes; in overflow-only mode overflowMBB is
	// thisMBB and OverflowDestReg is DestReg.

	// Load the overflow_area address into a register.
	Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 8)
	.add(Segment)
	.setMemRefs(LoadOnlyMMO);

	// If we need to align it, do so. Otherwise, just copy the address
	// to OverflowDestReg.
	if (NeedsAlign) {
	// Align the overflow address
	Register TmpReg = MRI.createVirtualRegister(AddrRegClass);

	// aligned_addr = (addr + (align-1)) & ~(align-1)
	BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
	.addReg(OverflowAddrReg)
	.addImm(Alignment.value() - 1);

	BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
	.addReg(TmpReg)
	.addImm(~(uint64_t)(Alignment.value() - 1));
	} else {
	BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
	.addReg(OverflowAddrReg);
	}

	// Compute the next overflow address after this argument.
	// (the overflow address should be kept 8-byte aligned)
	Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
	.addReg(OverflowDestReg)
	.addImm(ArgSizeA8);

	// Store the new overflow address.
	BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 8)
	.add(Segment)
	.addReg(NextAddrReg)
	.setMemRefs(StoreOnlyMMO);

	// If we branched, emit the PHI to the front of endMBB. It merges the address
	// computed in offsetMBB with the one computed in overflowMBB into DestReg.
	if (offsetMBB) {
	BuildMI(*endMBB, endMBB->begin(), DL,
	TII->get(X86::PHI), DestReg)
	.addReg(OffsetDestReg).addMBB(offsetMBB)
	.addReg(OverflowDestReg).addMBB(overflowMBB);
	}

	// Erase the pseudo instruction
	MI.eraseFromParent();

	return endMBB;
	}

	/// Custom inserter for the VASTART_SAVE_XMM_REGS pseudo instruction.
	///
	/// Splits \p MBB after \p MI, places a block of XMM stores in between and,
	/// unless the function uses the Win64 calling convention, emits a test of the
	/// count register that branches around the stores when it is zero.
	///
	/// \param MI  The pseudo. Operand 0 is the count register, operand 1 the
	///            register-save frame index, operand 2 the FP offset within that
	///            area; operands from index 3 up to (but excluding) the last one
	///            are the XMM registers to save. \p MI is erased.
	/// \param MBB The block containing \p MI.
	/// \returns The block holding the instructions that followed \p MI.
	MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
	    MachineInstr &MI, MachineBasicBlock *MBB) const {
	  // Emit code to save XMM registers to the stack. The ABI says that the
	  // number of registers to save is given in %al, so it's theoretically
	  // possible to do an indirect jump trick to avoid saving all of them,
	  // however this code takes a simpler approach and just executes all
	  // of the stores if %al is non-zero. It's less code, and it's probably
	  // easier on the hardware branch predictor, and stores aren't all that
	  // expensive anyway.

	  // Create the new basic blocks. One block contains all the XMM stores,
	  // and one block is the final destination regardless of whether any
	  // stores were performed.
	  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	  MachineFunction *F = MBB->getParent();
	  MachineFunction::iterator MBBIter = ++MBB->getIterator();
	  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  F->insert(MBBIter, XMMSaveMBB);
	  F->insert(MBBIter, EndMBB);

	  // Transfer the remainder of MBB and its successor edges to EndMBB.
	  EndMBB->splice(EndMBB->begin(), MBB,
	                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // The original block will now fall through to the XMM save block.
	  MBB->addSuccessor(XMMSaveMBB);
	  // The XMMSaveMBB will fall through to the end block.
	  XMMSaveMBB->addSuccessor(EndMBB);

	  // Now add the instructions.
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  Register CountReg = MI.getOperand(0).getReg();
	  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
	  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();

	  if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
	    // If %al is 0, branch around the XMM save block.
	    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
	    BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
	    MBB->addSuccessor(EndMBB);
	  }

	  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
	  // that was just emitted, but clearly shouldn't be "saved".
	  assert((MI.getNumOperands() <= 3 ||
	          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
	          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
	         "Expected last argument to be EFLAGS");
	  unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
	  // In the XMM save block, save all the XMM argument registers. The bound is
	  // tested with '<' so that an instruction carrying no XMM operands (which
	  // the assert above tolerates) yields an empty loop instead of running away.
	  for (int i = 3, e = MI.getNumOperands() - 1; i < e; ++i) {
	    // Each register gets its own 16-byte slot after VarArgsFPOffset.
	    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
	    MachineMemOperand *MMO = F->getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
	        MachineMemOperand::MOStore,
	        /*Size=*/16, Align(16));
	    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
	        .addFrameIndex(RegSaveFrameIndex)
	        .addImm(/*Scale=*/1)
	        .addReg(/*IndexReg=*/0)
	        .addImm(/*Disp=*/Offset)
	        .addReg(/*Segment=*/0)
	        .addReg(MI.getOperand(i).getReg())
	        .addMemOperand(MMO);
	  }

	  MI.eraseFromParent(); // The pseudo instruction is gone now.

	  return EndMBB;
	}

	// The EFLAGS operand of SelectItr might be missing a kill marker because
	// there were multiple uses of EFLAGS and ISel didn't know which one to mark.
	// Work out whether SelectItr should have carried a kill marker and add it if
	// so. Returns the correct kill marker value: true when EFLAGS is killed at
	// SelectItr, false when it is still live afterwards.
	static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
	                                     MachineBasicBlock* BB,
	                                     const TargetRegisterInfo* TRI) {
	  const bool KilledHere = !isEFLAGSLiveAfter(SelectItr, BB);

	  // Either a def follows, or the end of the basic block was reached and
	  // EFLAGS wasn't live out: SelectMI should have a kill flag on EFLAGS.
	  if (KilledHere)
	    SelectItr->addRegisterKilled(X86::EFLAGS, TRI);

	  return KilledHere;
	}

	// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
	// together with other CMOV pseudo-opcodes into a single basic-block with
	// conditional jump around it.
	static bool isCMOVPseudo(MachineInstr &MI) {
	  switch (MI.getOpcode()) {
	  default:
	    return false;

	  // CMOV_GR* pseudos.
	  case X86::CMOV_GR8:
	  case X86::CMOV_GR16:
	  case X86::CMOV_GR32:
	  // CMOV_FR* pseudos.
	  case X86::CMOV_FR32:
	  case X86::CMOV_FR32X:
	  case X86::CMOV_FR64:
	  case X86::CMOV_FR64X:
	  // CMOV_RFP* pseudos.
	  case X86::CMOV_RFP32:
	  case X86::CMOV_RFP64:
	  case X86::CMOV_RFP80:
	  // CMOV_VR* pseudos.
	  case X86::CMOV_VR64:
	  case X86::CMOV_VR128:
	  case X86::CMOV_VR128X:
	  case X86::CMOV_VR256:
	  case X86::CMOV_VR256X:
	  case X86::CMOV_VR512:
	  // CMOV_VK* pseudos.
	  case X86::CMOV_VK1:
	  case X86::CMOV_VK2:
	  case X86::CMOV_VK4:
	  case X86::CMOV_VK8:
	  case X86::CMOV_VK16:
	  case X86::CMOV_VK32:
	  case X86::CMOV_VK64:
	    return true;
	  }
	}

	// Helper function, which inserts PHI functions into SinkMBB:
	//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
	// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
	// in [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for the
	// last PHI function inserted (a default-constructed builder if the range is
	// empty).
	static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
	    MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
	    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
	    MachineBasicBlock *SinkMBB) {
	  MachineFunction *MF = TrueMBB->getParent();
	  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
	  DebugLoc DL = MIItBegin->getDebugLoc();

	  // The condition of the first CMOV is the reference condition; a CMOV using
	  // the opposite condition gets its operands swapped below.
	  X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
	  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);

	  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

	  // As we are creating the PHIs, we have to be careful if there is more than
	  // one. Later CMOVs may reference the results of earlier CMOVs, but later
	  // PHIs have to reference the individual true/false inputs from earlier PHIs.
	  // That also means that PHI construction must work forward from earlier to
	  // later, and that the code must maintain a mapping from earlier PHI's
	  // destination registers, and the registers that went into the PHI.
	  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
	  MachineInstrBuilder MIB;

	  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
	    Register DestReg = MIIt->getOperand(0).getReg();
	    Register Op1Reg = MIIt->getOperand(1).getReg();
	    Register Op2Reg = MIIt->getOperand(2).getReg();

	    // If this CMOV we are generating is the opposite condition from
	    // the jump we generated, then we have to swap the operands for the
	    // PHI that is going to be generated.
	    if (MIIt->getOperand(3).getImm() == OppCC)
	      std::swap(Op1Reg, Op2Reg);

	    // An operand produced by an earlier PHI is replaced by that PHI's own
	    // input for the matching edge (first = FalseMBB side, second = TrueMBB
	    // side). A single find() serves as both the test and the lookup.
	    auto Op1Entry = RegRewriteTable.find(Op1Reg);
	    if (Op1Entry != RegRewriteTable.end())
	      Op1Reg = Op1Entry->second.first;

	    auto Op2Entry = RegRewriteTable.find(Op2Reg);
	    if (Op2Entry != RegRewriteTable.end())
	      Op2Reg = Op2Entry->second.second;

	    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
	              .addReg(Op1Reg)
	              .addMBB(FalseMBB)
	              .addReg(Op2Reg)
	              .addMBB(TrueMBB);

	    // Add this PHI to the rewrite table.
	    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
	  }

	  return MIB;
	}

	// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
	MachineInstr &SecondCascadedCMOV,
	MachineBasicBlock *ThisMBB) const {
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	DebugLoc DL = FirstCMOV.getDebugLoc();

	// We lower cascaded CMOVs such as
	//
	// (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
	//
	// to two successive branches.
	//
	// Without this, we would add a PHI between the two jumps, which ends up
	// creating a few copies all around. For instance, for
	//
	// (sitofp (zext (fcmp une)))
	//
	// we would generate:
	//
	// ucomiss %xmm1, %xmm0
	// movss <1.0f>, %xmm0
	// movaps %xmm0, %xmm1
	// jne .LBB5_2
	// xorps %xmm1, %xmm1
	// .LBB5_2:
	// jp .LBB5_4
	// movaps %xmm1, %xmm0
	// .LBB5_4:
	// retq
	//
	// because this custom-inserter would have generated:
	//
	// A
	// \| \
	// \| B
	// \| /
	// C
	// \| \
	// \| D
	// \| /
	// E
	//
	// A: X = ...; Y = ...
	// B: empty
	// C: Z = PHI [X, A], [Y, B]
	// D: empty
	// E: PHI [X, C], [Z, D]
	//
	// If we lower both CMOVs in a single step, we can instead generate:
	//
	// A
	// \| \
	// \| C
	// \| /\|
	// \|/ \|
	// \| \|
	// \| D
	// \| /
	// E
	//
	// A: X = ...; Y = ...
	// D: empty
	// E: PHI [X, A], [X, C], [Y, D]
	//
	// Which, in our sitofp/fcmp example, gives us something like:
	//
	// ucomiss %xmm1, %xmm0
	// movss <1.0f>, %xmm0
	// jne .LBB5_4
	// jp .LBB5_4
	// xorps %xmm0, %xmm0
	// .LBB5_4:
	// retq
	//

	// We lower cascaded CMOV into two successive branches to the same block.
	// EFLAGS is used by both, so mark it as live in the second.
	const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
	MachineFunction *F = ThisMBB->getParent();
	MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

	MachineFunction::iterator It = ++ThisMBB->getIterator();
	F->insert(It, FirstInsertedMBB);
	F->insert(It, SecondInsertedMBB);
	F->insert(It, SinkMBB);

	// For a cascaded CMOV, we lower it to two successive branches to
	// the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
	// the FirstInsertedMBB.
	FirstInsertedMBB->addLiveIn(X86::EFLAGS);

	// If the EFLAGS register isn't dead in the terminator, then claim that it's
	// live into the sink and copy blocks.
	const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
	!checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
	SecondInsertedMBB->addLiveIn(X86::EFLAGS);
	SinkMBB->addLiveIn(X86::EFLAGS);
	}

	// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
	SinkMBB->splice(SinkMBB->begin(), ThisMBB,
	std::next(MachineBasicBlock::iterator(FirstCMOV)),
	ThisMBB->end());
	SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

	// Fallthrough block for ThisMBB.
	ThisMBB->addSuccessor(FirstInsertedMBB);
	// The true block target of the first branch is always SinkMBB.
	ThisMBB->addSuccessor(SinkMBB);
	// Fallthrough block for FirstInsertedMBB.
	FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
	// The true block for the branch of FirstInsertedMBB.
	FirstInsertedMBB->addSuccessor(SinkMBB);
	// This is fallthrough.
	SecondInsertedMBB->addSuccessor(SinkMBB);

	// Create the conditional branch instructions.
	X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
	BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);

	X86::CondCode SecondCC =
	X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
	BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);

	// SinkMBB:
	// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
	Register DestReg = FirstCMOV.getOperand(0).getReg();
	Register Op1Reg = FirstCMOV.getOperand(1).getReg();
	Register Op2Reg = FirstCMOV.getOperand(2).getReg();
	MachineInstrBuilder MIB =
	BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
	.addReg(Op1Reg)
	.addMBB(SecondInsertedMBB)
	.addReg(Op2Reg)
	.addMBB(ThisMBB);

	// The second SecondInsertedMBB provides the same incoming value as the
	// FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
	MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
	// Copy the PHI result to the register defined by the second CMOV.
	BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
	TII->get(TargetOpcode::COPY),
	SecondCascadedCMOV.getOperand(0).getReg())
	.addReg(FirstCMOV.getOperand(0).getReg());

	// Now remove the CMOVs.
	FirstCMOV.eraseFromParent();
	SecondCascadedCMOV.eraseFromParent();

	return SinkMBB;
	}

	/// Expand a pseudo-CMOV (SELECT_CC) instruction into explicit control flow.
	///
	/// A run of consecutive pseudo-CMOVs that test the same (or the exactly
	/// opposite) condition is lowered together using one conditional branch, one
	/// inserted fall-through block and one PHI per CMOV in the join block. A
	/// cascaded pair of CMOVs is forwarded to EmitLoweredCascadedSelect instead.
	///
	/// \param MI       The first pseudo-CMOV to lower. It, and every CMOV lowered
	///                 together with it, is erased from \p ThisMBB.
	/// \param ThisMBB  The block containing \p MI. On return it ends with the
	///                 newly created conditional branch.
	/// \return The join block that received the PHIs and the instructions that
	///         followed the last lowered CMOV (or the result of
	///         EmitLoweredCascadedSelect for the cascaded case).
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
	                                     MachineBasicBlock *ThisMBB) const {
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  // To "insert" a SELECT_CC instruction, we actually have to insert the
	  // diamond control-flow pattern. The incoming instruction knows the
	  // destination vreg to set, the condition code register to branch on, the
	  // true/false values to select between and a branch opcode to use.

	  //  ThisMBB:
	  //  ...
	  //   TrueVal = ...
	  //   cmpTY ccX, r1, r2
	  //   bCC copy1MBB
	  //   fallthrough --> FalseMBB

	  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
	  // as described above, by inserting a BB, and then making a PHI at the join
	  // point to select the true and false operands of the CMOV in the PHI.
	  //
	  // The code also handles two different cases of multiple CMOV opcodes
	  // in a row.
	  //
	  // Case 1:
	  // In this case, there are multiple CMOVs in a row, all which are based on
	  // the same condition setting (or the exact opposite condition setting).
	  // In this case we can lower all the CMOVs using a single inserted BB, and
	  // then make a number of PHIs at the join point to model the CMOVs. The only
	  // trickiness here, is that in a case like:
	  //
	  // t2 = CMOV cond1 t1, f1
	  // t3 = CMOV cond1 t2, f2
	  //
	  // when rewriting this into PHIs, we have to perform some renaming on the
	  // temps since you cannot have a PHI operand refer to a PHI result earlier
	  // in the same block. The "simple" but wrong lowering would be:
	  //
	  // t2 = PHI t1(BB1), f1(BB2)
	  // t3 = PHI t2(BB1), f2(BB2)
	  //
	  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
	  // renaming is to note that on the path through BB1, t2 is really just a
	  // copy of t1, and do that renaming, properly generating:
	  //
	  // t2 = PHI t1(BB1), f1(BB2)
	  // t3 = PHI t1(BB1), f2(BB2)
	  //
	  // Case 2:
	  // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
	  // function - EmitLoweredCascadedSelect.

	  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
	  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
	  MachineInstr *LastCMOV = &MI;
	  MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);

	  // Check for case 1, where there are multiple CMOVs with the same condition
	  // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
	  // number of jumps the most.

	  if (isCMOVPseudo(MI)) {
	    // See if we have a string of CMOVS with the same condition. Skip over
	    // intervening debug insts. The scan starts at MI itself, so on exit
	    // NextMIIt points at the first instruction that is not part of the run.
	    while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
	           (NextMIIt->getOperand(3).getImm() == CC ||
	            NextMIIt->getOperand(3).getImm() == OppCC)) {
	      LastCMOV = &*NextMIIt;
	      NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
	    }
	  }

	  // This checks for case 2, but only do this if we didn't already find
	  // case 1, as indicated by LastCMOV == MI.
	  if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
	      NextMIIt->getOpcode() == MI.getOpcode() &&
	      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
	      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
	      NextMIIt->getOperand(1).isKill()) {
	    return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
	  }

	  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
	  MachineFunction *F = ThisMBB->getParent();
	  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

	  // Place the two new blocks immediately after ThisMBB, in fall-through order.
	  MachineFunction::iterator It = ++ThisMBB->getIterator();
	  F->insert(It, FalseMBB);
	  F->insert(It, SinkMBB);

	  // If the EFLAGS register isn't dead in the terminator, then claim that it's
	  // live into the sink and copy blocks.
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  if (!LastCMOV->killsRegister(X86::EFLAGS) &&
	      !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
	    FalseMBB->addLiveIn(X86::EFLAGS);
	    SinkMBB->addLiveIn(X86::EFLAGS);
	  }

	  // Transfer any debug instructions inside the CMOV sequence to the sunk block.
	  auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
	  auto DbgIt = MachineBasicBlock::iterator(MI);
	  while (DbgIt != DbgEnd) {
	    // Grab the successor first: removeFromParent() unlinks DbgIt.
	    auto Next = std::next(DbgIt);
	    if (DbgIt->isDebugInstr())
	      SinkMBB->push_back(DbgIt->removeFromParent());
	    DbgIt = Next;
	  }

	  // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
	  SinkMBB->splice(SinkMBB->end(), ThisMBB,
	                  std::next(MachineBasicBlock::iterator(LastCMOV)),
	                  ThisMBB->end());
	  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

	  // Fallthrough block for ThisMBB.
	  ThisMBB->addSuccessor(FalseMBB);
	  // The true block target of the first (or only) branch is always a SinkMBB.
	  ThisMBB->addSuccessor(SinkMBB);
	  // Fallthrough block for FalseMBB.
	  FalseMBB->addSuccessor(SinkMBB);

	  // Create the conditional branch instruction.
	  BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);

	  //  SinkMBB:
	  //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
	  //  ...
	  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
	  MachineBasicBlock::iterator MIItEnd =
	      std::next(MachineBasicBlock::iterator(LastCMOV));
	  createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);

	  // Now remove the CMOV(s).
	  ThisMBB->erase(MIItBegin, MIItEnd);

	  return SinkMBB;
	}

	/// Pick the SUB-register-by-immediate opcode for the requested operand width.
	///
	/// \param IsLP64 Selects the 64-bit opcode family when true, the 32-bit one
	///               otherwise.
	/// \param Imm    The immediate to subtract; the 8-bit-immediate form is chosen
	///               when it fits in a signed 8-bit value.
	static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
	  const bool FitsInImm8 = isInt<8>(Imm);
	  if (IsLP64)
	    return FitsInImm8 ? X86::SUB64ri8 : X86::SUB64ri32;
	  return FitsInImm8 ? X86::SUB32ri8 : X86::SUB32ri;
	}

	/// Expand a probed dynamic stack allocation pseudo into an explicit probing
	/// loop.
	///
	/// The generated control flow is:
	///
	///   MBB:      Final = SP - size
	///   testMBB:  if (Final >= SP) goto tailMBB
	///   blockMBB: touch [SP]; SP -= ProbeSize; goto testMBB
	///   tailMBB:  result = Final; <rest of original MBB>
	///
	/// \param MI   The pseudo instruction; operand 0 is the result register and
	///             operand 1 the register holding the allocation size. It is
	///             erased.
	/// \param MBB  The block containing \p MI.
	/// \return The tail block that received the instructions following \p MI.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
	                                           MachineBasicBlock *MBB) const {
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
	  DebugLoc DL = MI.getDebugLoc();
	  const BasicBlock *LLVM_BB = MBB->getBasicBlock();

	  const unsigned ProbeSize = getStackProbeSize(*MF);

	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);

	  // Layout order after MBB: testMBB, blockMBB, tailMBB.
	  MachineFunction::iterator MBBIter = ++MBB->getIterator();
	  MF->insert(MBBIter, testMBB);
	  MF->insert(MBBIter, blockMBB);
	  MF->insert(MBBIter, tailMBB);

	  Register sizeVReg = MI.getOperand(1).getReg();

	  Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;

	  Register TmpStackPtr = MRI.createVirtualRegister(
	      TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
	  Register FinalStackPtr = MRI.createVirtualRegister(
	      TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);

	  // FinalStackPtr = SP - size, computed in front of the pseudo.
	  BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
	      .addReg(physSPReg);
	  {
	    const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
	    BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
	        .addReg(TmpStackPtr)
	        .addReg(sizeVReg);
	  }

	  // test rsp size

	  BuildMI(testMBB, DL,
	          TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
	      .addReg(FinalStackPtr)
	      .addReg(physSPReg);

	  // The compare computes FinalStackPtr - SP. Probing is finished once SP has
	  // been moved down to (or below) FinalStackPtr, i.e. when
	  // FinalStackPtr >= SP. Branching on COND_L instead would leave the loop
	  // immediately for every non-empty allocation, skipping all probes.
	  BuildMI(testMBB, DL, TII->get(X86::JCC_1))
	      .addMBB(tailMBB)
	      .addImm(X86::COND_GE);
	  testMBB->addSuccessor(blockMBB);
	  testMBB->addSuccessor(tailMBB);

	  // Touch the block then extend it. This is done on the opposite side of
	  // static probe where we allocate then touch, to avoid the need of probing the
	  // tail of the static alloca. Possible scenarios are:
	  //
	  //       + ---- <- ------------ <- ------------- <- ------------ +
	  //       |                                                       |
	  // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
	  //                                                               |                                                               |
	  //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
	  //
	  // The property we want to enforce is to never have more than [page alloc] between two probes.

	  // The first touch hits the current top of the stack, which may hold live
	  // data, so the probe must not modify memory: XOR with 0 touches the slot
	  // while leaving its contents intact (a MOV of 0 would clobber it).
	  const unsigned XORMIOpc =
	      TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
	  addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
	      .addImm(0);

	  BuildMI(blockMBB, DL,
	          TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
	      .addReg(physSPReg)
	      .addImm(ProbeSize);

	  BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
	  blockMBB->addSuccessor(testMBB);

	  // Replace original instruction by the expected stack ptr
	  BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
	      .addReg(FinalStackPtr);

	  // Everything after the pseudo, and MBB's outgoing edges, move to tailMBB.
	  tailMBB->splice(tailMBB->end(), MBB,
	                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
	  MBB->addSuccessor(testMBB);

	  // Delete the original pseudo instruction.
	  MI.eraseFromParent();

	  // And we're done.
	  return tailMBB;
	}

	/// Expand a segmented-stack dynamic allocation pseudo.
	///
	/// The allocation is served by bumping the stack pointer when the current
	/// stacklet has room, and by calling __morestack_allocate_stack_space
	/// otherwise. The two results are merged with a PHI in the continuation block.
	///
	/// \param MI  The pseudo instruction; operand 0 is the result register and
	///            operand 1 the register holding the requested size. It is erased.
	/// \param BB  The block containing \p MI.
	/// \return The continuation block holding the instructions that followed
	///         \p MI.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
	                                        MachineBasicBlock *BB) const {
	  MachineFunction *MF = BB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();
	  const BasicBlock *LLVM_BB = BB->getBasicBlock();

	  assert(MF->shouldSplitStack());

	  const bool Is64Bit = Subtarget.is64Bit();
	  const bool IsLP64 = Subtarget.isTarget64BitLP64();

	  // Segment register and offset of the stack limit that SP is compared
	  // against below.
	  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
	  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;

	  // BB:
	  //  ... [Till the alloca]
	  // If stacklet is not large enough, jump to mallocMBB
	  //
	  // bumpMBB:
	  //  Allocate by subtracting from RSP
	  //  Jump to continueMBB
	  //
	  // mallocMBB:
	  //  Allocate by call to runtime
	  //
	  // continueMBB:
	  //  ...
	  //  [rest of original BB]
	  //

	  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);

	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  const TargetRegisterClass *AddrRegClass =
	      getRegClassFor(getPointerTy(MF->getDataLayout()));

	  Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass);
	  Register bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass);
	  Register tmpSPVReg = MRI.createVirtualRegister(AddrRegClass);
	  Register SPLimitVReg = MRI.createVirtualRegister(AddrRegClass);
	  Register sizeVReg = MI.getOperand(1).getReg();
	  Register physSPReg =
	      (IsLP64 || Subtarget.isTargetNaCl64()) ? X86::RSP : X86::ESP;

	  MachineFunction::iterator MBBIter = ++BB->getIterator();

	  MF->insert(MBBIter, bumpMBB);
	  MF->insert(MBBIter, mallocMBB);
	  MF->insert(MBBIter, continueMBB);

	  // Everything after the pseudo, and BB's outgoing edges, move to continueMBB.
	  continueMBB->splice(continueMBB->begin(), BB,
	                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
	  continueMBB->transferSuccessorsAndUpdatePHIs(BB);

	  // Add code to the main basic block to check if the stack limit has been hit,
	  // and if so, jump to mallocMBB otherwise to bumpMBB.
	  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
	  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr : X86::SUB32rr), SPLimitVReg)
	      .addReg(tmpSPVReg)
	      .addReg(sizeVReg);
	  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr : X86::CMP32mr))
	      .addReg(0)
	      .addImm(1)
	      .addReg(0)
	      .addImm(TlsOffset)
	      .addReg(TlsReg)
	      .addReg(SPLimitVReg);
	  BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);

	  // bumpMBB simply decreases the stack pointer, since we know the current
	  // stacklet has enough space.
	  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
	      .addReg(SPLimitVReg);
	  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
	      .addReg(SPLimitVReg);
	  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

	  // Calls into a routine in libgcc to allocate more space from the heap.
	  const uint32_t *RegMask =
	      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
	  if (IsLP64) {
	    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
	        .addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
	        .addExternalSymbol("__morestack_allocate_stack_space")
	        .addRegMask(RegMask)
	        .addReg(X86::RDI, RegState::Implicit)
	        .addReg(X86::RAX, RegState::ImplicitDefine);
	  } else if (Is64Bit) {
	    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
	        .addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
	        .addExternalSymbol("__morestack_allocate_stack_space")
	        .addRegMask(RegMask)
	        .addReg(X86::EDI, RegState::Implicit)
	        .addReg(X86::EAX, RegState::ImplicitDefine);
	  } else {
	    // 32-bit: the size is pushed on the stack. SP is lowered by 12 first and
	    // raised by 16 after the call (12 + the 4-byte push).
	    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg)
	        .addReg(physSPReg)
	        .addImm(12);
	    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
	        .addExternalSymbol("__morestack_allocate_stack_space")
	        .addRegMask(RegMask)
	        .addReg(X86::EAX, RegState::ImplicitDefine);
	  }

	  if (!Is64Bit)
	    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg)
	        .addReg(physSPReg)
	        .addImm(16);

	  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
	      .addReg(IsLP64 ? X86::RAX : X86::EAX);
	  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

	  // Set up the CFG correctly.
	  BB->addSuccessor(bumpMBB);
	  BB->addSuccessor(mallocMBB);
	  mallocMBB->addSuccessor(continueMBB);
	  bumpMBB->addSuccessor(continueMBB);

	  // Take care of the PHI nodes.
	  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
	          MI.getOperand(0).getReg())
	      .addReg(mallocPtrVReg)
	      .addMBB(mallocMBB)
	      .addReg(bumpSPPtrVReg)
	      .addMBB(bumpMBB);

	  // Delete the original pseudo instruction.
	  MI.eraseFromParent();

	  // And we're done.
	  return continueMBB;
	}

	/// Lower a catchret pseudo. On 32-bit targets a dedicated block is inserted
	/// between \p BB and the catchret destination; on every other target \p BB is
	/// returned untouched.
	///
	/// \param MI  The catchret instruction; operand 0 is its destination block.
	/// \param BB  The block containing \p MI (expected to have one successor).
	/// \return Always \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  MachineFunction *Func = BB->getParent();

	  assert(!isAsynchronousEHPersonality(
	             classifyEHPersonality(Func->getFunction().getPersonalityFn())) &&
	         "SEH does not use catchret!");

	  // Only 32-bit EH needs to worry about manually restoring stack pointers.
	  if (!Subtarget.is32Bit())
	    return BB;

	  const TargetInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
	  DebugLoc Loc = MI.getDebugLoc();
	  // Remember the real destination before the operand is redirected below.
	  MachineBasicBlock *ReturnDestMBB = MI.getOperand(0).getMBB();

	  // C++ EH creates a new target block to hold the restore code, and wires up
	  // the new block to the return destination with a normal JMP_4.
	  MachineBasicBlock *RestoreMBB =
	      Func->CreateMachineBasicBlock(BB->getBasicBlock());
	  assert(BB->succ_size() == 1);
	  Func->insert(std::next(BB->getIterator()), RestoreMBB);
	  RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
	  BB->addSuccessor(RestoreMBB);
	  MI.getOperand(0).setMBB(RestoreMBB);

	  // Marking this as an EH pad but not a funclet entry block causes PEI to
	  // restore stack pointers in the block.
	  RestoreMBB->setIsEHPad(true);

	  BuildMI(*RestoreMBB, RestoreMBB->begin(), Loc, InstrInfo.get(X86::JMP_4))
	      .addMBB(ReturnDestMBB);
	  return BB;
	}

	/// Bracket a TLSADDR pseudo with call-frame setup/destroy markers.
	///
	/// \param MI  The TLSADDR instruction; it is kept in place.
	/// \param BB  The block containing \p MI.
	/// \return Always \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
	                                      MachineBasicBlock *BB) const {
	  // TLSADDR is rewritten into the sequence
	  //   adjust_stackdown -> TLSADDR -> adjust_stackup.
	  // TLSADDR is lowered into calls inside MC, so without the two markers
	  // shrink-wrapping may push the prologue/epilogue past them.
	  MachineFunction &Func = *BB->getParent();
	  const TargetInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
	  DebugLoc Loc = MI.getDebugLoc();

	  // CALLSEQ_START goes right before the instruction.
	  unsigned FrameSetupOpc = InstrInfo.getCallFrameSetupOpcode();
	  MachineInstrBuilder FrameSetup =
	      BuildMI(Func, Loc, InstrInfo.get(FrameSetupOpc))
	          .addImm(0)
	          .addImm(0)
	          .addImm(0);
	  BB->insert(MachineBasicBlock::iterator(MI), FrameSetup);

	  // CALLSEQ_END goes right after the instruction. The original instruction
	  // is deliberately not erased: it must stay around.
	  unsigned FrameDestroyOpc = InstrInfo.getCallFrameDestroyOpcode();
	  MachineInstrBuilder FrameDestroy =
	      BuildMI(Func, Loc, InstrInfo.get(FrameDestroyOpc)).addImm(0).addImm(0);
	  BB->insertAfter(MachineBasicBlock::iterator(MI), FrameDestroy);

	  return BB;
	}

	/// Lower a Darwin TLS call pseudo into a load of the TLS descriptor followed
	/// by an indirect call through the loaded register.
	///
	/// \param MI  The pseudo instruction; operand 3 is the global being accessed.
	///            It is erased.
	/// \param BB  The block containing \p MI.
	/// \return Always \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
	                                      MachineBasicBlock *BB) const {
	  // This is pretty easy. We're taking the value that we received from
	  // our load from the relocation, sticking it in either RDI (x86-64)
	  // or EAX and doing an indirect call. The return value will then
	  // be in the normal return register.
	  MachineFunction *F = BB->getParent();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
	  assert(MI.getOperand(3).isGlobal() && "This should be a global");

	  const bool Is64Bit = Subtarget.is64Bit();

	  // Get a register mask for the lowered call.
	  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
	  // proper register mask.
	  const uint32_t *RegMask;
	  if (Is64Bit)
	    RegMask = Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask();
	  else
	    RegMask =
	        Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);

	  // The three flavours (64-bit, 32-bit non-PIC, 32-bit PIC) only differ in
	  // the opcodes and registers involved, so pick those first.
	  unsigned LoadOpc;
	  unsigned CallOpc;
	  unsigned PtrReg;  // Receives the loaded pointer; the call goes through it.
	  unsigned RetReg;  // Implicitly defined by the call.
	  unsigned BaseReg; // Base register of the load's address.
	  if (Is64Bit) {
	    LoadOpc = X86::MOV64rm;
	    CallOpc = X86::CALL64m;
	    PtrReg = X86::RDI;
	    RetReg = X86::RAX;
	    BaseReg = X86::RIP;
	  } else {
	    LoadOpc = X86::MOV32rm;
	    CallOpc = X86::CALL32m;
	    PtrReg = X86::EAX;
	    RetReg = X86::EAX;
	    if (!isPositionIndependent())
	      BaseReg = 0;
	    else
	      BaseReg = TII->getGlobalBaseReg(F);
	  }

	  // Load the descriptor address.
	  BuildMI(*BB, MI, DL, TII->get(LoadOpc), PtrReg)
	      .addReg(BaseReg)
	      .addImm(0)
	      .addReg(0)
	      .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
	                        MI.getOperand(3).getTargetFlags())
	      .addReg(0);

	  // Call through it.
	  MachineInstrBuilder Call = BuildMI(*BB, MI, DL, TII->get(CallOpc));
	  addDirectMem(Call, PtrReg);
	  Call.addReg(RetReg, RegState::ImplicitDefine).addRegMask(RegMask);

	  MI.eraseFromParent(); // The pseudo instruction is gone now.
	  return BB;
	}

	/// Map an INDIRECT_THUNK_* pseudo opcode to the direct call / tail-call opcode
	/// that replaces it. Any other opcode is a programming error.
	static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
	  if (RPOpc == X86::INDIRECT_THUNK_CALL32)
	    return X86::CALLpcrel32;
	  if (RPOpc == X86::INDIRECT_THUNK_CALL64)
	    return X86::CALL64pcrel32;
	  if (RPOpc == X86::INDIRECT_THUNK_TCRETURN32)
	    return X86::TCRETURNdi;
	  if (RPOpc == X86::INDIRECT_THUNK_TCRETURN64)
	    return X86::TCRETURNdi64;
	  llvm_unreachable("not indirect thunk opcode");
	}

	/// Return the symbol name of the indirect-branch thunk to call for \p Reg.
	///
	/// The subtarget features are checked in this order: external retpoline
	/// thunk, internal retpoline thunk, LVI control-flow-integrity thunk. Calling
	/// this with none of them enabled, or with a register that has no thunk, is a
	/// programming error.
	///
	/// \param Subtarget  Supplies the thunk-related feature queries.
	/// \param Reg        The scratch register that holds the callee.
	static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
	                                          unsigned Reg) {
	  if (Subtarget.useRetpolineExternalThunk()) {
	    // When using an external thunk for retpolines, we pick names that match the
	    // names GCC happens to use as well. This helps simplify the implementation
	    // of the thunks for kernels where they have no easy ability to create
	    // aliases and are doing non-trivial configuration of the thunk's body. For
	    // example, the Linux kernel will do boot-time hot patching of the thunk
	    // bodies and cannot easily export aliases of these to loaded modules.
	    //
	    // Note that at any point in the future, we may need to change the semantics
	    // of how we implement retpolines and at that time will likely change the
	    // name of the called thunk. Essentially, there is no hard guarantee that
	    // LLVM will generate calls to specific thunks, we merely make a best-effort
	    // attempt to help out kernels and other systems where duplicating the
	    // thunks is costly.
	    switch (Reg) {
	    case X86::EAX:
	      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	      return "__x86_indirect_thunk_eax";
	    case X86::ECX:
	      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	      return "__x86_indirect_thunk_ecx";
	    case X86::EDX:
	      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	      return "__x86_indirect_thunk_edx";
	    case X86::EDI:
	      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	      return "__x86_indirect_thunk_edi";
	    case X86::R11:
	      assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
	      return "__x86_indirect_thunk_r11";
	    }
	    llvm_unreachable("unexpected reg for external indirect thunk");
	  }

	  if (Subtarget.useRetpolineIndirectCalls() ||
	      Subtarget.useRetpolineIndirectBranches()) {
	    // When targeting an internal COMDAT thunk use an LLVM-specific name.
	    switch (Reg) {
	    case X86::EAX:
	      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	      return "__llvm_retpoline_eax";
	    case X86::ECX:
	      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	      return "__llvm_retpoline_ecx";
	    case X86::EDX:
	      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	      return "__llvm_retpoline_edx";
	    case X86::EDI:
	      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	      return "__llvm_retpoline_edi";
	    case X86::R11:
	      assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
	      return "__llvm_retpoline_r11";
	    }
	    llvm_unreachable("unexpected reg for retpoline");
	  }

	  if (Subtarget.useLVIControlFlowIntegrity()) {
	    // Only an R11 thunk name exists here; Reg is not consulted.
	    assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
	    return "__llvm_lvi_thunk_r11";
	  }
	  llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
	}

	/// Rewrite an INDIRECT_THUNK_* pseudo into a direct call (or tail call) to the
	/// matching thunk symbol, passing the original callee in a scratch register.
	///
	/// \param MI  The pseudo instruction; operand 0 is the virtual register holding
	///            the callee. The instruction is mutated in place, not erased.
	/// \param BB  The block containing \p MI.
	/// \return Always \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
	                                            MachineBasicBlock *BB) const {
	  // Copy the virtual register into the R11 physical register and
	  // call the retpoline thunk.
	  DebugLoc DL = MI.getDebugLoc();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  Register CalleeVReg = MI.getOperand(0).getReg();
	  unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());

	  // Find an available scratch register to hold the callee. On 64-bit, we can
	  // just use R11, but we scan for uses anyway to ensure we don't generate
	  // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
	  // already a register use operand to the call to hold the callee. If none
	  // are available, use EDI instead. EDI is chosen because EBX is the PIC base
	  // register and ESI is the base pointer to realigned stack frames with VLAs.
	  //
	  // Inline capacity of 4 covers the largest candidate list (the 32-bit one)
	  // without spilling to the heap.
	  SmallVector<unsigned, 4> AvailableRegs;
	  if (Subtarget.is64Bit())
	    AvailableRegs.push_back(X86::R11);
	  else
	    AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});

	  // Zero out any registers that are already used.
	  for (const auto &MO : MI.operands()) {
	    if (MO.isReg() && MO.isUse())
	      for (unsigned &Reg : AvailableRegs)
	        if (Reg == MO.getReg())
	          Reg = 0;
	  }

	  // Choose the first remaining non-zero available register.
	  unsigned AvailableReg = 0;
	  for (unsigned MaybeReg : AvailableRegs) {
	    if (MaybeReg) {
	      AvailableReg = MaybeReg;
	      break;
	    }
	  }
	  if (!AvailableReg)
	    report_fatal_error("calling convention incompatible with retpoline, no "
	                       "available registers");

	  const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);

	  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
	      .addReg(CalleeVReg);
	  // Turn the pseudo into a direct call to the thunk symbol and record that it
	  // implicitly reads (and kills) the scratch register.
	  MI.getOperand(0).ChangeToES(Symbol);
	  MI.setDesc(TII->get(Opc));
	  MachineInstrBuilder(*BB->getParent(), &MI)
	      .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
	  return BB;
	}

	/// A SetJmp call implies a later change of control flow when the matching
	/// LongJmp runs.
	/// The long jump does not use the 'return' instruction; it fixes the stack and
	/// performs an indirect branch, using the registers that were saved in the
	/// jump buffer when SetJmp was called.
	/// When the shadow stack is enabled it has to be fixed as well, because some
	/// return addresses will be skipped.
	/// This function saves the SSP so that emitLongJmpShadowStackFix can fix it
	/// later.
	/// \sa emitLongJmpShadowStackFix
	/// \param [in] MI The temporary Machine Instruction for the builtin.
	/// \param [in] MBB The Machine Basic Block that will be modified.
	void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
	                                                 MachineBasicBlock *MBB) const {
	  MachineFunction *Func = MBB->getParent();
	  MachineRegisterInfo &RegInfo = Func->getRegInfo();
	  const TargetInstrInfo *InstrInfo = Subtarget.getInstrInfo();
	  DebugLoc Loc = MI.getDebugLoc();

	  // Memory Reference.
	  SmallVector<MachineMemOperand *, 2> MemRefs(MI.memoperands_begin(),
	                                              MI.memoperands_end());

	  MVT PtrVT = getPointerTy(Func->getDataLayout());
	  const bool IsPtr64 = (PtrVT == MVT::i64);
	  const TargetRegisterClass *PtrRC = getRegClassFor(PtrVT);

	  // Initialize a register with zero.
	  Register ZeroReg = RegInfo.createVirtualRegister(PtrRC);
	  BuildMI(*MBB, MI, Loc, InstrInfo->get(IsPtr64 ? X86::XOR64rr : X86::XOR32rr))
	      .addDef(ZeroReg)
	      .addReg(ZeroReg, RegState::Undef)
	      .addReg(ZeroReg, RegState::Undef);

	  // Read the current SSP Register value to the zeroed register.
	  Register SSPValReg = RegInfo.createVirtualRegister(PtrRC);
	  BuildMI(*MBB, MI, Loc, InstrInfo->get(IsPtr64 ? X86::RDSSPQ : X86::RDSSPD),
	          SSPValReg)
	      .addReg(ZeroReg);

	  // Write the SSP register value to offset 3 in input memory buffer.
	  MachineInstrBuilder Store =
	      BuildMI(*MBB, MI, Loc,
	              InstrInfo->get(IsPtr64 ? X86::MOV64mr : X86::MOV32mr));
	  const int64_t SSPOffset = 3 * PtrVT.getStoreSize();
	  // The address operands of MI start at operand index 1.
	  const unsigned MemOpndSlot = 1;
	  for (unsigned Idx = 0; Idx != X86::AddrNumOperands; ++Idx) {
	    const auto &AddrOp = MI.getOperand(MemOpndSlot + Idx);
	    // Copy the address verbatim except for the displacement, which is
	    // shifted to the SSP slot.
	    if (Idx != X86::AddrDisp)
	      Store.add(AddrOp);
	    else
	      Store.addDisp(AddrOp, SSPOffset);
	  }
	  Store.addReg(SSPValReg);
	  Store.setMemRefs(MemRefs);
	}

	/// Expand the SjLj setjmp pseudo into explicit control flow.
	///
	/// The address of a new restore block is stored into the jump buffer, and the
	/// result register is defined by a PHI: 0 on the normal path, 1 when control
	/// arrives through the restore block.
	///
	/// \param MI   The pseudo instruction; operand 0 is the i32 result register and
	///             the following operands form the address of the jump buffer. It
	///             is erased.
	/// \param MBB  The block containing \p MI.
	/// \return The sink block holding the PHI and the instructions that followed
	///         \p MI.
	MachineBasicBlock *
	X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
	                                    MachineBasicBlock *MBB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  const BasicBlock *BB = MBB->getBasicBlock();
	  MachineFunction::iterator I = ++MBB->getIterator();

	  // Memory Reference
	  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
	                                           MI.memoperands_end());

	  unsigned DstReg;
	  unsigned MemOpndSlot = 0;

	  unsigned CurOp = 0;

	  DstReg = MI.getOperand(CurOp++).getReg();
	  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
	  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
	  (void)TRI;
	  Register mainDstReg = MRI.createVirtualRegister(RC);
	  Register restoreDstReg = MRI.createVirtualRegister(RC);

	  // The jump buffer address operands follow the destination register.
	  MemOpndSlot = CurOp;

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
	         "Invalid Pointer Size!");

	  // For v = setjmp(buf), we generate
	  //
	  // thisMBB:
	  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
	  //  SjLjSetup restoreMBB
	  //
	  // mainMBB:
	  //  v_main = 0
	  //
	  // sinkMBB:
	  //  v = phi(main, restore)
	  //
	  // restoreMBB:
	  //  if base pointer being used, load it from frame
	  //  v_restore = 1

	  MachineBasicBlock *thisMBB = MBB;
	  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
	  MF->insert(I, mainMBB);
	  MF->insert(I, sinkMBB);
	  // restoreMBB goes at the very end of the function.
	  MF->push_back(restoreMBB);
	  restoreMBB->setHasAddressTaken();

	  MachineInstrBuilder MIB;

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), MBB,
	                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // thisMBB:
	  unsigned PtrStoreOpc = 0;
	  unsigned LabelReg = 0;
	  const int64_t LabelOffset = 1 * PVT.getStoreSize();
	  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
	                     !isPositionIndependent();

	  // Prepare IP either in reg or imm.
	  if (!UseImmLabel) {
	    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
	    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
	    LabelReg = MRI.createVirtualRegister(PtrRC);
	    if (Subtarget.is64Bit()) {
	      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
	                .addReg(X86::RIP)
	                .addImm(0)
	                .addReg(0)
	                .addMBB(restoreMBB)
	                .addReg(0);
	    } else {
	      // getGlobalBaseReg() is an X86InstrInfo member, so TII has to be
	      // downcast; XII must be a pointer for the -> access below.
	      const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
	      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
	                .addReg(XII->getGlobalBaseReg(MF))
	                .addImm(0)
	                .addReg(0)
	                .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
	                .addReg(0);
	    }
	  } else {
	    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
	  }
	  // Store IP
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
	    else
	      MIB.add(MI.getOperand(MemOpndSlot + i));
	  }
	  if (!UseImmLabel)
	    MIB.addReg(LabelReg);
	  else
	    MIB.addMBB(restoreMBB);
	  MIB.setMemRefs(MMOs);

	  // Also save the shadow stack pointer when the module carries the
	  // "cf-protection-return" flag.
	  if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
	    emitSetJmpShadowStackFix(MI, thisMBB);
	  }

	  // Setup
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
	            .addMBB(restoreMBB);

	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  MIB.addRegMask(RegInfo->getNoPreservedMask());
	  thisMBB->addSuccessor(mainMBB);
	  thisMBB->addSuccessor(restoreMBB);

	  // mainMBB:
	  //  EAX = 0
	  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
	  mainMBB->addSuccessor(sinkMBB);

	  // sinkMBB:
	  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
	          TII->get(X86::PHI), DstReg)
	      .addReg(mainDstReg).addMBB(mainMBB)
	      .addReg(restoreDstReg).addMBB(restoreMBB);

	  // restoreMBB:
	  if (RegInfo->hasBasePointer(*MF)) {
	    const bool Uses64BitFramePtr =
	        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
	    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
	    X86FI->setRestoreBasePointer(MF);
	    Register FramePtr = RegInfo->getFrameRegister(*MF);
	    Register BasePtr = RegInfo->getBaseRegister();
	    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
	    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
	                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
	        .setMIFlag(MachineInstr::FrameSetup);
	  }
	  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
	  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
	  restoreMBB->addSuccessor(sinkMBB);

	  MI.eraseFromParent();
	  return sinkMBB;
	}

	/// Re-synchronize the shadow stack with the shadow stack pointer (SSP) value
	/// that was previously saved in the jump buffer.
	/// \sa emitSetJmpShadowStackFix
	/// \param [in] MI The temporary Machine Instruction for the builtin.
	/// \param [in] MBB The Machine Basic Block that will be modified.
	/// \return The sink MBB that will perform the future indirect branch.
	MachineBasicBlock *
	X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
	                                             MachineBasicBlock *MBB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  // Memory operands of the pseudo; attached to the jump-buffer load below.
	  SmallVector<MachineMemOperand *, 2> MemRefs(MI.memoperands_begin(),
	                                              MI.memoperands_end());

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
	  const bool Is64Bit = (PVT == MVT::i64);

	  // Everything that depends on the pointer width is selected once, up front.
	  const unsigned RdsspOpc = Is64Bit ? X86::RDSSPQ : X86::RDSSPD;
	  const unsigned TestRROpc = Is64Bit ? X86::TEST64rr : X86::TEST32rr;
	  const unsigned PtrLoadOpc = Is64Bit ? X86::MOV64rm : X86::MOV32rm;
	  const unsigned SubRROpc = Is64Bit ? X86::SUB64rr : X86::SUB32rr;
	  const unsigned ShrRIOpc = Is64Bit ? X86::SHR64ri : X86::SHR32ri;
	  const unsigned IncsspOpc = Is64Bit ? X86::INCSSPQ : X86::INCSSPD;
	  const unsigned ShlR1Opc = Is64Bit ? X86::SHL64r1 : X86::SHL32r1;
	  const unsigned MovRIOpc = Is64Bit ? X86::MOV64ri32 : X86::MOV32ri;
	  const unsigned DecROpc = Is64Bit ? X86::DEC64r : X86::DEC32r;
	  // The saved SSP is read from the fourth pointer-sized slot of the buffer.
	  const int64_t SSPOffset = 3 * PVT.getStoreSize();
	  // incssp multiplies its argument by 4/8, so the byte delta is shifted right
	  // by 2/3 first.
	  const unsigned SlotShift = Is64Bit ? 3 : 2;

	  // Shape of the emitted code:
	  //
	  // CheckSspMBB:
	  //   xor vreg1, vreg1
	  //   rdssp vreg1
	  //   test vreg1, vreg1
	  //   je SinkMBB               # Shadow Stack is not supported
	  // FallMBB:
	  //   mov buf+24/12(%rip), vreg2
	  //   sub vreg1, vreg2
	  //   jbe SinkMBB              # Nothing to fix
	  // FixShadowMBB:
	  //   shr 3/2, vreg2
	  //   incssp vreg2             # consumes the lower 8 bits of the count
	  //   shr 8, vreg2
	  //   je SinkMBB
	  // LoopPrepareMBB:
	  //   shl vreg2
	  //   mov 128, vreg3
	  // LoopMBB:
	  //   incssp vreg3
	  //   dec vreg2
	  //   jne LoopMBB              # repeat until the shadow stack is fixed
	  // SinkMBB:

	  MachineFunction::iterator InsertPt = ++MBB->getIterator();
	  const BasicBlock *LLVMBB = MBB->getBasicBlock();

	  MachineBasicBlock *CheckSspMBB = MF->CreateMachineBasicBlock(LLVMBB);
	  MachineBasicBlock *FallMBB = MF->CreateMachineBasicBlock(LLVMBB);
	  MachineBasicBlock *FixShadowMBB = MF->CreateMachineBasicBlock(LLVMBB);
	  MachineBasicBlock *LoopPrepareMBB = MF->CreateMachineBasicBlock(LLVMBB);
	  MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVMBB);
	  MachineBasicBlock *SinkMBB = MF->CreateMachineBasicBlock(LLVMBB);
	  MF->insert(InsertPt, CheckSspMBB);
	  MF->insert(InsertPt, FallMBB);
	  MF->insert(InsertPt, FixShadowMBB);
	  MF->insert(InsertPt, LoopPrepareMBB);
	  MF->insert(InsertPt, LoopMBB);
	  MF->insert(InsertPt, SinkMBB);

	  // Everything from MI onwards, together with the outgoing edges of MBB, now
	  // belongs to SinkMBB.
	  SinkMBB->splice(SinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
	                  MBB->end());
	  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  MBB->addSuccessor(CheckSspMBB);

	  // --- CheckSspMBB -------------------------------------------------------
	  // Materialize a zero; for 64-bit pointers widen it to the pointer class.
	  Register ZeroReg = MRI.createVirtualRegister(&X86::GR32RegClass);
	  BuildMI(CheckSspMBB, DL, TII->get(X86::MOV32r0), ZeroReg);

	  if (Is64Bit) {
	    Register WideZeroReg = MRI.createVirtualRegister(PtrRC);
	    BuildMI(CheckSspMBB, DL, TII->get(X86::SUBREG_TO_REG), WideZeroReg)
	        .addImm(0)
	        .addReg(ZeroReg)
	        .addImm(X86::sub_32bit);
	    ZeroReg = WideZeroReg;
	  }

	  // Read the current SSP into a copy of the zeroed register.
	  Register CurSSPReg = MRI.createVirtualRegister(PtrRC);
	  BuildMI(CheckSspMBB, DL, TII->get(RdsspOpc), CurSSPReg).addReg(ZeroReg);

	  // A zero result sends control straight to the sink.
	  BuildMI(CheckSspMBB, DL, TII->get(TestRROpc))
	      .addReg(CurSSPReg)
	      .addReg(CurSSPReg);
	  BuildMI(CheckSspMBB, DL, TII->get(X86::JCC_1))
	      .addMBB(SinkMBB)
	      .addImm(X86::COND_E);
	  CheckSspMBB->addSuccessor(SinkMBB);
	  CheckSspMBB->addSuccessor(FallMBB);

	  // --- FallMBB -----------------------------------------------------------
	  // Reload the SSP value that was saved earlier.
	  Register SavedSSPReg = MRI.createVirtualRegister(PtrRC);
	  MachineInstrBuilder LoadSSP =
	      BuildMI(FallMBB, DL, TII->get(PtrLoadOpc), SavedSSPReg);
	  for (unsigned Idx = 0; Idx != X86::AddrNumOperands; ++Idx) {
	    const MachineOperand &AddrOp = MI.getOperand(Idx);
	    if (Idx == X86::AddrDisp) {
	      LoadSSP.addDisp(AddrOp, SSPOffset);
	      continue;
	    }
	    // Registers are re-added by number only: the kill flags of the original
	    // operand must not be preserved.
	    if (AddrOp.isReg())
	      LoadSSP.addReg(AddrOp.getReg());
	    else
	      LoadSSP.add(AddrOp);
	  }
	  LoadSSP.setMemRefs(MemRefs);

	  // Delta = saved SSP - current SSP.
	  Register DeltaReg = MRI.createVirtualRegister(PtrRC);
	  BuildMI(FallMBB, DL, TII->get(SubRROpc), DeltaReg)
	      .addReg(SavedSSPReg)
	      .addReg(CurSSPReg);

	  // Saved SSP <= current SSP: go to the sink.
	  BuildMI(FallMBB, DL, TII->get(X86::JCC_1))
	      .addMBB(SinkMBB)
	      .addImm(X86::COND_BE);
	  FallMBB->addSuccessor(SinkMBB);
	  FallMBB->addSuccessor(FixShadowMBB);

	  // --- FixShadowMBB ------------------------------------------------------
	  // Turn the byte delta into a slot count.
	  Register SlotCountReg = MRI.createVirtualRegister(PtrRC);
	  BuildMI(FixShadowMBB, DL, TII->get(ShrRIOpc), SlotCountReg)
	      .addReg(DeltaReg)
	      .addImm(SlotShift);

	  // First incssp: only the lower 8 bits of the count are taken into account.
	  BuildMI(FixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SlotCountReg);

	  // Drop the lower 8 bits that were just consumed.
	  Register HighCountReg = MRI.createVirtualRegister(PtrRC);
	  BuildMI(FixShadowMBB, DL, TII->get(ShrRIOpc), HighCountReg)
	      .addReg(SlotCountReg)
	      .addImm(8);

	  // Nothing left after the shift: done.
	  BuildMI(FixShadowMBB, DL, TII->get(X86::JCC_1))
	      .addMBB(SinkMBB)
	      .addImm(X86::COND_E);
	  FixShadowMBB->addSuccessor(SinkMBB);
	  FixShadowMBB->addSuccessor(LoopPrepareMBB);

	  // --- LoopPrepareMBB ----------------------------------------------------
	  // Shift left once to obtain the initial loop counter.
	  Register InitCounterReg = MRI.createVirtualRegister(PtrRC);
	  BuildMI(LoopPrepareMBB, DL, TII->get(ShlR1Opc), InitCounterReg)
	      .addReg(HighCountReg);

	  // Keep the constant 128 in a register for the incssp inside the loop.
	  Register StepReg = MRI.createVirtualRegister(PtrRC);
	  BuildMI(LoopPrepareMBB, DL, TII->get(MovRIOpc), StepReg).addImm(128);
	  LoopPrepareMBB->addSuccessor(LoopMBB);

	  // --- LoopMBB -----------------------------------------------------------
	  // incssp only looks at the lower 8 bits, so several iterations may be
	  // needed before the shadow stack is fully fixed.
	  Register NextCounterReg = MRI.createVirtualRegister(PtrRC);
	  Register CounterReg = MRI.createVirtualRegister(PtrRC);
	  BuildMI(LoopMBB, DL, TII->get(X86::PHI), CounterReg)
	      .addReg(InitCounterReg)
	      .addMBB(LoopPrepareMBB)
	      .addReg(NextCounterReg)
	      .addMBB(LoopMBB);

	  // Advance the SSP by 128 per iteration.
	  BuildMI(LoopMBB, DL, TII->get(IncsspOpc)).addReg(StepReg);

	  // Count down by one per iteration.
	  BuildMI(LoopMBB, DL, TII->get(DecROpc), NextCounterReg).addReg(CounterReg);

	  // Loop while the counter has not reached zero.
	  BuildMI(LoopMBB, DL, TII->get(X86::JCC_1))
	      .addMBB(LoopMBB)
	      .addImm(X86::COND_NE);
	  LoopMBB->addSuccessor(SinkMBB);
	  LoopMBB->addSuccessor(LoopMBB);

	  return SinkMBB;
	}

	/// Expand an EH_SjLj_LongJmp pseudo into the loads and the indirect jump
	/// that perform the actual long jump.
	///
	/// The address operands of \p MI describe the jump buffer. Three
	/// pointer-sized slots are read from it: the frame pointer at offset 0, the
	/// resume address at one pointer size and the stack pointer at two pointer
	/// sizes. When the module carries the "cf-protection-return" flag, the
	/// shadow stack fix-up is emitted first and the remaining code goes into the
	/// block it returns.
	///
	/// \param [in] MI The long-jump pseudo; erased on return.
	/// \param [in] MBB The block that contains \p MI.
	/// \return The block that ends with the emitted indirect jump.
	MachineBasicBlock *
	X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
	                                     MachineBasicBlock *MBB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  // Memory Reference
	  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
	                                           MI.memoperands_end());

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
	         "Invalid Pointer Size!");

	  const TargetRegisterClass *RC =
	      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
	  // Holds the resume address until the final indirect jump.
	  Register Tmp = MRI.createVirtualRegister(RC);
	  // Since FP is only updated here but NOT referenced, it's treated as GPR.
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
	  Register SP = RegInfo->getStackRegister();

	  MachineInstrBuilder MIB;

	  // Offsets of the resume address and the stack pointer inside the buffer.
	  const int64_t LabelOffset = 1 * PVT.getStoreSize();
	  const int64_t SPOffset = 2 * PVT.getStoreSize();

	  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
	  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;

	  MachineBasicBlock *thisMBB = MBB;

	  // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
	  // The fix-up splits the block; MI then lives in the returned sink block.
	  if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
	    thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
	  }

	  // Reload FP
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    const MachineOperand &MO = MI.getOperand(i);
	    if (MO.isReg()) // Don't add the whole operand, we don't want to
	                    // preserve kill flags.
	      MIB.addReg(MO.getReg());
	    else
	      MIB.add(MO);
	  }
	  MIB.setMemRefs(MMOs);

	  // Reload IP
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    const MachineOperand &MO = MI.getOperand(i);
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MO, LabelOffset);
	    else if (MO.isReg()) // Don't add the whole operand, we don't want to
	                         // preserve kill flags.
	      MIB.addReg(MO.getReg());
	    else
	      MIB.add(MO);
	  }
	  MIB.setMemRefs(MMOs);

	  // Reload SP
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MI.getOperand(i), SPOffset);
	    else
	      MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
	                                 // the last instruction of the expansion.
	  }
	  MIB.setMemRefs(MMOs);

	  // Jump
	  BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);

	  MI.eraseFromParent();
	  return thisMBB;
	}

	/// Emit, in front of \p MI, the store that records the address of
	/// \p DispatchBB in the function context.
	///
	/// The address is written to frame index \p FI at offset 56 (64-bit) or 36
	/// (32-bit); the caller describes this as the return slot of the function
	/// context. With the small code model and no position independence the
	/// block address is stored as an immediate. Otherwise it is first
	/// materialized into a virtual register with an LEA and then stored.
	///
	/// \param [in] MI Insertion point; left in place.
	/// \param [in] MBB The block that contains \p MI.
	/// \param [in] DispatchBB The block whose address is stored.
	/// \param [in] FI Frame index of the function context.
	void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
	                                               MachineBasicBlock *MBB,
	                                               MachineBasicBlock *DispatchBB,
	                                               int FI) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  MachineRegisterInfo *MRI = &MF->getRegInfo();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");

	  unsigned Op = 0; // Store opcode, chosen below.
	  unsigned VR = 0; // Register holding the address; unused for immediates.

	  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
	                     !isPositionIndependent();

	  if (UseImmLabel) {
	    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
	  } else {
	    const TargetRegisterClass *TRC =
	        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
	    VR = MRI->createVirtualRegister(TRC);
	    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;

	    // Compute the address of DispatchBB into VR.
	    if (Subtarget.is64Bit())
	      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
	          .addReg(X86::RIP)
	          .addImm(1)
	          .addReg(0)
	          .addMBB(DispatchBB)
	          .addReg(0);
	    else
	      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
	          .addReg(0) /* TII->getGlobalBaseReg(MF) */
	          .addImm(1)
	          .addReg(0)
	          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
	          .addReg(0);
	  }

	  // Store the address (immediate or register) into the function context.
	  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
	  addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
	  if (UseImmLabel)
	    MIB.addMBB(DispatchBB);
	  else
	    MIB.addReg(VR);
	}

	/// Build the SjLj dispatch block for the function that contains \p MI.
	///
	/// All landing pads that have call-site numbers are collected into a jump
	/// table ordered by call-site number. Three blocks are appended to the
	/// function: a dispatch block (the only EH pad afterwards) that loads an
	/// index from the function context and bounds-checks it, a continuation
	/// block that jumps through the table, and a trap block taken when the index
	/// is out of range. Every block that used to branch to a landing pad is
	/// rewired to the dispatch block instead.
	///
	/// \param [in] MI The setup-dispatch pseudo; erased on return.
	/// \param [in] BB The block that contains \p MI.
	/// \return \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
	                                         MachineBasicBlock *BB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = BB->getParent();
	  MachineRegisterInfo *MRI = &MF->getRegInfo();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  int FI = MF->getFrameInfo().getFunctionContextIndex();

	  // Get a mapping of the call site numbers to all of the landing pads they're
	  // associated with.
	  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
	  unsigned MaxCSNum = 0;
	  for (auto &MBB : *MF) {
	    if (!MBB.isEHPad())
	      continue;

	    // The first non-debug instruction of an EH pad is expected to be its
	    // EH_LABEL; its symbol identifies the landing pad.
	    MCSymbol *Sym = nullptr;
	    for (const auto &MI : MBB) {
	      if (MI.isDebugInstr())
	        continue;

	      assert(MI.isEHLabel() && "expected EH_LABEL");
	      Sym = MI.getOperand(0).getMCSymbol();
	      break;
	    }

	    if (!MF->hasCallSiteLandingPad(Sym))
	      continue;

	    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
	      CallSiteNumToLPad[CSI].push_back(&MBB);
	      MaxCSNum = std::max(MaxCSNum, CSI);
	    }
	  }

	  // Get an ordered list of the machine basic blocks for the jump table.
	  std::vector<MachineBasicBlock *> LPadList;
	  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
	  LPadList.reserve(CallSiteNumToLPad.size());

	  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
	    for (auto &LP : CallSiteNumToLPad[CSI]) {
	      LPadList.push_back(LP);
	      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
	    }
	  }

	  assert(!LPadList.empty() &&
	         "No landing pad destinations for the dispatch jump table!");

	  // Create the MBBs for the dispatch code.

	  // Shove the dispatch's address into the return slot in the function context.
	  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
	  DispatchBB->setIsEHPad(true);

	  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
	  BuildMI(TrapBB, DL, TII->get(X86::TRAP));
	  DispatchBB->addSuccessor(TrapBB);

	  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
	  DispatchBB->addSuccessor(DispContBB);

	  // Insert MBBs.
	  MF->push_back(DispatchBB);
	  MF->push_back(DispContBB);
	  MF->push_back(TrapBB);

	  // Insert code into the entry block that creates and registers the function
	  // context.
	  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);

	  // Create the jump table and associated information
	  unsigned JTE = getJumpTableEncoding();
	  MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
	  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

	  const X86RegisterInfo &RI = TII->getRegisterInfo();
	  // Add a register mask with no preserved registers.  This results in all
	  // registers being marked as clobbered.
	  if (RI.hasBasePointer(*MF)) {
	    const bool FPIs64Bit =
	        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
	    X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
	    MFI->setRestoreBasePointer(MF);

	    // Reload the base pointer from its slot relative to the frame register;
	    // the register mask is attached to this load.
	    Register FP = RI.getFrameRegister(*MF);
	    Register BP = RI.getBaseRegister();
	    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
	    addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
	                 MFI->getRestoreBasePointerOffset())
	        .addRegMask(RI.getNoPreservedMask());
	  } else {
	    // No base pointer to restore: a NOOP carries the register mask.
	    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
	        .addRegMask(RI.getNoPreservedMask());
	  }

	  // IReg is used as an index in a memory operand and therefore can't be SP
	  Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
	  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
	                    Subtarget.is64Bit() ? 8 : 4);
	  // An index at or beyond the table size goes to the trap block.
	  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
	      .addReg(IReg)
	      .addImm(LPadList.size());
	  BuildMI(DispatchBB, DL, TII->get(X86::JCC_1))
	      .addMBB(TrapBB)
	      .addImm(X86::COND_AE);

	  if (Subtarget.is64Bit()) {
	    Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
	    Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);

	    // leaq .LJTI0_0(%rip), BReg
	    BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
	        .addReg(X86::RIP)
	        .addImm(1)
	        .addReg(0)
	        .addJumpTableIndex(MJTI)
	        .addReg(0);
	    // movzx IReg64, IReg
	    BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
	        .addImm(0)
	        .addReg(IReg)
	        .addImm(X86::sub_32bit);

	    switch (JTE) {
	    case MachineJumpTableInfo::EK_BlockAddress:
	      // jmpq *(BReg,IReg64,8)
	      BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
	          .addReg(BReg)
	          .addImm(8)
	          .addReg(IReg64)
	          .addImm(0)
	          .addReg(0);
	      break;
	    case MachineJumpTableInfo::EK_LabelDifference32: {
	      Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
	      Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
	      Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);

	      // movl (BReg,IReg64,4), OReg
	      BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
	          .addReg(BReg)
	          .addImm(4)
	          .addReg(IReg64)
	          .addImm(0)
	          .addReg(0);
	      // movsx OReg64, OReg
	      BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
	      // addq BReg, OReg64, TReg
	      BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
	          .addReg(OReg64)
	          .addReg(BReg);
	      // jmpq *TReg
	      BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
	      break;
	    }
	    default:
	      llvm_unreachable("Unexpected jump table encoding");
	    }
	  } else {
	    // jmpl *.LJTI0_0(,IReg,4)
	    BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
	        .addReg(0)
	        .addImm(4)
	        .addReg(IReg)
	        .addJumpTableIndex(MJTI)
	        .addReg(0);
	  }

	  // Add the jump table entries as successors to the MBB.
	  // A landing pad may appear several times in the table; add it only once.
	  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
	  for (auto &LP : LPadList)
	    if (SeenMBBs.insert(LP).second)
	      DispContBB->addSuccessor(LP);

	  // N.B. the order the invoke BBs are processed in doesn't matter here.
	  SmallVector<MachineBasicBlock *, 64> MBBLPads;
	  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
	  for (MachineBasicBlock *MBB : InvokeBBs) {
	    // Remove the landing pad successor from the invoke block and replace it
	    // with the new dispatch block.
	    // Keep a copy of Successors since it's modified inside the loop.
	    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
	                                                   MBB->succ_rend());
	    // FIXME: Avoid quadratic complexity.
	    for (auto MBBS : Successors) {
	      if (MBBS->isEHPad()) {
	        MBB->removeSuccessor(MBBS);
	        MBBLPads.push_back(MBBS);
	      }
	    }

	    MBB->addSuccessor(DispatchBB);

	    // Find the invoke call and mark all of the callee-saved registers as
	    // 'implicit defined' so that they're spilled.  This prevents code from
	    // moving instructions to before the EH block, where they will never be
	    // executed.
	    for (auto &II : reverse(*MBB)) {
	      if (!II.isCall())
	        continue;

	      // Registers the call already mentions are not added again.
	      DenseMap<unsigned, bool> DefRegs;
	      for (auto &MOp : II.operands())
	        if (MOp.isReg())
	          DefRegs[MOp.getReg()] = true;

	      MachineInstrBuilder MIB(*MF, &II);
	      // SavedRegs is a zero-terminated list.
	      for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
	        unsigned Reg = SavedRegs[RegIdx];
	        if (!DefRegs[Reg])
	          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
	      }

	      break;
	    }
	  }

	  // Mark all former landing pads as non-landing pads.  The dispatch is the only
	  // landing pad now.
	  for (auto &LP : MBBLPads)
	    LP->setIsEHPad(false);

	  // The instruction is gone now.
	  MI.eraseFromParent();
	  return BB;
	}

	/// Expand the pseudo-instruction \p MI in place, dispatching on its opcode.
	///
	/// Most opcodes are forwarded to a dedicated helper. The rest are expanded
	/// inline: flag reads/writes, x87 FP-to-integer stores, LCMPXCHG address
	/// fix-ups, preallocated-call pseudos and the AMX tile pseudos. An opcode
	/// that is not handled here is a programming error (llvm_unreachable).
	///
	/// \param [in] MI The instruction to expand. Most cases erase it; the
	///                LCMPXCHG cases keep it and at most adjust it.
	/// \param [in] BB The block that contains \p MI.
	/// \return The block in which instruction emission continues.
	MachineBasicBlock *
	X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
	                                               MachineBasicBlock *BB) const {
	  MachineFunction *MF = BB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  // Map a tile index immediate (0..7) to the matching TMM register.
	  auto TMMImmToTMMReg = [](unsigned Imm) {
	    assert (Imm < 8 && "Illegal tmm index");
	    return X86::TMM0 + Imm;
	  };
	  switch (MI.getOpcode()) {
	  default: llvm_unreachable("Unexpected instr type to insert");
	  case X86::TLS_addr32:
	  case X86::TLS_addr64:
	  case X86::TLS_base_addr32:
	  case X86::TLS_base_addr64:
	    return EmitLoweredTLSAddr(MI, BB);
	  case X86::INDIRECT_THUNK_CALL32:
	  case X86::INDIRECT_THUNK_CALL64:
	  case X86::INDIRECT_THUNK_TCRETURN32:
	  case X86::INDIRECT_THUNK_TCRETURN64:
	    return EmitLoweredIndirectThunk(MI, BB);
	  case X86::CATCHRET:
	    return EmitLoweredCatchRet(MI, BB);
	  case X86::SEG_ALLOCA_32:
	  case X86::SEG_ALLOCA_64:
	    return EmitLoweredSegAlloca(MI, BB);
	  case X86::PROBED_ALLOCA_32:
	  case X86::PROBED_ALLOCA_64:
	    return EmitLoweredProbedAlloca(MI, BB);
	  case X86::TLSCall_32:
	  case X86::TLSCall_64:
	    return EmitLoweredTLSCall(MI, BB);
	  case X86::CMOV_FR32:
	  case X86::CMOV_FR32X:
	  case X86::CMOV_FR64:
	  case X86::CMOV_FR64X:
	  case X86::CMOV_GR8:
	  case X86::CMOV_GR16:
	  case X86::CMOV_GR32:
	  case X86::CMOV_RFP32:
	  case X86::CMOV_RFP64:
	  case X86::CMOV_RFP80:
	  case X86::CMOV_VR64:
	  case X86::CMOV_VR128:
	  case X86::CMOV_VR128X:
	  case X86::CMOV_VR256:
	  case X86::CMOV_VR256X:
	  case X86::CMOV_VR512:
	  case X86::CMOV_VK1:
	  case X86::CMOV_VK2:
	  case X86::CMOV_VK4:
	  case X86::CMOV_VK8:
	  case X86::CMOV_VK16:
	  case X86::CMOV_VK32:
	  case X86::CMOV_VK64:
	    return EmitLoweredSelect(MI, BB);

	  case X86::RDFLAGS32:
	  case X86::RDFLAGS64: {
	    // Read the flags by pushing them and popping into the result register.
	    unsigned PushF =
	        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
	    unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
	    MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
	    // Permit reads of the EFLAGS and DF registers without them being defined.
	    // This intrinsic exists to read external processor state in flags, such as
	    // the trap flag, interrupt flag, and direction flag, none of which are
	    // modeled by the backend.
	    assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
	           "Unexpected register in operand!");
	    Push->getOperand(2).setIsUndef();
	    assert(Push->getOperand(3).getReg() == X86::DF &&
	           "Unexpected register in operand!");
	    Push->getOperand(3).setIsUndef();
	    BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());

	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }

	  case X86::WRFLAGS32:
	  case X86::WRFLAGS64: {
	    // Write the flags by pushing the source register and popping the flags.
	    unsigned Push =
	        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
	    unsigned PopF =
	        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
	    BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
	    BuildMI(*BB, MI, DL, TII->get(PopF));

	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }

	  case X86::FP32_TO_INT16_IN_MEM:
	  case X86::FP32_TO_INT32_IN_MEM:
	  case X86::FP32_TO_INT64_IN_MEM:
	  case X86::FP64_TO_INT16_IN_MEM:
	  case X86::FP64_TO_INT32_IN_MEM:
	  case X86::FP64_TO_INT64_IN_MEM:
	  case X86::FP80_TO_INT16_IN_MEM:
	  case X86::FP80_TO_INT32_IN_MEM:
	  case X86::FP80_TO_INT64_IN_MEM: {
	    // Change the floating point control register to use "round towards zero"
	    // mode when truncating to an integer value.
	    int OrigCWFrameIdx =
	        MF->getFrameInfo().CreateStackObject(2, Align(2), false);
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);

	    // Load the old value of the control word...
	    Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
	                      OrigCWFrameIdx);

	    // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
	    Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
	    BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
	        .addReg(OldCW, RegState::Kill).addImm(0xC00);

	    // Extract to 16 bits.
	    Register NewCW16 =
	        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
	    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
	        .addReg(NewCW, RegState::Kill, X86::sub_16bit);

	    // Prepare memory for FLDCW.
	    int NewCWFrameIdx =
	        MF->getFrameInfo().CreateStackObject(2, Align(2), false);
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
	                      NewCWFrameIdx)
	        .addReg(NewCW16, RegState::Kill);

	    // Reload the modified control word now...
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FLDCW16m)), NewCWFrameIdx);

	    // Get the X86 opcode to use.
	    unsigned Opc;
	    switch (MI.getOpcode()) {
	    default: llvm_unreachable("illegal opcode!");
	    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
	    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
	    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
	    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
	    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
	    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
	    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
	    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
	    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
	    }

	    // Store to the pseudo's address; the value register is the operand that
	    // follows the address operands.
	    X86AddressMode AM = getAddressFromInstr(&MI, 0);
	    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
	        .addReg(MI.getOperand(X86::AddrNumOperands).getReg());

	    // Reload the original control word now.
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FLDCW16m)), OrigCWFrameIdx);

	    MI.eraseFromParent(); // The pseudo instruction is gone now.
	    return BB;
	  }

	  // xbegin
	  case X86::XBEGIN:
	    return emitXBegin(MI, BB, Subtarget.getInstrInfo());

	  case X86::VASTART_SAVE_XMM_REGS:
	    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);

	  case X86::VAARG_64:
	    return EmitVAARG64WithCustomInserter(MI, BB);

	  case X86::EH_SjLj_SetJmp32:
	  case X86::EH_SjLj_SetJmp64:
	    return emitEHSjLjSetJmp(MI, BB);

	  case X86::EH_SjLj_LongJmp32:
	  case X86::EH_SjLj_LongJmp64:
	    return emitEHSjLjLongJmp(MI, BB);

	  case X86::Int_eh_sjlj_setup_dispatch:
	    return EmitSjLjDispatchBlock(MI, BB);

	  case TargetOpcode::STATEPOINT:
	    // As an implementation detail, STATEPOINT shares the STACKMAP format at
	    // this point in the process.  We diverge later.
	    return emitPatchPoint(MI, BB);

	  case TargetOpcode::STACKMAP:
	  case TargetOpcode::PATCHPOINT:
	    return emitPatchPoint(MI, BB);

	  case TargetOpcode::PATCHABLE_EVENT_CALL:
	    return emitXRayCustomEvent(MI, BB);

	  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
	    return emitXRayTypedEvent(MI, BB);

	  case X86::LCMPXCHG8B: {
	    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	    // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
	    // requires a memory operand. If it happens that current architecture is
	    // i686 and for current function we need a base pointer
	    // - which is ESI for i686 - register allocator would not be able to
	    // allocate registers for an address in form of X(%reg, %reg, Y)
	    // - there never would be enough unreserved registers during regalloc
	    // (without the need for base ptr the only option would be X(%edi, %esi, Y).
	    // We are giving a hand to register allocator by precomputing the address in
	    // a new vreg using LEA.

	    // If it is not i686 or there is no base pointer - nothing to do here.
	    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
	      return BB;

	    // Even though this code does not necessarily needs the base pointer to
	    // be ESI, we check for that. The reason: if this assert fails, there are
	    // some changes happened in the compiler base pointer handling, which most
	    // probably have to be addressed somehow here.
	    assert(TRI->getBaseRegister() == X86::ESI &&
	           "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
	           "base pointer in mind");

	    MachineRegisterInfo &MRI = MF->getRegInfo();
	    MVT SPTy = getPointerTy(MF->getDataLayout());
	    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
	    Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);

	    X86AddressMode AM = getAddressFromInstr(&MI, 0);
	    // Regalloc does not need any help when the memory operand of CMPXCHG8B
	    // does not use index register.
	    if (AM.IndexReg == X86::NoRegister)
	      return BB;

	    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
	    // four operand definitions that are E[ABCD] registers. We skip them and
	    // then insert the LEA.
	    MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
	    while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
	                                   RMBBI->definesRegister(X86::EBX) ||
	                                   RMBBI->definesRegister(X86::ECX) ||
	                                   RMBBI->definesRegister(X86::EDX))) {
	      ++RMBBI;
	    }
	    MachineBasicBlock::iterator MBBI(RMBBI);
	    // The LEA goes in front of the instruction the scan stopped at.
	    addFullAddress(
	        BuildMI(*BB, MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);

	    // MI now addresses memory through the precomputed register only.
	    setDirectAddressInInstr(&MI, 0, computedAddrVReg);

	    return BB;
	  }
	  case X86::LCMPXCHG16B:
	    return BB;
	  case X86::LCMPXCHG8B_SAVE_EBX:
	  case X86::LCMPXCHG16B_SAVE_RBX: {
	    // Make sure EBX/RBX is recorded as live-in to this block.
	    unsigned BasePtr =
	        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
	    if (!BB->isLiveIn(BasePtr))
	      BB->addLiveIn(BasePtr);
	    return BB;
	  }
	  case TargetOpcode::PREALLOCATED_SETUP: {
	    assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
	    auto MFI = MF->getInfo<X86MachineFunctionInfo>();
	    MFI->setHasPreallocatedCall(true);
	    int64_t PreallocatedId = MI.getOperand(0).getImm();
	    size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
	    assert(StackAdjustment != 0 && "0 stack adjustment");
	    LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
	                      << StackAdjustment << "\n");
	    // Reserve the preallocated area by lowering ESP.
	    BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
	        .addReg(X86::ESP)
	        .addImm(StackAdjustment);
	    MI.eraseFromParent();
	    return BB;
	  }
	  case TargetOpcode::PREALLOCATED_ARG: {
	    assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
	    int64_t PreallocatedId = MI.getOperand(1).getImm();
	    int64_t ArgIdx = MI.getOperand(2).getImm();
	    auto MFI = MF->getInfo<X86MachineFunctionInfo>();
	    size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
	    LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
	                      << ", arg offset " << ArgOffset << "\n");
	    // stack pointer + offset
	    addRegOffset(
	        BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
	        X86::ESP, false, ArgOffset);
	    MI.eraseFromParent();
	    return BB;
	  }
	  case X86::PTDPBSSD:
	  case X86::PTDPBSUD:
	  case X86::PTDPBUSD:
	  case X86::PTDPBUUD:
	  case X86::PTDPBF16PS: {
	    unsigned Opc;
	    switch (MI.getOpcode()) {
	    // Only the opcodes of the enclosing case labels can reach this switch.
	    default: llvm_unreachable("illegal opcode!");
	    case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
	    case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
	    case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
	    case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
	    case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
	    }

	    // Operand 0 names the destination tile and is also added as a source;
	    // operands 1 and 2 name the other two source tiles.
	    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
	    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
	    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
	    MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
	    MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);

	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }
	  case X86::PTILEZERO: {
	    unsigned Imm = MI.getOperand(0).getImm();
	    BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }
	  case X86::PTILELOADD:
	  case X86::PTILELOADDT1:
	  case X86::PTILESTORED: {
	    unsigned Opc;
	    switch (MI.getOpcode()) {
	    // Only the opcodes of the enclosing case labels can reach this switch.
	    default: llvm_unreachable("illegal opcode!");
	    case X86::PTILELOADD:   Opc = X86::TILELOADD;   break;
	    case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
	    case X86::PTILESTORED:  Opc = X86::TILESTORED;  break;
	    }

	    // Loads name their tile first (as a def); the store names it last.
	    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
	    unsigned CurOp = 0;
	    if (Opc != X86::TILESTORED)
	      MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
	                 RegState::Define);

	    MIB.add(MI.getOperand(CurOp++)); // base
	    MIB.add(MI.getOperand(CurOp++)); // scale
	    MIB.add(MI.getOperand(CurOp++)); // index -- stride
	    MIB.add(MI.getOperand(CurOp++)); // displacement
	    MIB.add(MI.getOperand(CurOp++)); // segment

	    if (Opc == X86::TILESTORED)
	      MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
	                 RegState::Undef);

	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }
	  }
	}

	//===----------------------------------------------------------------------===//
	// X86 Optimization Hooks
	//===----------------------------------------------------------------------===//

	/// Target hook that lets X86 choose a preferred constant operand for \p Op,
	/// given the bits (\p DemandedBits) and vector lanes (\p DemandedElts) that
	/// are actually used.
	///
	///  - Vector OR/XOR on a legal type: if some demanded, non-undef constant
	///    element is entirely sign bits within the active demanded bits (but not
	///    across its full width), the constant operand is wrapped in a
	///    SIGN_EXTEND_INREG so that it can act as a boolean constant vector.
	///  - Scalar AND with a constant: the mask is replaced by a low-bits
	///    ("zero extend") mask of power-of-two width (at least 8 bits, at most
	///    the scalar width) when the extra bits are either already in the mask
	///    or not demanded.
	///
	/// \returns the result of TLO.CombineTo() when a replacement node is built,
	/// true when the existing AND mask already is the zero-extend mask (so the
	/// caller must not shrink it), and false otherwise.
	bool X86TargetLowering::targetShrinkDemandedConstant(
	    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
	    TargetLoweringOpt &TLO) const {
	  const EVT OpVT = Op.getValueType();
	  const unsigned Opc = Op.getOpcode();
	  const unsigned ScalarBits = OpVT.getScalarSizeInBits();
	  SelectionDAG &DAG = TLO.DAG;

	  if (OpVT.isVector()) {
	    // For vectors - if we have a constant, then try to sign extend.
	    // TODO: Handle AND/ANDN cases.
	    const unsigned NumActive = DemandedBits.getActiveBits();
	    if (ScalarBits <= NumActive || ScalarBits <= 1 || !isTypeLegal(OpVT))
	      return false;
	    if (Opc != ISD::OR && Opc != ISD::XOR)
	      return false;

	    // The RHS has to be a build vector made only of constants (and undefs).
	    const SDValue RHS = Op.getOperand(1);
	    if (!ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
	      return false;

	    // Look for a demanded element that is only all signbits in the active
	    // bits: such a constant is worth extending to the entire element.
	    bool WantsSignExtend = false;
	    for (unsigned Idx = 0, End = RHS.getNumOperands();
	         Idx != End && !WantsSignExtend; ++Idx) {
	      if (!DemandedElts[Idx] || RHS.getOperand(Idx).isUndef())
	        continue;
	      const APInt &Elt = RHS.getConstantOperandAPInt(Idx);
	      WantsSignExtend = Elt.getNumSignBits() < Elt.getBitWidth() &&
	                        Elt.trunc(NumActive).getNumSignBits() == NumActive;
	    }
	    if (!WantsSignExtend)
	      return false;

	    // Sign extend the constant from the active width, in place.
	    auto &Ctx = *DAG.getContext();
	    const EVT NarrowEltVT = EVT::getIntegerVT(Ctx, NumActive);
	    const EVT NarrowVT =
	        EVT::getVectorVT(Ctx, NarrowEltVT, OpVT.getVectorNumElements());
	    const SDLoc DL(Op);
	    SDValue ExtendedC = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, OpVT, RHS,
	                                    DAG.getValueType(NarrowVT));
	    SDValue Replacement =
	        DAG.getNode(Opc, DL, OpVT, Op.getOperand(0), ExtendedC);
	    return TLO.CombineTo(Op, Replacement);
	  }

	  // Scalars: only optimize Ands, to prevent shrinking a constant that could
	  // be matched by movzx.
	  if (Opc != ISD::AND)
	    return false;

	  // The RHS must really be a constant.
	  const auto *MaskC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!MaskC)
	    return false;
	  const APInt &OrigMask = MaskC->getAPIntValue();

	  // Width of the mask once all non-demanded bits are cleared. An all-zero
	  // mask leaves nothing to do.
	  const unsigned DemandedWidth = (OrigMask & DemandedBits).getActiveBits();
	  if (DemandedWidth == 0)
	    return false;

	  // Round up to a power of 2 of at least a byte, then clamp to the scalar
	  // size so that illegal types are handled.
	  unsigned ZExtWidth = PowerOf2Ceil(std::max(DemandedWidth, 8U));
	  ZExtWidth = std::min(ZExtWidth, ScalarBits);

	  // Candidate zero-extend mask for this constant.
	  const APInt ZExtMask = APInt::getLowBitsSet(ScalarBits, ZExtWidth);

	  // Already the preferred mask: return true to keep it and to stop the
	  // caller from optimizing it.
	  if (ZExtMask == OrigMask)
	    return true;

	  // Every bit of the new mask has to be either an original mask bit or a
	  // bit nobody demands.
	  if (!ZExtMask.isSubsetOf(OrigMask | ~DemandedBits))
	    return false;

	  // Swap in the zero-extend mask.
	  const SDLoc DL(Op);
	  SDValue NewMask = DAG.getConstant(ZExtMask, DL, OpVT);
	  SDValue Replacement =
	      DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), NewMask);
	  return TLO.CombineTo(Op, Replacement);
	}

	void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth) const {
	unsigned BitWidth = Known.getBitWidth();
	unsigned NumElts = DemandedElts.getBitWidth();
	unsigned Opc = Op.getOpcode();
	EVT VT = Op.getValueType();
	assert((Opc >= ISD::BUILTIN_OP_END \|\|
	Opc == ISD::INTRINSIC_WO_CHAIN \|\|
	Opc == ISD::INTRINSIC_W_CHAIN \|\|
	Opc == ISD::INTRINSIC_VOID) &&
	"Should use MaskedValueIsZero if you don't know whether Op"
	" is a target node!");

	Known.resetAll();
	switch (Opc) {
	default: break;
	case X86ISD::SETCC:
	Known.Zero.setBitsFrom(1);
	break;
	case X86ISD::MOVMSK: {
	unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
	Known.Zero.setBitsFrom(NumLoBits);
	break;
	}
	case X86ISD::PEXTRB:
	case X86ISD::PEXTRW: {
	SDValue Src = Op.getOperand(0);
	EVT SrcVT = Src.getValueType();
	APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
	Op.getConstantOperandVal(1));
	Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
	Known = Known.anyextOrTrunc(BitWidth);
	Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
	break;
	}
	case X86ISD::VSRAI:
	case X86ISD::VSHLI:
	case X86ISD::VSRLI: {
	unsigned ShAmt = Op.getConstantOperandVal(1);
	if (ShAmt >= VT.getScalarSizeInBits()) {
	Known.setAllZero();
	break;
	}

	Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	if (Opc == X86ISD::VSHLI) {
	Known.Zero <<= ShAmt;
	Known.One <<= ShAmt;
	// Low bits are known zero.
	Known.Zero.setLowBits(ShAmt);
	} else if (Opc == X86ISD::VSRLI) {
	Known.Zero.lshrInPlace(ShAmt);
	Known.One.lshrInPlace(ShAmt);
	// High bits are known zero.
	Known.Zero.setHighBits(ShAmt);
	} else {
	Known.Zero.ashrInPlace(ShAmt);
	Known.One.ashrInPlace(ShAmt);
	}
	break;
	}
	case X86ISD::PACKUS: {
	// PACKUS is just a truncation if the upper half is zero.
	APInt DemandedLHS, DemandedRHS;
	getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);

	Known.One = APInt::getAllOnesValue(BitWidth * 2);
	Known.Zero = APInt::getAllOnesValue(BitWidth * 2);

	KnownBits Known2;
	if (!!DemandedLHS) {
	Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	if (!!DemandedRHS) {
	Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}

	if (Known.countMinLeadingZeros() < BitWidth)
	Known.resetAll();
	Known = Known.trunc(BitWidth);
	break;
	}
	case X86ISD::ANDNP: {
	KnownBits Known2;
	Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
	Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);

	// ANDNP = (~X & Y);
	Known.One &= Known2.Zero;
	Known.Zero \|= Known2.One;
	break;
	}
	case X86ISD::FOR: {
	KnownBits Known2;
	Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
	Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);

	Known \|= Known2;
	break;
	}
	case X86ISD::PSADBW: {
	assert(VT.getScalarType() == MVT::i64 &&
	Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
	"Unexpected PSADBW types");

	// PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
	Known.Zero.setBitsFrom(16);
	break;
	}
	case X86ISD::CMOV: {
	Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

	// Only known if known in both the LHS and RHS.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	break;
	}
	case X86ISD::BEXTR: {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
	unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
	unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);

	// If the length is 0, the result is 0.
	if (Length == 0) {
	Known.setAllZero();
	break;
	}

	if ((Shift + Length) <= BitWidth) {
	Known = DAG.computeKnownBits(Op0, Depth + 1);
	Known = Known.extractBits(Length, Shift);
	Known = Known.zextOrTrunc(BitWidth);
	}
	}
	break;
	}
	case X86ISD::CVTSI2P:
	case X86ISD::CVTUI2P:
	case X86ISD::CVTP2SI:
	case X86ISD::CVTP2UI:
	case X86ISD::MCVTP2SI:
	case X86ISD::MCVTP2UI:
	case X86ISD::CVTTP2SI:
	case X86ISD::CVTTP2UI:
	case X86ISD::MCVTTP2SI:
	case X86ISD::MCVTTP2UI:
	case X86ISD::MCVTSI2P:
	case X86ISD::MCVTUI2P:
	case X86ISD::VFPROUND:
	case X86ISD::VMFPROUND:
	case X86ISD::CVTPS2PH:
	case X86ISD::MCVTPS2PH: {
	// Conversions - upper elements are known zero.
	EVT SrcVT = Op.getOperand(0).getValueType();
	if (SrcVT.isVector()) {
	unsigned NumSrcElts = SrcVT.getVectorNumElements();
	if (NumElts > NumSrcElts &&
	DemandedElts.countTrailingZeros() >= NumSrcElts)
	Known.setAllZero();
	}
	break;
	}
	case X86ISD::STRICT_CVTTP2SI:
	case X86ISD::STRICT_CVTTP2UI:
	case X86ISD::STRICT_CVTSI2P:
	case X86ISD::STRICT_CVTUI2P:
	case X86ISD::STRICT_VFPROUND:
	case X86ISD::STRICT_CVTPS2PH: {
	// Strict Conversions - upper elements are known zero.
	EVT SrcVT = Op.getOperand(1).getValueType();
	if (SrcVT.isVector()) {
	unsigned NumSrcElts = SrcVT.getVectorNumElements();
	if (NumElts > NumSrcElts &&
	DemandedElts.countTrailingZeros() >= NumSrcElts)
	Known.setAllZero();
	}
	break;
	}
	case X86ISD::MOVQ2DQ: {
	// Move from MMX to XMM. Upper half of XMM should be 0.
	if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
	Known.setAllZero();
	break;
	}
	}

	// Handle target shuffles.
	// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
	if (isTargetShuffle(Opc)) {
	bool IsUnary;
	SmallVector<int, 64> Mask;
	SmallVector<SDValue, 2> Ops;
	if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
	IsUnary)) {
	unsigned NumOps = Ops.size();
	unsigned NumElts = VT.getVectorNumElements();
	if (Mask.size() == NumElts) {
	SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
	Known.Zero.setAllBits(); Known.One.setAllBits();
	for (unsigned i = 0; i != NumElts; ++i) {
	if (!DemandedElts[i])
	continue;
	int M = Mask[i];
	if (M == SM_SentinelUndef) {
	// For UNDEF elements, we don't know anything about the common state
	// of the shuffle result.
	Known.resetAll();
	break;
	} else if (M == SM_SentinelZero) {
	Known.One.clearAllBits();
	continue;
	}
	assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
	"Shuffle index out of range");

	unsigned OpIdx = (unsigned)M / NumElts;
	unsigned EltIdx = (unsigned)M % NumElts;
	if (Ops[OpIdx].getValueType() != VT) {
	// TODO - handle target shuffle ops with different value types.
	Known.resetAll();
	break;
	}
	DemandedOps[OpIdx].setBit(EltIdx);
	}
	// Known bits are the values that are shared by every demanded element.
	for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
	if (!DemandedOps[i])
	continue;
	KnownBits Known2 =
	DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	}
	}
	}
	}

	/// Compute a lower bound on the number of sign bits (copies of the top bit,
	/// the top bit included) of the X86 target node \p Op, over the elements
	/// selected by \p DemandedElts.
	///
	/// \param Depth current recursion depth; operands are queried at Depth + 1.
	/// \returns a value between 1 (nothing known) and the scalar bit width.
	unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
	    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
	    unsigned Depth) const {
	  const EVT ResVT = Op.getValueType();
	  const unsigned EltBits = ResVT.getScalarSizeInBits();
	  const unsigned Opc = Op.getOpcode();

	  // Sign bits that survive dropping the top (SrcBits - EltBits) bits of a
	  // value that has SrcSignBits sign bits.
	  auto SignBitsAfterTrunc = [EltBits](unsigned SrcSignBits,
	                                      unsigned SrcBits) -> unsigned {
	    const unsigned Dropped = SrcBits - EltBits;
	    if (SrcSignBits > Dropped)
	      return SrcSignBits - Dropped;
	    return 1;
	  };

	  switch (Opc) {
	  default:
	    break;

	  case X86ISD::SETCC_CARRY:
	    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
	    return EltBits;

	  case X86ISD::VTRUNC: {
	    SDValue In = Op.getOperand(0);
	    MVT InVT = In.getSimpleValueType();
	    const unsigned InBits = InVT.getScalarSizeInBits();
	    assert(EltBits < InBits && "Illegal truncation input type");
	    APInt DemandedIn = DemandedElts.zextOrTrunc(InVT.getVectorNumElements());
	    return SignBitsAfterTrunc(
	        DAG.ComputeNumSignBits(In, DemandedIn, Depth + 1), InBits);
	  }

	  case X86ISD::PACKSS: {
	    // PACKSS is just a truncation if the sign bits extend to the packed size.
	    APInt DemandedLo, DemandedHi;
	    getPackDemandedElts(ResVT, DemandedElts, DemandedLo, DemandedHi);

	    // An operand that contributes no demanded element does not constrain
	    // the result.
	    const unsigned InBits = Op.getOperand(0).getScalarValueSizeInBits();
	    unsigned LoSignBits = InBits;
	    unsigned HiSignBits = InBits;
	    if (!!DemandedLo)
	      LoSignBits =
	          DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLo, Depth + 1);
	    if (!!DemandedHi)
	      HiSignBits =
	          DAG.ComputeNumSignBits(Op.getOperand(1), DemandedHi, Depth + 1);
	    return SignBitsAfterTrunc(std::min(LoSignBits, HiSignBits), InBits);
	  }

	  case X86ISD::VSHLI: {
	    const APInt &ShAmt = Op.getConstantOperandAPInt(1);
	    if (ShAmt.uge(EltBits))
	      return EltBits; // Shifted all bits out --> zero.
	    const unsigned InSignBits =
	        DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
	    if (ShAmt.uge(InSignBits))
	      return 1; // Shifted all sign bits out --> unknown.
	    return InSignBits - ShAmt.getZExtValue();
	  }

	  case X86ISD::VSRAI: {
	    APInt ShAmt = Op.getConstantOperandAPInt(1);
	    if (ShAmt.uge(EltBits - 1))
	      return EltBits; // Sign splat.
	    // Every shifted-in bit is one more copy of the sign bit.
	    ShAmt += DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
	    if (ShAmt.uge(EltBits))
	      return EltBits;
	    return ShAmt.getZExtValue();
	  }

	  case X86ISD::PCMPGT:
	  case X86ISD::PCMPEQ:
	  case X86ISD::CMPP:
	  case X86ISD::VPCOM:
	  case X86ISD::VPCOMU:
	    // Vector compares return zero/all-bits result values.
	    return EltBits;

	  case X86ISD::ANDNP: {
	    const unsigned LHSSignBits =
	        DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
	    if (LHSSignBits == 1)
	      return 1; // Early out.
	    const unsigned RHSSignBits =
	        DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
	    return std::min(LHSSignBits, RHSSignBits);
	  }

	  case X86ISD::CMOV: {
	    const unsigned LHSSignBits =
	        DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
	    if (LHSSignBits == 1)
	      return 1; // Early out.
	    const unsigned RHSSignBits =
	        DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
	    return std::min(LHSSignBits, RHSSignBits);
	  }
	  }

	  // Everything below handles target shuffles; anything else falls back to
	  // the conservative answer.
	  // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
	  if (!isTargetShuffle(Opc))
	    return 1;

	  bool IsUnary;
	  SmallVector<int, 64> ShufMask;
	  SmallVector<SDValue, 2> ShufOps;
	  if (!getTargetShuffleMask(Op.getNode(), ResVT.getSimpleVT(), true, ShufOps,
	                            ShufMask, IsUnary))
	    return 1;

	  const unsigned NumInputs = ShufOps.size();
	  const unsigned NumLanes = ResVT.getVectorNumElements();
	  if (ShufMask.size() != NumLanes)
	    return 1;

	  // Collect, per shuffle input, the elements that reach a demanded lane.
	  SmallVector<APInt, 2> DemandedPerInput(NumInputs, APInt(NumLanes, 0));
	  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
	    if (!DemandedElts[Lane])
	      continue;
	    const int M = ShufMask[Lane];
	    // For UNDEF elements, we don't know anything about the common state of
	    // the shuffle result.
	    if (M == SM_SentinelUndef)
	      return 1;
	    // Zero = all sign bits.
	    if (M == SM_SentinelZero)
	      continue;
	    assert(0 <= M && (unsigned)M < (NumInputs * NumLanes) &&
	           "Shuffle index out of range");

	    const unsigned InputIdx = (unsigned)M / NumLanes;
	    const unsigned InputElt = (unsigned)M % NumLanes;
	    // TODO - handle target shuffle ops with different value types.
	    if (ShufOps[InputIdx].getValueType() != ResVT)
	      return 1;
	    DemandedPerInput[InputIdx].setBit(InputElt);
	  }

	  // The result has as many sign bits as the weakest demanded input.
	  unsigned MinSignBits = EltBits;
	  for (unsigned Input = 0; Input != NumInputs && MinSignBits > 1; ++Input) {
	    if (!DemandedPerInput[Input])
	      continue;
	    const unsigned InputSignBits = DAG.ComputeNumSignBits(
	        ShufOps[Input], DemandedPerInput[Input], Depth + 1);
	    MinSignBits = std::min(MinSignBits, InputSignBits);
	  }
	  return MinSignBits;
	}

	/// Strip an X86ISD::Wrapper / X86ISD::WrapperRIP node from \p N and return
	/// its first operand; any other node is returned unchanged.
	SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
	  switch (N->getOpcode()) {
	  case X86ISD::Wrapper:
	  case X86ISD::WrapperRIP:
	    return N->getOperand(0);
	  default:
	    return N;
	  }
	}

	// Build an X86ISD::VZEXT_LOAD of result type VT and memory type MemVT that
	// reads from the same chain, base pointer, pointer info, alignment and
	// memory-operand flags as the normal load LN. Volatile or atomic loads are
	// rejected by returning SDValue().
	static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
	                                  SelectionDAG &DAG) {
	  // Only simple (non-volatile, non-atomic) loads may be narrowed.
	  if (LN->isSimple()) {
	    SDVTList ResultTys = DAG.getVTList(VT, MVT::Other);
	    SDValue ChainAndPtr[] = {LN->getChain(), LN->getBasePtr()};
	    return DAG.getMemIntrinsicNode(
	        X86ISD::VZEXT_LOAD, SDLoc(LN), ResultTys, ChainAndPtr, MemVT,
	        LN->getPointerInfo(), LN->getOriginalAlign(),
	        LN->getMemOperand()->getFlags());
	  }

	  return SDValue();
	}

	// Try to express a combined single-input shuffle mask as one of the
	// supported unary shuffle instructions: VZEXT_MOVL, an any/zero extension
	// (optionally *_EXTEND_VECTOR_INREG), or MOVDDUP / MOVSLDUP / MOVSHDUP.
	//
	// On success returns true with Shuffle set to the opcode and SrcVT / DstVT
	// set to the types to use for the operand and the result. For the extension
	// case V1 may additionally be replaced by a sub-vector extracted from it.
	// Returns false if nothing matched.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                              bool AllowFloatDomain, bool AllowIntDomain,
	                              SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
	                              const X86Subtarget &Subtarget, unsigned &Shuffle,
	                              MVT &SrcVT, MVT &DstVT) {
	  const unsigned NumElts = Mask.size();
	  const unsigned EltBits = MaskVT.getScalarSizeInBits();

	  // Common result for both VZEXT_MOVL matches; without SSE2 only the v4f32
	  // form is used.
	  auto UseVZextMovl = [&]() {
	    Shuffle = X86ISD::VZEXT_MOVL;
	    SrcVT = DstVT = Subtarget.hasSSE2() ? MaskVT : MVT::v4f32;
	    return true;
	  };

	  // Report Opc on VT if the mask is equivalent to the Expected pattern.
	  auto TryPattern = [&](ArrayRef<int> Expected, unsigned Opc, MVT VT) {
	    if (!isTargetShuffleEquivalent(Mask, Expected))
	      return false;
	    Shuffle = Opc;
	    SrcVT = DstVT = VT;
	    return true;
	  };

	  // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
	  if (EltBits == 32 && isUndefOrEqual(Mask[0], 0) && isUndefOrZero(Mask[1]) &&
	      isUndefInRange(Mask, 2, NumElts - 2))
	    return UseVZextMovl();

	  // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
	  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
	  const bool HasIntExtend =
	      (MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
	      (MaskVT.is256BitVector() && Subtarget.hasInt256());
	  if (AllowIntDomain && HasIntExtend) {
	    for (unsigned Scale = 2, MaxScale = 64 / EltBits; Scale <= MaxScale;
	         Scale *= 2) {
	      const unsigned NumDstElts = NumElts / Scale;

	      // Each destination element must start with source element Dst and be
	      // followed by Scale - 1 undef (any-extend) or undef/zero (zero-extend)
	      // mask entries.
	      bool IsAnyExt = true;
	      bool IsZeroExt = true;
	      for (unsigned Dst = 0; Dst != NumDstElts && (IsAnyExt || IsZeroExt);
	           ++Dst) {
	        const unsigned First = Dst * Scale;
	        if (!isUndefOrEqual(Mask[First], static_cast<int>(Dst))) {
	          IsAnyExt = IsZeroExt = false;
	          break;
	        }
	        IsAnyExt &= isUndefInRange(Mask, First + 1, Scale - 1);
	        IsZeroExt &= isUndefOrZeroInRange(Mask, First + 1, Scale - 1);
	      }
	      if (!IsAnyExt && !IsZeroExt)
	        continue;
	      assert(IsZeroExt && "Failed to match zext but matched aext?");

	      // The source is at least 128 bits wide and always has integer
	      // elements.
	      const unsigned SrcSize = std::max(128u, NumDstElts * EltBits);
	      MVT SrcEltVT = MVT::getIntegerVT(EltBits);
	      if (MaskVT.isInteger())
	        SrcEltVT = MaskVT.getScalarType();
	      SrcVT = MVT::getVectorVT(SrcEltVT, SrcSize / EltBits);

	      // Narrow the input if the source type is smaller than the mask type.
	      if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
	        V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);

	      // Use the *_EXTEND_VECTOR_INREG form when only part of the source
	      // elements is extended.
	      Shuffle = IsAnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
	      if (SrcVT.getVectorNumElements() != NumDstElts)
	        Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);

	      DstVT = MVT::getVectorVT(MVT::getIntegerVT(Scale * EltBits), NumDstElts);
	      return true;
	    }
	  }

	  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
	  const bool EltOkForMovl =
	      EltBits == 32 || (EltBits == 64 && Subtarget.hasSSE2());
	  if (EltOkForMovl && isUndefOrEqual(Mask[0], 0) &&
	      isUndefOrZeroInRange(Mask, 1, NumElts - 1))
	    return UseVZextMovl();

	  if (!AllowFloatDomain)
	    return false;

	  // Check if we have SSE3 which will let us use MOVDDUP etc. The
	  // instructions are no slower than UNPCKLPD but has the option to
	  // fold the input operand into even an unaligned memory load.
	  if (MaskVT.is128BitVector() && Subtarget.hasSSE3()) {
	    if (TryPattern({0, 0}, X86ISD::MOVDDUP, MVT::v2f64) ||
	        TryPattern({0, 0, 2, 2}, X86ISD::MOVSLDUP, MVT::v4f32) ||
	        TryPattern({1, 1, 3, 3}, X86ISD::MOVSHDUP, MVT::v4f32))
	      return true;
	  }

	  if (MaskVT.is256BitVector()) {
	    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
	    if (TryPattern({0, 0, 2, 2}, X86ISD::MOVDDUP, MVT::v4f64) ||
	        TryPattern({0, 0, 2, 2, 4, 4, 6, 6}, X86ISD::MOVSLDUP, MVT::v8f32) ||
	        TryPattern({1, 1, 3, 3, 5, 5, 7, 7}, X86ISD::MOVSHDUP, MVT::v8f32))
	      return true;
	  }

	  if (MaskVT.is512BitVector()) {
	    assert(Subtarget.hasAVX512() &&
	           "AVX512 required for 512-bit vector shuffles");
	    if (TryPattern({0, 0, 2, 2, 4, 4, 6, 6}, X86ISD::MOVDDUP, MVT::v8f64) ||
	        TryPattern({0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14},
	                   X86ISD::MOVSLDUP, MVT::v16f32) ||
	        TryPattern({1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15},
	                   X86ISD::MOVSHDUP, MVT::v16f32))
	      return true;
	  }

	  return false;
	}

	// Attempt to match a combined shuffle mask against supported unary immediate
	// permute instructions.
	//
	// The candidates are tried in this order: VPERMI / VPERMILPI for 64-bit
	// elements, PSHUFD / VPERMILPI for 128-bit lane repeated 32/64-bit masks,
	// PSHUFLW / PSHUFHW for 16-bit elements, byte/bit shifts and finally bit
	// rotates (VROTLI).
	//
	// Returns true on a match, with Shuffle set to the opcode, ShuffleVT to the
	// type to perform it in and PermuteImm to the immediate operand. Returns
	// false if nothing matched.
	// NOTE(review): Shuffle and ShuffleVT are handed to matchShuffleAsShift and
	// ShuffleVT to matchShuffleAsBitRotate, so they may have been written even
	// when false is returned - confirm against those helpers.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
	const APInt &Zeroable,
	bool AllowFloatDomain, bool AllowIntDomain,
	const X86Subtarget &Subtarget,
	unsigned &Shuffle, MVT &ShuffleVT,
	unsigned &PermuteImm) {
	unsigned NumMaskElts = Mask.size();
	unsigned InputSizeInBits = MaskVT.getSizeInBits();
	// Element width implied by the mask length (not MaskVT's own scalar size).
	unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
	MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
	bool ContainsZeros = isAnyZero(Mask);

	// Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
	if (!ContainsZeros && MaskScalarSizeInBits == 64) {
	// Check for lane crossing permutes.
	if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
	// PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
	if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
	Shuffle = X86ISD::VPERMI;
	ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
	PermuteImm = getV4X86ShuffleImm(Mask);
	return true;
	}
	// For 512-bit vectors the mask must repeat in both 256-bit halves so that
	// a single 4-element immediate describes it.
	if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
	SmallVector<int, 4> RepeatedMask;
	if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
	Shuffle = X86ISD::VPERMI;
	ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
	PermuteImm = getV4X86ShuffleImm(RepeatedMask);
	return true;
	}
	}
	} else if (AllowFloatDomain && Subtarget.hasAVX()) {
	// VPERMILPD can permute with a non-repeating shuffle.
	Shuffle = X86ISD::VPERMILPI;
	ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
	PermuteImm = 0;
	// One immediate bit per element: bit i is the low bit of mask entry i.
	// Undef entries leave their bit at 0.
	for (int i = 0, e = Mask.size(); i != e; ++i) {
	int M = Mask[i];
	if (M == SM_SentinelUndef)
	continue;
	assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
	PermuteImm \|= (M & 1) << i;
	}
	return true;
	}
	}

	// Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
	// AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
	// had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
	if ((MaskScalarSizeInBits == 64 \|\| MaskScalarSizeInBits == 32) &&
	!ContainsZeros && (AllowIntDomain \|\| Subtarget.hasAVX())) {
	SmallVector<int, 4> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
	// Narrow the repeated mask to create 32-bit element permutes.
	SmallVector<int, 4> WordMask = RepeatedMask;
	if (MaskScalarSizeInBits == 64)
	narrowShuffleMaskElts(2, RepeatedMask, WordMask);

	// The integer form is preferred whenever the integer domain is allowed.
	Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
	ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
	ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
	PermuteImm = getV4X86ShuffleImm(WordMask);
	return true;
	}
	}

	// Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
	if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
	SmallVector<int, 4> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
	// Split the repeated lane mask into its first and last four entries.
	ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
	ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);

	// PSHUFLW: permute lower 4 elements only.
	if (isUndefOrInRange(LoMask, 0, 4) &&
	isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
	Shuffle = X86ISD::PSHUFLW;
	ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
	PermuteImm = getV4X86ShuffleImm(LoMask);
	return true;
	}

	// PSHUFHW: permute upper 4 elements only.
	if (isUndefOrInRange(HiMask, 4, 8) &&
	isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
	// Offset the HiMask so that we can create the shuffle immediate.
	// Negative (sentinel) entries are kept as they are.
	int OffsetHiMask[4];
	for (int i = 0; i != 4; ++i)
	OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);

	Shuffle = X86ISD::PSHUFHW;
	ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
	PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
	return true;
	}
	}
	}

	// Attempt to match against byte/bit shifts.
	if (AllowIntDomain &&
	((MaskVT.is128BitVector() && Subtarget.hasSSE2()) \|\|
	(MaskVT.is256BitVector() && Subtarget.hasAVX2()) \|\|
	(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
	int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
	Mask, 0, Zeroable, Subtarget);
	// A 512-bit shift with elements narrower than 32 bits is only accepted
	// when BWI is available.
	if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() \|\| Subtarget.hasBWI() \|\|
	32 <= ShuffleVT.getScalarSizeInBits())) {
	PermuteImm = (unsigned)ShiftAmt;
	return true;
	}
	}

	// Attempt to match against bit rotates.
	if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
	((MaskVT.is128BitVector() && Subtarget.hasXOP()) \|\|
	Subtarget.hasAVX512())) {
	int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
	Subtarget, Mask);
	if (0 < RotateAmt) {
	Shuffle = X86ISD::VROTLI;
	PermuteImm = (unsigned)RotateAmt;
	return true;
	}
	}

	return false;
	}

	// Try to express a combined shuffle mask as one of the supported binary
	// shuffle instructions: UNPCKL/UNPCKH or MOVLHPS/MOVHLPS with a duplicated
	// input, MOVSD, MOVSS, PACKSS/PACKUS, or a general UNPCKL/UNPCKH.
	//
	// On success returns true with Shuffle set to the opcode and SrcVT / DstVT
	// set to the operand and result types; V1 and V2 are updated to the operands
	// to use. Returns false if nothing matched.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                               bool AllowFloatDomain, bool AllowIntDomain,
	                               SDValue &V1, SDValue &V2, const SDLoc &DL,
	                               SelectionDAG &DAG, const X86Subtarget &Subtarget,
	                               unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
	                               bool IsUnary) {
	  const unsigned EltBits = MaskVT.getScalarSizeInBits();

	  if (MaskVT.is128BitVector()) {
	    // Duplicate the low 64 bits of V1.
	    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
	      V2 = V1;
	      if (Mask[0] == SM_SentinelUndef)
	        V1 = DAG.getUNDEF(MVT::v4f32);
	      if (Subtarget.hasSSE2()) {
	        Shuffle = X86ISD::UNPCKL;
	        SrcVT = DstVT = MVT::v2f64;
	      } else {
	        Shuffle = X86ISD::MOVLHPS;
	        SrcVT = DstVT = MVT::v4f32;
	      }
	      return true;
	    }

	    // Duplicate the high 64 bits of V1.
	    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
	      V2 = V1;
	      if (Subtarget.hasSSE2()) {
	        Shuffle = X86ISD::UNPCKH;
	        SrcVT = DstVT = MVT::v2f64;
	      } else {
	        Shuffle = X86ISD::MOVHLPS;
	        SrcVT = DstVT = MVT::v4f32;
	      }
	      return true;
	    }

	    // MOVSD / MOVSS are used when the float domain is allowed or SSE4.1 is
	    // not available.
	    const bool FloatOrPreSSE41 = AllowFloatDomain || !Subtarget.hasSSE41();

	    if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
	        FloatOrPreSSE41) {
	      std::swap(V1, V2);
	      Shuffle = X86ISD::MOVSD;
	      SrcVT = DstVT = MVT::v2f64;
	      return true;
	    }

	    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && FloatOrPreSSE41) {
	      Shuffle = X86ISD::MOVSS;
	      SrcVT = DstVT = MVT::v4f32;
	      return true;
	    }
	  }

	  // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
	  const bool Pack128 =
	      (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2();
	  const bool Pack256 =
	      (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256();
	  const bool Pack512 =
	      (MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI();
	  if ((Pack128 || Pack256 || Pack512) &&
	      matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
	                           Subtarget)) {
	    DstVT = MaskVT;
	    return true;
	  }

	  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
	  const bool Is256 = MaskVT.is256BitVector();
	  const bool CanUnpack =
	      (MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
	      (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
	      (Is256 && 32 <= EltBits && Subtarget.hasAVX()) ||
	      (Is256 && Subtarget.hasAVX2()) ||
	      (MaskVT.is512BitVector() && Subtarget.hasAVX512());
	  if (CanUnpack && matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask,
	                                         DL, DAG, Subtarget)) {
	    // Without AVX2, 256-bit unpacks are performed in the float domain.
	    if (Is256 && !Subtarget.hasAVX2())
	      SrcVT = DstVT = (EltBits == 32 ? MVT::v8f32 : MVT::v4f64);
	    else
	      SrcVT = DstVT = MaskVT;
	    return true;
	  }

	  return false;
	}

	// Attempt to match a combined shuffle mask against supported binary
	// shuffle instructions that take an immediate operand.
	//
	// The candidates are tried in this order: VALIGN, PALIGNR, BLENDI, INSERTPS
	// (only when the mask contains zero elements), SHUFP on 64-bit elements
	// (SHUFPD), SHUFP on 32-bit elements (SHUFPS) and finally INSERTPS without
	// the zero-element restriction.
	//
	// Returns true on a match, with Shuffle set to the opcode, ShuffleVT to the
	// type to perform it in, PermuteImm to the immediate, and V1 / V2 set to the
	// operands to use (they may be replaced by zero or undef vectors). Returns
	// false if nothing matched.
	// NOTE(review): V1, V2 and PermuteImm are passed to the match* helpers, so
	// they may have been modified even when false is returned - confirm against
	// those helpers.
	static bool matchBinaryPermuteShuffle(
	MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
	bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
	const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
	unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
	unsigned NumMaskElts = Mask.size();
	unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

	// Attempt to match against VALIGND/VALIGNQ rotate.
	if (AllowIntDomain && (EltSizeInBits == 64 \|\| EltSizeInBits == 32) &&
	((MaskVT.is128BitVector() && Subtarget.hasVLX()) \|\|
	(MaskVT.is256BitVector() && Subtarget.hasVLX()) \|\|
	(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
	// Masks that contain zero elements are not considered for VALIGN.
	if (!isAnyZero(Mask)) {
	int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
	if (0 < Rotation) {
	Shuffle = X86ISD::VALIGN;
	if (EltSizeInBits == 64)
	ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
	else
	ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
	PermuteImm = Rotation;
	return true;
	}
	}
	}

	// Attempt to match against PALIGNR byte rotate.
	if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) \|\|
	(MaskVT.is256BitVector() && Subtarget.hasAVX2()) \|\|
	(MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
	int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
	if (0 < ByteRotation) {
	Shuffle = X86ISD::PALIGNR;
	// PALIGNR is always performed on a vector of bytes.
	ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
	PermuteImm = ByteRotation;
	return true;
	}
	}

	// Attempt to combine to X86ISD::BLENDI.
	if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) \|\|
	(Subtarget.hasAVX() && MaskVT.is256BitVector()))) \|\|
	(MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
	uint64_t BlendMask = 0;
	bool ForceV1Zero = false, ForceV2Zero = false;
	// matchShuffleAsBlend works on a private copy of the mask.
	SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
	if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
	ForceV2Zero, BlendMask)) {
	if (MaskVT == MVT::v16i16) {
	// We can only use v16i16 PBLENDW if the lanes are repeated.
	SmallVector<int, 8> RepeatedMask;
	if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
	RepeatedMask)) {
	assert(RepeatedMask.size() == 8 &&
	"Repeated mask size doesn't match!");
	// Immediate bit i is set when repeated element i has index >= 8.
	PermuteImm = 0;
	for (int i = 0; i < 8; ++i)
	if (RepeatedMask[i] >= 8)
	PermuteImm \|= 1 << i;
	V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
	V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
	Shuffle = X86ISD::BLENDI;
	ShuffleVT = MaskVT;
	return true;
	}
	} else {
	V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
	V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
	PermuteImm = (unsigned)BlendMask;
	Shuffle = X86ISD::BLENDI;
	ShuffleVT = MaskVT;
	return true;
	}
	}
	}

	// Attempt to combine to INSERTPS, but only if it has elements that need to
	// be set to zero.
	if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
	MaskVT.is128BitVector() && isAnyZero(Mask) &&
	matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
	Shuffle = X86ISD::INSERTPS;
	ShuffleVT = MVT::v4f32;
	return true;
	}

	// Attempt to combine to SHUFPD.
	if (AllowFloatDomain && EltSizeInBits == 64 &&
	((MaskVT.is128BitVector() && Subtarget.hasSSE2()) \|\|
	(MaskVT.is256BitVector() && Subtarget.hasAVX()) \|\|
	(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
	bool ForceV1Zero = false, ForceV2Zero = false;
	if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
	PermuteImm, Mask, Zeroable)) {
	V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
	V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
	Shuffle = X86ISD::SHUFP;
	ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
	return true;
	}
	}

	// Attempt to combine to SHUFPS.
	if (AllowFloatDomain && EltSizeInBits == 32 &&
	((MaskVT.is128BitVector() && Subtarget.hasSSE1()) \|\|
	(MaskVT.is256BitVector() && Subtarget.hasAVX()) \|\|
	(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
	SmallVector<int, 4> RepeatedMask;
	if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
	// Match each half of the repeated mask, to determine if its just
	// referencing one of the vectors, is zeroable or entirely undef.
	// Returns the vector to use for that half (an empty SDValue if the half
	// mixes sources) and writes the two element selectors to S0 / S1. For an
	// entirely undef half S0 / S1 are left untouched.
	auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
	int M0 = RepeatedMask[Offset];
	int M1 = RepeatedMask[Offset + 1];

	if (isUndefInRange(RepeatedMask, Offset, 2)) {
	return DAG.getUNDEF(MaskVT);
	} else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
	S0 = (SM_SentinelUndef == M0 ? -1 : 0);
	S1 = (SM_SentinelUndef == M1 ? -1 : 1);
	return getZeroVector(MaskVT, Subtarget, DAG, DL);
	} else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
	S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
	S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
	return V1;
	} else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
	S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
	S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
	return V2;
	}

	return SDValue();
	};

	// The first two selectors come from the low half's source, the last two
	// from the high half's source.
	int ShufMask[4] = {-1, -1, -1, -1};
	SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
	SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);

	if (Lo && Hi) {
	V1 = Lo;
	V2 = Hi;
	Shuffle = X86ISD::SHUFP;
	ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
	PermuteImm = getV4X86ShuffleImm(ShufMask);
	return true;
	}
	}
	}

	// Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
	if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
	MaskVT.is128BitVector() &&
	matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
	Shuffle = X86ISD::INSERTPS;
	ShuffleVT = MVT::v4f32;
	return true;
	}

	return false;
	}

	// Forward declaration: combineX86ShuffleChain (defined next) calls this
	// helper, and the helper's own definition calls back into
	// combineX86ShuffleChain, so one of the two must be declared ahead of use.
	static SDValue combineX86ShuffleChainWithExtract(
	ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
	bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
	const X86Subtarget &Subtarget);

	/// Combine an arbitrary chain of shuffles into a single instruction if
	/// possible.
	///
	/// This is the leaf of the recursive combine below. When we have found some
	/// chain of single-use x86 shuffle instructions and accumulated the combined
	/// shuffle mask represented by them, this will try to pattern match that mask
	/// into either a single instruction if there is a special purpose instruction
	/// for this operation, or into a PSHUFB instruction which is a fully general
	/// instruction but should only be used to replace chains over a certain depth.
	///
	/// \p Inputs holds the one or two source operands feeding the chain and
	/// \p BaseMask is the non-empty accumulated mask over them. \p Root supplies
	/// the result type and the debug location; whenever \p Depth is 0 and the
	/// matched opcode equals Root's own opcode, an empty SDValue is returned
	/// instead of rebuilding the same node. Variable-mask shuffles are only
	/// emitted when \p AllowVariableMask is set and either \p Depth reaches the
	/// subtarget-dependent threshold or \p HasVariableMask is set.
	///
	/// \returns the replacement value bitcast to Root's type, or an empty SDValue
	/// if no combine was found.
	static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
	ArrayRef<int> BaseMask, int Depth,
	bool HasVariableMask,
	bool AllowVariableMask, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
	assert((Inputs.size() == 1 \|\| Inputs.size() == 2) &&
	"Unexpected number of shuffle inputs!");

	// Find the inputs that enter the chain. Note that multiple uses are OK
	// here, we're not going to remove the operands we find.
	bool UnaryShuffle = (Inputs.size() == 1);
	SDValue V1 = peekThroughBitcasts(Inputs[0]);
	SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
	: peekThroughBitcasts(Inputs[1]));

	MVT VT1 = V1.getSimpleValueType();
	MVT VT2 = V2.getSimpleValueType();
	MVT RootVT = Root.getSimpleValueType();
	assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
	VT2.getSizeInBits() == RootVT.getSizeInBits() &&
	"Vector size mismatch");

	SDLoc DL(Root);
	SDValue Res;

	// A single-element mask can only select element 0, so the first input is
	// reused directly.
	unsigned NumBaseMaskElts = BaseMask.size();
	if (NumBaseMaskElts == 1) {
	assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
	return DAG.getBitcast(RootVT, V1);
	}

	bool OptForSize = DAG.shouldOptForSize();
	unsigned RootSizeInBits = RootVT.getSizeInBits();
	unsigned NumRootElts = RootVT.getVectorNumElements();
	unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
	bool FloatDomain = VT1.isFloatingPoint() \|\| VT2.isFloatingPoint() \|\|
	(RootVT.isFloatingPoint() && Depth >= 1) \|\|
	(RootVT.is256BitVector() && !Subtarget.hasAVX2());

	// Don't combine if we are an AVX512/EVEX target and the mask element size
	// is different from the root element size - this would prevent writemasks
	// from being reused.
	bool IsMaskedShuffle = false;
	if (RootSizeInBits == 512 \|\| (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
	if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
	Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
	IsMaskedShuffle = true;
	}
	}

	// If we are shuffling a broadcast (and not introducing zeros) then
	// we can just use the broadcast directly. This works for smaller broadcast
	// elements as well as they already repeat across each mask element.
	if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
	(BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0) {
	return DAG.getBitcast(RootVT, V1);
	}

	// Attempt to match a subvector broadcast.
	// shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
	if (UnaryShuffle &&
	(BaseMaskEltSizeInBits == 128 \|\| BaseMaskEltSizeInBits == 256)) {
	SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);
	if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {
	SDValue Src = Inputs[0];
	if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
	Src.getOperand(0).isUndef() &&
	Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
	MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
	return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
	Src.getValueType(),
	Src.getOperand(1)));
	}
	}
	}

	// Handle 128/256-bit lane shuffles of 512-bit vectors.
	if (RootVT.is512BitVector() &&
	(NumBaseMaskElts == 2 \|\| NumBaseMaskElts == 4)) {
	MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);

	// If the upper subvectors are zeroable, then an extract+insert is more
	// optimal than using X86ISD::SHUF128. The insertion is free, even if it has
	// to zero the upper subvectors.
	if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
	if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
	return SDValue(); // Nothing to do!
	assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
	"Unexpected lane shuffle");
	Res = DAG.getBitcast(ShuffleVT, V1);
	unsigned SubIdx = BaseMask[0] * (8 / NumBaseMaskElts);
	bool UseZero = isAnyZero(BaseMask);
	Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
	Res = widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
	return DAG.getBitcast(RootVT, Res);
	}

	// Narrow shuffle mask to v4x128.
	SmallVector<int, 4> Mask;
	assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
	narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);

	// Try to lower to vshuf64x2/vshuf32x4.
	auto MatchSHUF128 = [](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
	SDValue V1, SDValue V2, SelectionDAG &DAG) {
	unsigned PermMask = 0;
	// Ensure elements came from the same Op.
	SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
	for (int i = 0; i < 4; ++i) {
	assert(Mask[i] >= -1 && "Illegal shuffle sentinel value");
	if (Mask[i] < 0)
	continue;

	SDValue Op = Mask[i] >= 4 ? V2 : V1;
	unsigned OpIndex = i / 2;
	if (Ops[OpIndex].isUndef())
	Ops[OpIndex] = Op;
	else if (Ops[OpIndex] != Op)
	return SDValue();

	// Convert the 128-bit shuffle mask selection values into 128-bit
	// selection bits defined by a vshuf64x2 instruction's immediate control
	// byte.
	PermMask \|= (Mask[i] % 4) << (i * 2);
	}

	return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
	DAG.getBitcast(ShuffleVT, Ops[0]),
	DAG.getBitcast(ShuffleVT, Ops[1]),
	DAG.getTargetConstant(PermMask, DL, MVT::i8));
	};

	// FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
	// doesn't work because our mask is for 128 bits and we don't have an MVT
	// to match that.
	bool PreferPERMQ =
	UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
	isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
	isUndefOrInRange(Mask[3], 2, 4) &&
	(Mask[0] < 0 \|\| Mask[2] < 0 \|\| Mask[0] == (Mask[2] % 2)) &&
	(Mask[1] < 0 \|\| Mask[3] < 0 \|\| Mask[1] == (Mask[3] % 2));

	if (!isAnyZero(Mask) && !PreferPERMQ) {
	if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
	return DAG.getBitcast(RootVT, V);
	}
	}

	// Handle 128-bit lane shuffles of 256-bit vectors.
	if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
	MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);

	// If the upper half is zeroable, then an extract+insert is more optimal
	// than using X86ISD::VPERM2X128. The insertion is free, even if it has to
	// zero the upper half.
	if (isUndefOrZero(BaseMask[1])) {
	if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
	return SDValue(); // Nothing to do!
	assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
	Res = DAG.getBitcast(ShuffleVT, V1);
	Res = extract128BitVector(Res, BaseMask[0] * 2, DAG, DL);
	Res = widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
	DL, 256);
	return DAG.getBitcast(RootVT, Res);
	}

	if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
	return SDValue(); // Nothing to do!

	// If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
	// we need to use the zeroing feature.
	// Prefer blends for sequential shuffles unless we are optimizing for size.
	if (UnaryShuffle &&
	!(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
	(OptForSize \|\| !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
	unsigned PermMask = 0;
	PermMask \|= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
	PermMask \|= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);

	Res = DAG.getBitcast(ShuffleVT, V1);
	Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
	DAG.getUNDEF(ShuffleVT),
	DAG.getTargetConstant(PermMask, DL, MVT::i8));
	return DAG.getBitcast(RootVT, Res);
	}

	if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
	return SDValue(); // Nothing to do!

	// TODO - handle AVX512VL cases with X86ISD::SHUF128.
	if (!UnaryShuffle && !IsMaskedShuffle) {
	assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
	"Unexpected shuffle sentinel value");
	// Prefer blends to X86ISD::VPERM2X128.
	if (!((BaseMask[0] == 0 && BaseMask[1] == 3) \|\|
	(BaseMask[0] == 2 && BaseMask[1] == 1))) {
	unsigned PermMask = 0;
	PermMask \|= ((BaseMask[0] & 3) << 0);
	PermMask \|= ((BaseMask[1] & 3) << 4);

	Res = DAG.getNode(
	X86ISD::VPERM2X128, DL, ShuffleVT,
	DAG.getBitcast(ShuffleVT, isInRange(BaseMask[0], 0, 2) ? V1 : V2),
	DAG.getBitcast(ShuffleVT, isInRange(BaseMask[1], 0, 2) ? V1 : V2),
	DAG.getTargetConstant(PermMask, DL, MVT::i8));
	return DAG.getBitcast(RootVT, Res);
	}
	}
	}

	// For masks that have been widened to 128-bit elements or more,
	// narrow back down to 64-bit elements.
	SmallVector<int, 64> Mask;
	if (BaseMaskEltSizeInBits > 64) {
	assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
	int MaskScale = BaseMaskEltSizeInBits / 64;
	narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
	} else {
	Mask.assign(BaseMask.begin(), BaseMask.end());
	}

	// For masked shuffles, we're trying to match the root width for better
	// writemask folding, attempt to scale the mask.
	// TODO - variable shuffles might need this to be widened again.
	if (IsMaskedShuffle && NumRootElts > Mask.size()) {
	assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
	int MaskScale = NumRootElts / Mask.size();
	SmallVector<int, 64> ScaledMask;
	narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
	Mask = std::move(ScaledMask);
	}

	unsigned NumMaskElts = Mask.size();
	unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;

	// Determine the effective mask value type.
	FloatDomain &= (32 <= MaskEltSizeInBits);
	MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
	: MVT::getIntegerVT(MaskEltSizeInBits);
	MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);

	// Only allow legal mask types.
	if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
	return SDValue();

	// Attempt to match the mask against known shuffle patterns.
	MVT ShuffleSrcVT, ShuffleVT;
	unsigned Shuffle, PermuteImm;

	// Which shuffle domains are permitted?
	// Permit domain crossing at higher combine depths.
	// TODO: Should we indicate which domain is preferred if both are allowed?
	bool AllowFloatDomain = FloatDomain \|\| (Depth >= 3);
	bool AllowIntDomain = (!FloatDomain \|\| (Depth >= 3)) && Subtarget.hasSSE2() &&
	(!MaskVT.is256BitVector() \|\| Subtarget.hasAVX2());

	// Determine zeroable mask elements.
	APInt KnownUndef, KnownZero;
	resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
	APInt Zeroable = KnownUndef \| KnownZero;

	if (UnaryShuffle) {
	// Attempt to match against broadcast-from-vector.
	// Limit AVX1 to cases where we're loading+broadcasting a scalar element.
	if ((Subtarget.hasAVX2() \|\|
	(Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
	(!IsMaskedShuffle \|\| NumRootElts == NumMaskElts)) {
	SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
	if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
	if (V1.getValueType() == MaskVT &&
	V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	MayFoldLoad(V1.getOperand(0))) {
	if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
	return SDValue(); // Nothing to do!
	Res = V1.getOperand(0);
	Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
	return DAG.getBitcast(RootVT, Res);
	}
	if (Subtarget.hasAVX2()) {
	if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
	return SDValue(); // Nothing to do!
	Res = DAG.getBitcast(MaskVT, V1);
	Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
	return DAG.getBitcast(RootVT, Res);
	}
	}
	}

	SDValue NewV1 = V1; // Save operand in case early exit happens.
	if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
	DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
	ShuffleVT) &&
	(!IsMaskedShuffle \|\|
	(NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 0 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
	return DAG.getBitcast(RootVT, Res);
	}

	if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
	AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
	PermuteImm) &&
	(!IsMaskedShuffle \|\|
	(NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 0 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	Res = DAG.getBitcast(ShuffleVT, V1);
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
	DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
	return DAG.getBitcast(RootVT, Res);
	}
	}

	// Attempt to combine to INSERTPS, but only if the inserted element has come
	// from a scalar.
	// TODO: Handle other insertions here as well?
	if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
	MaskEltSizeInBits == 32 && Subtarget.hasSSE41() &&
	!isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) {
	SDValue SrcV1 = V1, SrcV2 = V2;
	if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, DAG) &&
	SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
	if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
	return SDValue(); // Nothing to do!
	Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
	DAG.getBitcast(MVT::v4f32, SrcV1),
	DAG.getBitcast(MVT::v4f32, SrcV2),
	DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
	return DAG.getBitcast(RootVT, Res);
	}
	}

	SDValue NewV1 = V1; // Save operands in case early exit happens.
	SDValue NewV2 = V2;
	if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
	NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
	ShuffleVT, UnaryShuffle) &&
	(!IsMaskedShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 0 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
	NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
	return DAG.getBitcast(RootVT, Res);
	}

	NewV1 = V1; // Save operands in case early exit happens.
	NewV2 = V2;
	if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
	AllowIntDomain, NewV1, NewV2, DL, DAG,
	Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
	(!IsMaskedShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 0 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
	NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
	DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
	return DAG.getBitcast(RootVT, Res);
	}

	// Typically from here on, we need an integer version of MaskVT.
	MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
	IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);

	// Annoyingly, SSE4A instructions don't map into the above match helpers.
	if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
	uint64_t BitLen, BitIdx;
	if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
	Zeroable)) {
	if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
	return SDValue(); // Nothing to do!
	V1 = DAG.getBitcast(IntMaskVT, V1);
	Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
	DAG.getTargetConstant(BitLen, DL, MVT::i8),
	DAG.getTargetConstant(BitIdx, DL, MVT::i8));
	return DAG.getBitcast(RootVT, Res);
	}

	if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
	if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
	return SDValue(); // Nothing to do!
	V1 = DAG.getBitcast(IntMaskVT, V1);
	V2 = DAG.getBitcast(IntMaskVT, V2);
	Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
	DAG.getTargetConstant(BitLen, DL, MVT::i8),
	DAG.getTargetConstant(BitIdx, DL, MVT::i8));
	return DAG.getBitcast(RootVT, Res);
	}
	}

	// Match shuffle against TRUNCATE patterns.
	if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
	// Match against a VTRUNC instruction, accounting for src/dst sizes.
	if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
	Subtarget)) {
	bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
	ShuffleSrcVT.getVectorNumElements();
	unsigned Opc =
	IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
	if (Depth == 0 && Root.getOpcode() == Opc)
	return SDValue(); // Nothing to do!
	V1 = DAG.getBitcast(ShuffleSrcVT, V1);
	Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
	if (ShuffleVT.getSizeInBits() < RootSizeInBits)
	Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
	return DAG.getBitcast(RootVT, Res);
	}

	// Do we need a more general binary truncation pattern?
	if (RootSizeInBits < 512 &&
	((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) \|\|
	(RootVT.is128BitVector() && Subtarget.hasVLX())) &&
	(MaskEltSizeInBits > 8 \|\| Subtarget.hasBWI()) &&
	isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
	if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
	return SDValue(); // Nothing to do!
	ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
	ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
	V1 = DAG.getBitcast(ShuffleSrcVT, V1);
	V2 = DAG.getBitcast(ShuffleSrcVT, V2);
	ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
	ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
	Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
	Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
	return DAG.getBitcast(RootVT, Res);
	}
	}

	// Don't try to re-form single instruction chains under any circumstances now
	// that we've done encoding canonicalization for them.
	if (Depth < 1)
	return SDValue();

	// Depth threshold above which we can efficiently use variable mask shuffles.
	int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
	AllowVariableMask &= (Depth >= VariableShuffleDepth) \|\| HasVariableMask;

	bool MaskContainsZeros = isAnyZero(Mask);

	if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
	// If we have a single input lane-crossing shuffle then lower to VPERMV.
	if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
	((Subtarget.hasAVX2() &&
	(MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	Res = DAG.getBitcast(MaskVT, V1);
	Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
	return DAG.getBitcast(RootVT, Res);
	}

	// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
	// vector as the second source.
	if (UnaryShuffle && AllowVariableMask &&
	((Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasVLX() &&
	(MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4i64 \|\|
	MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	// Adjust shuffle mask - replace SM_SentinelZero with second source index.
	for (unsigned i = 0; i != NumMaskElts; ++i)
	if (Mask[i] == SM_SentinelZero)
	Mask[i] = NumMaskElts + i;

	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	Res = DAG.getBitcast(MaskVT, V1);
	SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
	Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
	return DAG.getBitcast(RootVT, Res);
	}

	// If that failed and either input is extracted then try to combine as a
	// shuffle with the larger type.
	if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
	Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
	DAG, Subtarget))
	return WideShuffle;

	// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
	if (AllowVariableMask && !MaskContainsZeros &&
	((Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasVLX() &&
	(MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4i64 \|\|
	MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	V1 = DAG.getBitcast(MaskVT, V1);
	V2 = DAG.getBitcast(MaskVT, V2);
	Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
	return DAG.getBitcast(RootVT, Res);
	}
	return SDValue();
	}

	// See if we can combine a single input shuffle with zeros to a bit-mask,
	// which is much simpler than any shuffle.
	if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
	isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
	DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
	APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
	APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
	APInt UndefElts(NumMaskElts, 0);
	SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
	for (unsigned i = 0; i != NumMaskElts; ++i) {
	int M = Mask[i];
	if (M == SM_SentinelUndef) {
	UndefElts.setBit(i);
	continue;
	}
	if (M == SM_SentinelZero)
	continue;
	EltBits[i] = AllOnes;
	}
	SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
	Res = DAG.getBitcast(MaskVT, V1);
	unsigned AndOpcode =
	MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
	Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
	return DAG.getBitcast(RootVT, Res);
	}

	// If we have a single input shuffle with different shuffle patterns in
	// the 128-bit lanes, use the variable mask form of VPERMILPS.
	// TODO Combine other mask types at higher depths.
	if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
	((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) \|\|
	(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
	SmallVector<SDValue, 16> VPermIdx;
	for (int M : Mask) {
	SDValue Idx =
	M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
	VPermIdx.push_back(Idx);
	}
	SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
	Res = DAG.getBitcast(MaskVT, V1);
	Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
	return DAG.getBitcast(RootVT, Res);
	}

	// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
	// to VPERMIL2PD/VPERMIL2PS.
	if (AllowVariableMask && Subtarget.hasXOP() &&
	(MaskVT == MVT::v2f64 \|\| MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4f32 \|\|
	MaskVT == MVT::v8f32)) {
	// VPERMIL2 Operation.
	// Bits[3] - Match Bit.
	// Bits[2:1] - (Per Lane) PD Shuffle Mask.
	// Bits[2:0] - (Per Lane) PS Shuffle Mask.
	unsigned NumLanes = MaskVT.getSizeInBits() / 128;
	unsigned NumEltsPerLane = NumMaskElts / NumLanes;
	SmallVector<int, 8> VPerm2Idx;
	unsigned M2ZImm = 0;
	for (int M : Mask) {
	if (M == SM_SentinelUndef) {
	VPerm2Idx.push_back(-1);
	continue;
	}
	if (M == SM_SentinelZero) {
	M2ZImm = 2;
	VPerm2Idx.push_back(8);
	continue;
	}
	int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
	Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
	VPerm2Idx.push_back(Index);
	}
	V1 = DAG.getBitcast(MaskVT, V1);
	V2 = DAG.getBitcast(MaskVT, V2);
	SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
	Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
	DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
	return DAG.getBitcast(RootVT, Res);
	}

	// If we have 3 or more shuffle instructions or a chain involving a variable
	// mask, we can replace them with a single PSHUFB instruction profitably.
	// Intel's manuals suggest only using PSHUFB if doing so replaces 5
	// instructions, but in practice PSHUFB tends to be very fast so we're
	// more aggressive.
	if (UnaryShuffle && AllowVariableMask &&
	((RootVT.is128BitVector() && Subtarget.hasSSSE3()) \|\|
	(RootVT.is256BitVector() && Subtarget.hasAVX2()) \|\|
	(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
	SmallVector<SDValue, 16> PSHUFBMask;
	int NumBytes = RootVT.getSizeInBits() / 8;
	int Ratio = NumBytes / NumMaskElts;
	for (int i = 0; i < NumBytes; ++i) {
	int M = Mask[i / Ratio];
	if (M == SM_SentinelUndef) {
	PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
	continue;
	}
	if (M == SM_SentinelZero) {
	PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
	continue;
	}
	M = Ratio * M + i % Ratio;
	assert((M / 16) == (i / 16) && "Lane crossing detected");
	PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
	}
	MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
	Res = DAG.getBitcast(ByteVT, V1);
	SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
	Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
	return DAG.getBitcast(RootVT, Res);
	}

	// With XOP, if we have a 128-bit binary input shuffle we can always combine
	// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
	// slower than PSHUFB on targets that support both.
	if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
	// VPPERM Mask Operation
	// Bits[4:0] - Byte Index (0 - 31)
	// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
	SmallVector<SDValue, 16> VPPERMMask;
	int NumBytes = 16;
	int Ratio = NumBytes / NumMaskElts;
	for (int i = 0; i < NumBytes; ++i) {
	int M = Mask[i / Ratio];
	if (M == SM_SentinelUndef) {
	VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
	continue;
	}
	if (M == SM_SentinelZero) {
	VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
	continue;
	}
	M = Ratio * M + i % Ratio;
	VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
	}
	MVT ByteVT = MVT::v16i8;
	V1 = DAG.getBitcast(ByteVT, V1);
	V2 = DAG.getBitcast(ByteVT, V2);
	SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
	Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
	return DAG.getBitcast(RootVT, Res);
	}

	// If that failed and either input is extracted then try to combine as a
	// shuffle with the larger type.
	if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
	Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
	DAG, Subtarget))
	return WideShuffle;

	// If we have a dual input shuffle then lower to VPERMV3.
	if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
	((Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasVLX() &&
	(MaskVT == MVT::v2f64 \|\| MaskVT == MVT::v2i64 \|\| MaskVT == MVT::v4f64 \|\|
	MaskVT == MVT::v4i64 \|\| MaskVT == MVT::v4f32 \|\| MaskVT == MVT::v4i32 \|\|
	MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() &&
	(MaskVT == MVT::v8i16 \|\| MaskVT == MVT::v16i16)) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() &&
	(MaskVT == MVT::v16i8 \|\| MaskVT == MVT::v32i8)))) {
	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	V1 = DAG.getBitcast(MaskVT, V1);
	V2 = DAG.getBitcast(MaskVT, V2);
	Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
	return DAG.getBitcast(RootVT, Res);
	}

	// Failed to find any combines.
	return SDValue();
	}

	// Try to combine a shuffle chain whose inputs are EXTRACT_SUBVECTOR nodes by
	// redoing the combine on the wider source vectors.
	//
	// Each input is traced back through its extract_subvector chain, the mask is
	// rebased onto the wider vectors, and combineX86ShuffleChain is retried at
	// that width:
	// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
	// -->
	// extract_subvector(shuffle(x,y,m2),0)
	//
	// Returns an empty SDValue when no input comes from a non-zero offset, when
	// the widened inputs are not usable, or when the wide combine fails.
	static SDValue combineX86ShuffleChainWithExtract(
	    ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
	    bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
	    const X86Subtarget &Subtarget) {
	  if (Inputs.empty())
	    return SDValue();

	  const unsigned NumInputs = Inputs.size();
	  const unsigned NumMaskElts = BaseMask.size();

	  SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
	  SmallVector<unsigned, 4> Offsets(NumInputs, 0);

	  // Walk each input back through bitcasts and then any chain of subvector
	  // extractions, accumulating the element offset and tracking the widest
	  // source seen.
	  // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
	  unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
	  for (unsigned Idx = 0; Idx < NumInputs; ++Idx) {
	    SDValue Op = peekThroughBitcasts(WideInputs[Idx]);
	    const unsigned NumBaseElts = Op.getValueType().getVectorNumElements();

	    unsigned EltOffset = 0;
	    for (; Op.getOpcode() == ISD::EXTRACT_SUBVECTOR; Op = Op.getOperand(0))
	      EltOffset += Op.getConstantOperandVal(1);

	    const unsigned OpSizeInBits = Op.getValueSizeInBits();
	    if (OpSizeInBits > WideSizeInBits)
	      WideSizeInBits = OpSizeInBits;

	    assert((EltOffset % NumBaseElts) == 0 &&
	           "Unexpected subvector extraction");
	    // Convert the element offset into units of whole base masks.
	    Offsets[Idx] = (EltOffset / NumBaseElts) * NumMaskElts;
	    WideInputs[Idx] = Op;
	  }

	  // If every input is taken from the lowest subvector there is nothing to
	  // gain; combineX86ShuffleChain should match this for the current width.
	  if (!llvm::any_of(Offsets, [](unsigned Offset) { return Offset != 0; }))
	    return SDValue();

	  EVT RootVT = Root.getValueType();
	  const unsigned RootSizeInBits = RootVT.getSizeInBits();
	  assert((WideSizeInBits % RootSizeInBits) == 0 &&
	         "Unexpected subvector extraction");
	  const unsigned Scale = WideSizeInBits / RootSizeInBits;

	  // All wide sources must have a legal type and share one scalar type.
	  // TODO: Support different scalar types?
	  const auto &TLI = DAG.getTargetLoweringInfo();
	  EVT WideSVT = WideInputs[0].getValueType().getScalarType();
	  for (const SDValue &Op : WideInputs) {
	    EVT OpVT = Op.getValueType();
	    if (!TLI.isTypeLegal(OpVT))
	      return SDValue();
	    if (OpVT.getScalarType() != WideSVT)
	      return SDValue();
	  }

	  // Pad narrower sources up to the common wide size.
	  for (unsigned Idx = 0; Idx < NumInputs; ++Idx) {
	    SDValue &Wide = WideInputs[Idx];
	    assert((WideSizeInBits % Wide.getValueSizeInBits()) == 0 &&
	           "Shuffle vector size mismatch");
	    if (WideSizeInBits > Wide.getValueSizeInBits())
	      Wide = widenSubVector(Wide, false, Subtarget, DAG, SDLoc(Wide),
	                            WideSizeInBits);
	    assert(WideSizeInBits == Wide.getValueSizeInBits() &&
	           "Unexpected subvector extraction");
	  }

	  // Rebase the mask onto the wide inputs: input N's elements start at
	  // N * Scale * NumMaskElts in the widened index space.
	  const unsigned WideNumMaskElts = Scale * NumMaskElts;
	  for (unsigned Idx = 1; Idx < NumInputs; ++Idx)
	    Offsets[Idx] += Idx * WideNumMaskElts;

	  SmallVector<int, 64> WideMask;
	  for (int M : BaseMask) {
	    // Sentinel (negative) entries are carried over unchanged.
	    if (M >= 0)
	      M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
	    WideMask.push_back(M);
	  }
	  // The extra upper elements of the wide result are don't-care.
	  WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);

	  // Remove unused/repeated shuffle source ops.
	  resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
	  assert(!WideInputs.empty() && "Shuffle with no inputs detected");

	  if (WideInputs.size() > 2)
	    return SDValue();

	  // Increase depth for every upper subvector we've peeked through.
	  Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });

	  // Attempt to combine wider chain.
	  // TODO: Can we use a better Root?
	  SDValue WideRoot = WideInputs[0];
	  SDValue WideShuffle = combineX86ShuffleChain(
	      WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
	      AllowVariableMask, DAG, Subtarget);
	  if (!WideShuffle)
	    return SDValue();

	  // Only the low RootSizeInBits of the wide result are wanted.
	  WideShuffle =
	      extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
	  return DAG.getBitcast(RootVT, WideShuffle);
	}

	// Attempt to constant fold all of the constant source ops.
	//
	// Each entry of Ops is decoded with getTargetConstantBitsFromNode at the
	// element width implied by Mask (Root's bit size divided by Mask.size()). The
	// decoded elements are then permuted according to Mask, where indices in
	// [i * Mask.size(), (i + 1) * Mask.size()) select from Ops[i] and the
	// SM_SentinelUndef / SM_SentinelZero sentinels produce undef / zero lanes.
	//
	// Returns the folded constant bitcast to Root's type (or a zero vector when
	// every lane is undef or zero). Returns an empty SDValue if any op could not
	// be decoded, if folding is rejected to limit constant pool growth, or if the
	// vector type needed to build the constant is not legal.
	// TODO: Extend this to merge multiple constant Ops and update the mask.
	static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
	                                           ArrayRef<int> Mask, SDValue Root,
	                                           bool HasVariableMask,
	                                           SelectionDAG &DAG,
	                                           const X86Subtarget &Subtarget) {
	  MVT VT = Root.getSimpleValueType();

	  unsigned SizeInBits = VT.getSizeInBits();
	  unsigned NumMaskElts = Mask.size();
	  unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
	  unsigned NumOps = Ops.size();

	  // Extract constant bits from each source op.
	  bool OneUseConstantOp = false;
	  SmallVector<APInt, 16> UndefEltsOps(NumOps);
	  SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
	  for (unsigned i = 0; i != NumOps; ++i) {
	    SDValue SrcOp = Ops[i];
	    OneUseConstantOp |= SrcOp.hasOneUse();
	    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
	                                       RawBitsOps[i]))
	      return SDValue();
	  }

	  // Only fold if at least one of the constants is only used once or
	  // the combined shuffle has included a variable mask shuffle, this
	  // is to avoid constant pool bloat.
	  if (!OneUseConstantOp && !HasVariableMask)
	    return SDValue();

	  // Shuffle the constant bits according to the mask.
	  // Each result lane ends up in exactly one of the three sets below.
	  SDLoc DL(Root);
	  APInt UndefElts(NumMaskElts, 0);
	  APInt ZeroElts(NumMaskElts, 0);
	  APInt ConstantElts(NumMaskElts, 0);
	  SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
	                                        APInt::getNullValue(MaskSizeInBits));
	  for (unsigned i = 0; i != NumMaskElts; ++i) {
	    int M = Mask[i];
	    if (M == SM_SentinelUndef) {
	      UndefElts.setBit(i);
	      continue;
	    } else if (M == SM_SentinelZero) {
	      ZeroElts.setBit(i);
	      continue;
	    }
	    assert(0 <= M && M < (int)(NumMaskElts * NumOps));

	    // Split the flat mask index into (source op, element within that op).
	    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
	    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;

	    auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
	    if (SrcUndefElts[SrcMaskIdx]) {
	      UndefElts.setBit(i);
	      continue;
	    }

	    // An all-zero source element is tracked as a zero lane rather than as
	    // constant data.
	    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
	    APInt &Bits = SrcEltBits[SrcMaskIdx];
	    if (!Bits) {
	      ZeroElts.setBit(i);
	      continue;
	    }

	    ConstantElts.setBit(i);
	    ConstantBitData[i] = Bits;
	  }
	  assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());

	  // Attempt to create a zero vector.
	  if ((UndefElts | ZeroElts).isAllOnesValue())
	    return getZeroVector(VT, Subtarget, DAG, DL);

	  // Create the constant data.
	  // Use a floating point element type only for FP roots with 32/64-bit mask
	  // elements; everything else is built as integers and bitcast afterwards.
	  MVT MaskSVT;
	  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
	    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
	  else
	    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);

	  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
	    return SDValue();

	  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
	  return DAG.getBitcast(VT, CstOp);
	}

	/// Fully generic combining of x86 shuffle instructions.
	///
	/// This should be the last combine run over the x86 shuffle instructions. Once
	/// they have been fully optimized, this will recursively consider all chains
	/// of single-use shuffle instructions, build a generic model of the cumulative
	/// shuffle operation, and check for simpler instructions which implement this
	/// operation. We use this primarily for two purposes:
	///
	/// 1) Collapse generic shuffles to specialized single instructions when
	///    equivalent. In most cases, this is just an encoding size win, but
	///    sometimes we will collapse multiple generic shuffles into a single
	///    special-purpose shuffle.
	/// 2) Look for sequences of shuffle instructions with 3 or more total
	///    instructions, and replace them with the slightly more expensive SSSE3
	///    PSHUFB instruction if available. We do this as the last combining step
	///    to ensure we avoid using PSHUFB if we can implement the shuffle with
	///    a suitable short sequence of other instructions. The PSHUFB will either
	///    use a register or have to read from memory and so is slightly (but only
	///    slightly) more expensive than the other shuffle instructions.
	///
	/// Because this is inherently a quadratic operation (for each shuffle in
	/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
	/// This should never be an issue in practice as the shuffle lowering doesn't
	/// produce sequences of more than 8 instructions.
	///
	/// FIXME: We will currently miss some cases where the redundant shuffling
	/// would simplify under the threshold for PSHUFB formation because of
	/// combine-ordering. To fix this, we should do the redundant instruction
	/// combining in this recursive walk.
	///
	/// \param SrcOps            Source operands accumulated so far.
	/// \param SrcOpIndex        Index into \p SrcOps of the operand examined by
	///                          this call.
	/// \param Root              Node the chain is rooted at; supplies the result
	///                          type and debug location.
	/// \param RootMask          Mask accumulated so far, expressed over \p SrcOps.
	/// \param SrcNodes          Shuffle nodes already merged into \p RootMask.
	/// \param Depth             Current recursion depth (bails out at 8).
	/// \param HasVariableMask   True if a variable mask shuffle has already been
	///                          merged.
	/// \param AllowVariableMask Forwarded to the final chain combines; the
	///                          recursion clears it for operands with other users.
	/// \returns the replacement value, or an empty SDValue if nothing combined.
	static SDValue combineX86ShufflesRecursively(
	    ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
	    ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
	    bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
	    const X86Subtarget &Subtarget) {
	  assert(RootMask.size() > 0 &&
	         (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
	         "Illegal shuffle root mask");

	  // Bound the depth of our recursive combine because this is ultimately
	  // quadratic in nature.
	  const unsigned MaxRecursionDepth = 8;
	  if (Depth >= MaxRecursionDepth)
	    return SDValue();

	  // Directly rip through bitcasts to find the underlying operand.
	  SDValue Op = SrcOps[SrcOpIndex];
	  Op = peekThroughOneUseBitcasts(Op);

	  MVT VT = Op.getSimpleValueType();
	  if (!VT.isVector())
	    return SDValue(); // Bail if we hit a non-vector.

	  assert(Root.getSimpleValueType().isVector() &&
	         "Shuffles operate on vector types!");
	  unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
	  assert(VT.getSizeInBits() == RootSizeInBits &&
	         "Can only combine shuffles of the same vector register size.");

	  // Extract target shuffle mask and resolve sentinels and inputs.
	  // TODO - determine Op's demanded elts from RootMask.
	  SmallVector<int, 64> OpMask;
	  SmallVector<SDValue, 2> OpInputs;
	  APInt OpUndef, OpZero;
	  APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
	  bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
	  if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
	                              OpZero, DAG, Depth, false))
	    return SDValue();

	  // Shuffle inputs must be the same size as the result, bail on any larger
	  // inputs and widen any smaller inputs.
	  if (llvm::any_of(OpInputs, [RootSizeInBits](SDValue Op) {
	        return Op.getValueSizeInBits() > RootSizeInBits;
	      }))
	    return SDValue();

	  for (SDValue &Op : OpInputs)
	    if (Op.getValueSizeInBits() < RootSizeInBits)
	      Op = widenSubVector(peekThroughOneUseBitcasts(Op), false, Subtarget, DAG,
	                          SDLoc(Op), RootSizeInBits);

	  SmallVector<int, 64> Mask;
	  SmallVector<SDValue, 16> Ops;

	  // We don't need to merge masks if the root is empty.
	  bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
	  if (EmptyRoot) {
	    // Only resolve zeros if it will remove an input, otherwise we might end
	    // up in an infinite loop.
	    bool ResolveKnownZeros = true;
	    if (!OpZero.isNullValue()) {
	      APInt UsedInputs = APInt::getNullValue(OpInputs.size());
	      for (int i = 0, e = OpMask.size(); i != e; ++i) {
	        int M = OpMask[i];
	        if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
	          continue;
	        UsedInputs.setBit(M / OpMask.size());
	        if (UsedInputs.isAllOnesValue()) {
	          ResolveKnownZeros = false;
	          break;
	        }
	      }
	    }
	    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
	                                      ResolveKnownZeros);

	    Mask = OpMask;
	    Ops.append(OpInputs.begin(), OpInputs.end());
	  } else {
	    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);

	    // Add the inputs to the Ops list, avoiding duplicates.
	    Ops.append(SrcOps.begin(), SrcOps.end());

	    auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
	      // Attempt to find an existing match.
	      SDValue InputBC = peekThroughBitcasts(Input);
	      for (int i = 0, e = Ops.size(); i < e; ++i)
	        if (InputBC == peekThroughBitcasts(Ops[i]))
	          return i;
	      // Match failed - should we replace an existing Op?
	      if (InsertionPoint >= 0) {
	        Ops[InsertionPoint] = Input;
	        return InsertionPoint;
	      }
	      // Add to the end of the Ops list.
	      Ops.push_back(Input);
	      return Ops.size() - 1;
	    };

	    // The first input of Op takes over the slot of the operand being
	    // expanded; any further inputs are matched or appended.
	    SmallVector<int, 2> OpInputIdx;
	    for (SDValue OpInput : OpInputs)
	      OpInputIdx.push_back(
	          AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));

	    assert(((RootMask.size() > OpMask.size() &&
	             RootMask.size() % OpMask.size() == 0) ||
	            (OpMask.size() > RootMask.size() &&
	             OpMask.size() % RootMask.size() == 0) ||
	            OpMask.size() == RootMask.size()) &&
	           "The smaller number of elements must divide the larger.");

	    // This function can be performance-critical, so we rely on the power-of-2
	    // knowledge that we have about the mask sizes to replace div/rem ops with
	    // bit-masks and shifts.
	    assert(isPowerOf2_32(RootMask.size()) &&
	           "Non-power-of-2 shuffle mask sizes");
	    assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
	    unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
	    unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());

	    unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
	    unsigned RootRatio =
	        std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
	    unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
	    assert((RootRatio == 1 || OpRatio == 1) &&
	           "Must not have a ratio for both incoming and op masks!");

	    assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
	    assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
	    assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
	    unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
	    unsigned OpRatioLog2 = countTrailingZeros(OpRatio);

	    Mask.resize(MaskWidth, SM_SentinelUndef);

	    // Merge this shuffle operation's mask into our accumulated mask. Note that
	    // this shuffle's mask will be the first applied to the input, followed by
	    // the root mask to get us all the way to the root value arrangement. The
	    // reason for this order is that we are recursing up the operation chain.
	    for (unsigned i = 0; i < MaskWidth; ++i) {
	      unsigned RootIdx = i >> RootRatioLog2;
	      if (RootMask[RootIdx] < 0) {
	        // This is a zero or undef lane, we're done.
	        Mask[i] = RootMask[RootIdx];
	        continue;
	      }

	      unsigned RootMaskedIdx =
	          RootRatio == 1
	              ? RootMask[RootIdx]
	              : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

	      // Just insert the scaled root mask value if it references an input other
	      // than the SrcOp we're currently inserting.
	      if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
	          (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
	        Mask[i] = RootMaskedIdx;
	        continue;
	      }

	      RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
	      unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
	      if (OpMask[OpIdx] < 0) {
	        // The incoming lanes are zero or undef, it doesn't matter which ones we
	        // are using.
	        Mask[i] = OpMask[OpIdx];
	        continue;
	      }

	      // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
	      unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
	                                          : (OpMask[OpIdx] << OpRatioLog2) +
	                                                (RootMaskedIdx & (OpRatio - 1));

	      // Reduce to an element offset, then rebase onto the slot that this
	      // input occupies in Ops.
	      OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
	      int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
	      assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
	      OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;

	      Mask[i] = OpMaskedIdx;
	    }
	  }

	  // Remove unused/repeated shuffle source ops.
	  resolveTargetShuffleInputsAndMask(Ops, Mask);

	  // Handle the all undef/zero cases early.
	  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
	    return DAG.getUNDEF(Root.getValueType());

	  // TODO - should we handle the mixed zero/undef case as well? Just returning
	  // a zero mask will lose information on undef elements possibly reducing
	  // future combine possibilities.
	  if (all_of(Mask, [](int Idx) { return Idx < 0; }))
	    return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
	                         SDLoc(Root));

	  assert(!Ops.empty() && "Shuffle with no inputs detected");
	  HasVariableMask |= IsOpVariableMask;

	  // Update the list of shuffle nodes that have been combined so far.
	  SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
	                                                SrcNodes.end());
	  CombinedNodes.push_back(Op.getNode());

	  // See if we can recurse into each shuffle source op (if it's a target
	  // shuffle). The source op should only be generally combined if it either has
	  // a single use (i.e. current Op) or all its users have already been combined,
	  // if not then we can still combine but should prevent generation of variable
	  // shuffles to avoid constant pool bloat.
	  // Don't recurse if we already have more source ops than we can combine in
	  // the remaining recursion depth.
	  if (Ops.size() < (MaxRecursionDepth - Depth)) {
	    for (int i = 0, e = Ops.size(); i < e; ++i) {
	      // For empty roots, we need to resolve zeroable elements before combining
	      // them with other shuffles.
	      SmallVector<int, 64> ResolvedMask = Mask;
	      if (EmptyRoot)
	        resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
	      bool AllowVar = false;
	      if (Ops[i].getNode()->hasOneUse() ||
	          SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
	        AllowVar = AllowVariableMask;
	      if (SDValue Res = combineX86ShufflesRecursively(
	              Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
	              HasVariableMask, AllowVar, DAG, Subtarget))
	        return Res;
	    }
	  }

	  // Attempt to constant fold all of the constant source ops.
	  if (SDValue Cst = combineX86ShufflesConstants(
	          Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
	    return Cst;

	  // We can only combine unary and binary shuffle mask cases.
	  if (Ops.size() <= 2) {
	    // Minor canonicalization of the accumulated shuffle mask to make it easier
	    // to match below. All this does is detect masks with sequential pairs of
	    // elements, and shrink them to the half-width mask. It does this in a loop
	    // so it will reduce the size of the mask to the minimal width mask which
	    // performs an equivalent shuffle.
	    SmallVector<int, 64> WidenedMask;
	    while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
	      Mask = std::move(WidenedMask);
	    }

	    // Canonicalization of binary shuffle masks to improve pattern matching by
	    // commuting the inputs.
	    if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
	      ShuffleVectorSDNode::commuteMask(Mask);
	      std::swap(Ops[0], Ops[1]);
	    }

	    // Finally, try to combine into a single shuffle instruction.
	    return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
	                                  AllowVariableMask, DAG, Subtarget);
	  }

	  // If that failed and any input is extracted then try to combine as a
	  // shuffle with the larger type.
	  return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
	                                           HasVariableMask, AllowVariableMask,
	                                           DAG, Subtarget);
	}

	/// Helper entry wrapper to combineX86ShufflesRecursively.
	///
	/// Starts the recursive combine with \p Op as both the sole source operand and
	/// the root, a single-element identity root mask, no previously combined
	/// nodes, depth zero, and variable mask shuffles permitted.
	static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
	                                             const X86Subtarget &Subtarget) {
	  return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
	                                       /*HasVarMask*/ false,
	                                       /*AllowVarMask*/ true, DAG, Subtarget);
	}

	/// Get the PSHUF-style mask from PSHUF node.
	///
	/// A thin wrapper around getTargetShuffleMask that eases forming 4-element
	/// PSHUF-style masks which can be reused with such instructions. \p N must be
	/// a PSHUFD, PSHUFLW or PSHUFHW node; any other opcode is unreachable.
	static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
	  const MVT NodeVT = N.getSimpleValueType();
	  SmallVector<int, 4> PSHUFMask;
	  SmallVector<SDValue, 2> Inputs;
	  bool Unary;
	  bool Decoded = getTargetShuffleMask(N.getNode(), NodeVT, false, Inputs,
	                                      PSHUFMask, Unary);
	  (void)Decoded;
	  assert(Decoded);

	  // For vectors wider than 128 bits only the low 128-bit lane of the mask is
	  // kept; debug builds verify that the upper lanes repeat it first.
	  const unsigned VectorBits = NodeVT.getSizeInBits();
	  if (VectorBits > 128) {
	    int EltsPerLane = 128 / NodeVT.getScalarSizeInBits();
	#ifndef NDEBUG
	    int LaneCount = VectorBits / 128;
	    for (int Lane = 1; Lane < LaneCount; ++Lane) {
	      for (int Elt = 0; Elt < EltsPerLane; ++Elt) {
	        assert(PSHUFMask[Elt] ==
	                   PSHUFMask[Lane * EltsPerLane + Elt] - (EltsPerLane * Lane) &&
	               "Mask doesn't repeat in high 128-bit lanes!");
	      }
	    }
	#endif
	    PSHUFMask.resize(EltsPerLane);
	  }

	  const unsigned Opc = N.getOpcode();

	  // Dword shuffle: the (lane-reduced) mask is returned unchanged.
	  if (Opc == X86ISD::PSHUFD)
	    return PSHUFMask;

	  // Low-word shuffle: keep only the first four mask entries.
	  if (Opc == X86ISD::PSHUFLW) {
	    PSHUFMask.resize(4);
	    return PSHUFMask;
	  }

	  // High-word shuffle: drop the first four entries and rebase the remaining
	  // indices so they are relative to element 4.
	  if (Opc == X86ISD::PSHUFHW) {
	    PSHUFMask.erase(PSHUFMask.begin(), PSHUFMask.begin() + 4);
	    for (int &Idx : PSHUFMask)
	      Idx -= 4;
	    return PSHUFMask;
	  }

	  llvm_unreachable("No valid shuffle instruction found!");
	}

	/// Search for a combinable shuffle across a chain ending in pshufd.
	///
	/// We walk up the chain and look for a combinable shuffle, skipping over
	/// shuffles that we could hoist this shuffle's transformation past without
	/// altering anything.
	///
	/// \param N    The PSHUFD node at the bottom of the chain (asserted).
	/// \param Mask The 4-element mask of \p N. It is rewritten in place when a
	///             combinable shuffle is found (composed with that shuffle's
	///             mask).
	/// \returns a freshly built chain that replaces \p N, or an empty SDValue if
	///          nothing could be combined.
	static SDValue
	combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
	                             SelectionDAG &DAG) {
	  assert(N.getOpcode() == X86ISD::PSHUFD &&
	         "Called with something other than an x86 128-bit half shuffle!");
	  SDLoc DL(N);

	  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
	  // of the shuffles in the chain so that we can form a fresh chain to replace
	  // this one.
	  SmallVector<SDValue, 8> Chain;
	  SDValue V = N.getOperand(0);
	  for (; V.hasOneUse(); V = V.getOperand(0)) {
	    switch (V.getOpcode()) {
	    default:
	      return SDValue(); // Nothing combined!

	    case ISD::BITCAST:
	      // Skip bitcasts as we always know the type for the target specific
	      // instructions.
	      continue;

	    case X86ISD::PSHUFD:
	      // Found another dword shuffle.
	      break;

	    case X86ISD::PSHUFLW:
	      // Check that the low words (being shuffled) are the identity in the
	      // dword shuffle, and the high words are self-contained.
	      if (Mask[0] != 0 || Mask[1] != 1 ||
	          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
	        return SDValue();

	      Chain.push_back(V);
	      continue;

	    case X86ISD::PSHUFHW:
	      // Check that the high words (being shuffled) are the identity in the
	      // dword shuffle, and the low words are self-contained.
	      if (Mask[2] != 2 || Mask[3] != 3 ||
	          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
	        return SDValue();

	      Chain.push_back(V);
	      continue;

	    case X86ISD::UNPCKL:
	    case X86ISD::UNPCKH:
	      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
	      // shuffle into a preceding word shuffle.
	      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
	          V.getSimpleValueType().getVectorElementType() != MVT::i16)
	        return SDValue();

	      // Search for a half-shuffle which we can combine with.
	      unsigned CombineOp =
	          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
	      // Both unpack operands must be the same value, used only by this unpack.
	      if (V.getOperand(0) != V.getOperand(1) ||
	          !V->isOnlyUserOf(V.getOperand(0).getNode()))
	        return SDValue();
	      Chain.push_back(V);
	      V = V.getOperand(0);
	      do {
	        switch (V.getOpcode()) {
	        default:
	          return SDValue(); // Nothing to combine.

	        case X86ISD::PSHUFLW:
	        case X86ISD::PSHUFHW:
	          // The matching half-shuffle is the one we combine with; the other
	          // half-shuffle is recorded in the chain and stepped over.
	          if (V.getOpcode() == CombineOp)
	            break;

	          Chain.push_back(V);

	          LLVM_FALLTHROUGH;
	        case ISD::BITCAST:
	          V = V.getOperand(0);
	          continue;
	        }
	        break;
	      } while (V.hasOneUse());
	      break;
	    }
	    // Break out of the loop if we break out of the switch.
	    break;
	  }

	  if (!V.hasOneUse())
	    // We fell out of the loop without finding a viable combining instruction.
	    return SDValue();

	  // Merge this node's mask and our incoming mask.
	  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	  for (int &M : Mask)
	    M = VMask[M];
	  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
	                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	  // Rebuild the chain around this new shuffle.
	  while (!Chain.empty()) {
	    SDValue W = Chain.pop_back_val();

	    // Re-insert a bitcast wherever the recorded node consumed a different
	    // type than the value we currently have.
	    if (V.getValueType() != W.getOperand(0).getValueType())
	      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

	    switch (W.getOpcode()) {
	    default:
	      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

	    case X86ISD::UNPCKL:
	    case X86ISD::UNPCKH:
	      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
	      break;

	    case X86ISD::PSHUFD:
	    case X86ISD::PSHUFLW:
	    case X86ISD::PSHUFHW:
	      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
	      break;
	    }
	  }
	  if (V.getValueType() != N.getValueType())
	    V = DAG.getBitcast(N.getValueType(), V);

	  // Return the new chain to replace N.
	  return V;
	}

	// Attempt to commute shufps LHS loads:
	// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
	//
	// N is a VPERMILPI or SHUFP node of type VT whose operand(s) may be a SHUFP
	// used only by N. When that inner SHUFP has a foldable load as its first
	// operand and a non-foldable second operand, its operands are swapped and the
	// immediates of both nodes are adjusted to compensate. Only v4f32, v8f32 and
	// v16f32 are handled. Returns the rewritten node, or an empty SDValue if the
	// pattern does not match.
	static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
	                                      SelectionDAG &DAG) {
	  // TODO: Add vXf64 support.
	  if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
	    return SDValue();

	  // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
	  auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
	    if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
	      return SDValue();
	    SDValue N0 = V.getOperand(0);
	    SDValue N1 = V.getOperand(1);
	    unsigned Imm = V.getConstantOperandVal(2);
	    if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
	        MayFoldLoad(peekThroughOneUseBitcasts(N1)))
	      return SDValue();
	    // Swap the two nibbles of the immediate to go with the swapped operands.
	    Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
	    return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
	                       DAG.getTargetConstant(Imm, DL, MVT::i8));
	  };

	  // In the outer immediates below, each XOR mask flips the upper bit of the
	  // 2-bit selectors that read from the commuted SHUFP: all four selectors
	  // (0xAA), the low two (0x0A) or the high two (0xA0).
	  switch (N.getOpcode()) {
	  case X86ISD::VPERMILPI:
	    if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
	      unsigned Imm = N.getConstantOperandVal(1);
	      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
	                         DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
	    }
	    break;
	  case X86ISD::SHUFP: {
	    SDValue N0 = N.getOperand(0);
	    SDValue N1 = N.getOperand(1);
	    unsigned Imm = N.getConstantOperandVal(2);
	    if (N0 == N1) {
	      // Both operands are the same inner SHUFP, so every selector is affected.
	      if (SDValue NewSHUFP = commuteSHUFP(N, N0))
	        return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
	                           DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
	    } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
	      return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
	                         DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
	    } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
	      return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
	                         DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
	    }
	    break;
	  }
	  }

	  return SDValue();
	}

	/// Try to combine x86 target specific shuffles.
	static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDLoc DL(N);
	MVT VT = N.getSimpleValueType();
	SmallVector<int, 4> Mask;
	unsigned Opcode = N.getOpcode();

	bool IsUnary;
	SmallVector<int, 64> TargetMask;
	SmallVector<SDValue, 2> TargetOps;
	if (isTargetShuffle(Opcode))
	getTargetShuffleMask(N.getNode(), VT, true, TargetOps, TargetMask, IsUnary);

	// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
	// single instruction. Attempt to match a v2X64 repeating shuffle pattern that
	// represents the LHS/RHS inputs for the lower/upper halves.
	SmallVector<int, 16> TargetMask128;
	if (!TargetMask.empty() && 0 < TargetOps.size() && TargetOps.size() <= 2 &&
	isRepeatedTargetShuffleMask(128, VT, TargetMask, TargetMask128)) {
	SmallVector<int, 16> WidenedMask128 = TargetMask128;
	while (WidenedMask128.size() > 2) {
	SmallVector<int, 16> WidenedMask;
	if (!canWidenShuffleElements(WidenedMask128, WidenedMask))
	break;
	WidenedMask128 = std::move(WidenedMask);
	}
	if (WidenedMask128.size() == 2) {
	assert(isUndefOrZeroOrInRange(WidenedMask128, 0, 4) && "Illegal shuffle");
	SDValue BC0 = peekThroughBitcasts(TargetOps.front());
	SDValue BC1 = peekThroughBitcasts(TargetOps.back());
	EVT VT0 = BC0.getValueType();
	EVT VT1 = BC1.getValueType();
	unsigned Opcode0 = BC0.getOpcode();
	unsigned Opcode1 = BC1.getOpcode();
	bool isHoriz = (Opcode0 == X86ISD::FHADD \|\| Opcode0 == X86ISD::HADD \|\|
	Opcode0 == X86ISD::FHSUB \|\| Opcode0 == X86ISD::HSUB);
	if (Opcode0 == Opcode1 && VT0 == VT1 &&
	(isHoriz \|\| Opcode0 == X86ISD::PACKSS \|\| Opcode0 == X86ISD::PACKUS)) {
	bool SingleOp = (TargetOps.size() == 1);
	if (!isHoriz \|\| shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
	SDValue Lo = isInRange(WidenedMask128[0], 0, 2) ? BC0 : BC1;
	SDValue Hi = isInRange(WidenedMask128[1], 0, 2) ? BC0 : BC1;
	Lo = Lo.getOperand(WidenedMask128[0] & 1);
	Hi = Hi.getOperand(WidenedMask128[1] & 1);
	if (SingleOp) {
	MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
	SDValue Undef = DAG.getUNDEF(SrcVT);
	SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
	Lo = (WidenedMask128[0] == SM_SentinelZero ? Zero : Lo);
	Hi = (WidenedMask128[1] == SM_SentinelZero ? Zero : Hi);
	Lo = (WidenedMask128[0] == SM_SentinelUndef ? Undef : Lo);
	Hi = (WidenedMask128[1] == SM_SentinelUndef ? Undef : Hi);
	}
	SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
	return DAG.getBitcast(VT, Horiz);
	}
	}
	}
	}

	if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
	return R;

	// Canonicalize UNARYSHUFFLE(XOR(X,-1) -> XOR(UNARYSHUFFLE(X),-1) to
	// help expose the 'NOT' pattern further up the DAG.
	// TODO: This might be beneficial for any binop with a 'splattable' operand.
	switch (Opcode) {
	case X86ISD::MOVDDUP:
	case X86ISD::PSHUFD: {
	SDValue Src = N.getOperand(0);
	if (Src.hasOneUse() && Src.getValueType() == VT) {
	if (SDValue Not = IsNOT(Src, DAG, /OneUse/ true)) {
	Not = DAG.getBitcast(VT, Not);
	Not = Opcode == X86ISD::MOVDDUP
	? DAG.getNode(Opcode, DL, VT, Not)
	: DAG.getNode(Opcode, DL, VT, Not, N.getOperand(1));
	EVT IntVT = Not.getValueType().changeTypeToInteger();
	SDValue AllOnes = DAG.getConstant(-1, DL, IntVT);
	Not = DAG.getBitcast(IntVT, Not);
	Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes);
	return DAG.getBitcast(VT, Not);
	}
	}
	break;
	}
	}

	// Handle specific target shuffles.
	switch (Opcode) {
	case X86ISD::MOVDDUP: {
	SDValue Src = N.getOperand(0);
	// Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
	if (VT == MVT::v2f64 && Src.hasOneUse() &&
	ISD::isNormalLoad(Src.getNode())) {
	LoadSDNode *LN = cast<LoadSDNode>(Src);
	if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
	SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
	DCI.CombineTo(N.getNode(), Movddup);
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
	DCI.recursivelyDeleteUnusedNodes(LN);
	return N; // Return N so it doesn't get rechecked!
	}
	}

	return SDValue();
	}
	case X86ISD::VBROADCAST: {
	SDValue Src = N.getOperand(0);
	SDValue BC = peekThroughBitcasts(Src);
	EVT SrcVT = Src.getValueType();
	EVT BCVT = BC.getValueType();

	// If broadcasting from another shuffle, attempt to simplify it.
	// TODO - we really need a general SimplifyDemandedVectorElts mechanism.
	if (isTargetShuffle(BC.getOpcode()) &&
	VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
	unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
	SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
	SM_SentinelUndef);
	for (unsigned i = 0; i != Scale; ++i)
	DemandedMask[i] = i;
	if (SDValue Res = combineX86ShufflesRecursively(
	{BC}, 0, BC, DemandedMask, {}, /Depth/ 0,
	/HasVarMask/ false, /AllowVarMask/ true, DAG, Subtarget))
	return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
	DAG.getBitcast(SrcVT, Res));
	}

	// broadcast(bitcast(src)) -> bitcast(broadcast(src))
	// 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
	if (Src.getOpcode() == ISD::BITCAST &&
	SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
	DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
	EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
	VT.getVectorNumElements());
	return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
	}

	// Reduce broadcast source vector to lowest 128-bits.
	if (SrcVT.getSizeInBits() > 128)
	return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
	extract128BitVector(Src, 0, DAG, DL));

	// broadcast(scalar_to_vector(x)) -> broadcast(x).
	if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
	return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));

	// Share broadcast with the longest vector and extract low subvector (free).
	for (SDNode *User : Src->uses())
	if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
	User->getValueSizeInBits(0) > VT.getSizeInBits()) {
	return extractSubVector(SDValue(User, 0), 0, DAG, DL,
	VT.getSizeInBits());
	}

	// vbroadcast(scalarload X) -> vbroadcast_load X
	// For float loads, extract other uses of the scalar from the broadcast.
	if (!SrcVT.isVector() && (Src.hasOneUse() \|\| VT.isFloatingPoint()) &&
	ISD::isNormalLoad(Src.getNode())) {
	LoadSDNode *LN = cast<LoadSDNode>(Src);
	SDVTList Tys = DAG.getVTList(VT, MVT::Other);
	SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
	SDValue BcastLd =
	DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
	LN->getMemoryVT(), LN->getMemOperand());
	// If the load value is used only by N, replace it via CombineTo N.
	bool NoReplaceExtract = Src.hasOneUse();
	DCI.CombineTo(N.getNode(), BcastLd);
	if (NoReplaceExtract) {
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
	DCI.recursivelyDeleteUnusedNodes(LN);
	} else {
	SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
	DAG.getIntPtrConstant(0, DL));
	DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
	}
	return N; // Return N so it doesn't get rechecked!
	}

	// Due to isTypeDesirableForOp, we won't always shrink a load truncated to
	// i16. So shrink it ourselves if we can make a broadcast_load.
	if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
	Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
	assert(Subtarget.hasAVX2() && "Expected AVX2");
	SDValue TruncIn = Src.getOperand(0);

	// If this is a truncate of a non extending load we can just narrow it to
	// use a broadcast_load.
	if (ISD::isNormalLoad(TruncIn.getNode())) {
	LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
	// Unless its volatile or atomic.
	if (LN->isSimple()) {
	SDVTList Tys = DAG.getVTList(VT, MVT::Other);
	SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
	SDValue BcastLd = DAG.getMemIntrinsicNode(
	X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
	LN->getPointerInfo(), LN->getOriginalAlign(),
	LN->getMemOperand()->getFlags());
	DCI.CombineTo(N.getNode(), BcastLd);
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
	DCI.recursivelyDeleteUnusedNodes(Src.getNode());
	return N; // Return N so it doesn't get rechecked!
	}
	}

	// If this is a truncate of an i16 extload, we can directly replace it.
	if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
	ISD::isEXTLoad(Src.getOperand(0).getNode())) {
	LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
	if (LN->getMemoryVT().getSizeInBits() == 16) {
	SDVTList Tys = DAG.getVTList(VT, MVT::Other);
	SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
	SDValue BcastLd =
	DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
	LN->getMemoryVT(), LN->getMemOperand());
	DCI.CombineTo(N.getNode(), BcastLd);
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
	DCI.recursivelyDeleteUnusedNodes(Src.getNode());
	return N; // Return N so it doesn't get rechecked!
	}
	}

	// If this is a truncate of load that has been shifted right, we can
	// offset the pointer and use a narrower load.
	if (TruncIn.getOpcode() == ISD::SRL &&
	TruncIn.getOperand(0).hasOneUse() &&
	isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
	ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
	LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
	unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
	// Make sure the shift amount and the load size are divisible by 16.
	// Don't do this if the load is volatile or atomic.
	if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
	LN->isSimple()) {
	unsigned Offset = ShiftAmt / 8;
	SDVTList Tys = DAG.getVTList(VT, MVT::Other);
	SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), Offset, DL);
	SDValue Ops[] = { LN->getChain(), Ptr };
	SDValue BcastLd = DAG.getMemIntrinsicNode(
	X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
	LN->getPointerInfo().getWithOffset(Offset),
	LN->getOriginalAlign(),
	LN->getMemOperand()->getFlags());
	DCI.CombineTo(N.getNode(), BcastLd);
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
	DCI.recursivelyDeleteUnusedNodes(Src.getNode());
	return N; // Return N so it doesn't get rechecked!
	}
	}
	}

	// vbroadcast(vzload X) -> vbroadcast_load X
	if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
	MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
	if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
	SDVTList Tys = DAG.getVTList(VT, MVT::Other);
	SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
	SDValue BcastLd =
	DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
	LN->getMemoryVT(), LN->getMemOperand());
	DCI.CombineTo(N.getNode(), BcastLd);
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
	DCI.recursivelyDeleteUnusedNodes(LN);
	return N; // Return N so it doesn't get rechecked!
	}
	}

	// vbroadcast(vector load X) -> vbroadcast_load
	if (SrcVT == MVT::v2f64 && Src.hasOneUse() &&
	ISD::isNormalLoad(Src.getNode())) {
	LoadSDNode *LN = cast<LoadSDNode>(Src);
	// Unless the load is volatile or atomic.
	if (LN->isSimple()) {
	SDVTList Tys = DAG.getVTList(VT, MVT::Other);
	SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
	SDValue BcastLd = DAG.getMemIntrinsicNode(
	X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
	LN->getPointerInfo(), LN->getOriginalAlign(),
	LN->getMemOperand()->getFlags());
	DCI.CombineTo(N.getNode(), BcastLd);
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
	DCI.recursivelyDeleteUnusedNodes(LN);
	return N; // Return N so it doesn't get rechecked!
	}
	}

	return SDValue();
	}
	case X86ISD::VZEXT_MOVL: {
	SDValue N0 = N.getOperand(0);

	// If this a vzmovl of a full vector load, replace it with a vzload, unless
	// the load is volatile.
	if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
	auto *LN = cast<LoadSDNode>(N0);
	if (SDValue VZLoad =
	narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
	DCI.CombineTo(N.getNode(), VZLoad);
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
	DCI.recursivelyDeleteUnusedNodes(LN);
	return N;
	}
	}

	// If this a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
	// and can just use a VZEXT_LOAD.
	// FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
	if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
	auto *LN = cast<MemSDNode>(N0);
	if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
	SDVTList Tys = DAG.getVTList(VT, MVT::Other);
	SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
	SDValue VZLoad =
	DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
	LN->getMemoryVT(), LN->getMemOperand());
	DCI.CombineTo(N.getNode(), VZLoad);
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
	DCI.recursivelyDeleteUnusedNodes(LN);
	return N;
	}
	}

	// Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
	// (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
	// if the upper bits of the i64 are zero.
	if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	N0.getOperand(0).hasOneUse() &&
	N0.getOperand(0).getValueType() == MVT::i64) {
	SDValue In = N0.getOperand(0);
	APInt Mask = APInt::getHighBitsSet(64, 32);
	if (DAG.MaskedValueIsZero(In, Mask)) {
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
	MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
	SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
	SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
	return DAG.getBitcast(VT, Movl);
	}
	}

	// Load a scalar integer constant directly to XMM instead of transferring an
	// immediate value from GPR.
	// vzext_movl (scalar_to_vector C) --> load [C,0...]
	if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
	if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
	// Create a vector constant - scalar constant followed by zeros.
	EVT ScalarVT = N0.getOperand(0).getValueType();
	Type ScalarTy = ScalarVT.getTypeForEVT(DAG.getContext());
	unsigned NumElts = VT.getVectorNumElements();
	Constant *Zero = ConstantInt::getNullValue(ScalarTy);
	SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
	ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());

	// Load the vector constant from constant pool.
	MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
	SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
	MachinePointerInfo MPI =
	MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
	Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
	return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
	MachineMemOperand::MOLoad);
	}
	}

	return SDValue();
	}
	case X86ISD::BLENDI: {
	SDValue N0 = N.getOperand(0);
	SDValue N1 = N.getOperand(1);

	// blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
	// TODO: Handle MVT::v16i16 repeated blend mask.
	if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
	N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
	MVT SrcVT = N0.getOperand(0).getSimpleValueType();
	if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
	SrcVT.getScalarSizeInBits() >= 32) {
	unsigned BlendMask = N.getConstantOperandVal(2);
	unsigned Size = VT.getVectorNumElements();
	unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
	BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
	return DAG.getBitcast(
	VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
	N1.getOperand(0),
	DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
	}
	}
	return SDValue();
	}
	case X86ISD::VPERMI: {
	// vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
	// TODO: Remove when we have preferred domains in combineX86ShuffleChain.
	SDValue N0 = N.getOperand(0);
	SDValue N1 = N.getOperand(1);
	unsigned EltSizeInBits = VT.getScalarSizeInBits();
	if (N0.getOpcode() == ISD::BITCAST &&
	N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
	SDValue Src = N0.getOperand(0);
	EVT SrcVT = Src.getValueType();
	SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
	return DAG.getBitcast(VT, Res);
	}
	return SDValue();
	}
	case X86ISD::VPERM2X128: {
	// If both 128-bit values were inserted into high halves of 256-bit values,
	// the shuffle can be reduced to a concatenation of subvectors:
	// vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
	// Note: We are only looking for the exact high/high shuffle mask because we
	// expect to fold other similar patterns before creating this opcode.
	SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
	SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
	unsigned Imm = N.getConstantOperandVal(2);
	if (!(Imm == 0x31 &&
	Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
	Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&
	Ins0.getValueType() == Ins1.getValueType()))
	return SDValue();

	SDValue X = Ins0.getOperand(1);
	SDValue Y = Ins1.getOperand(1);
	unsigned C1 = Ins0.getConstantOperandVal(2);
	unsigned C2 = Ins1.getConstantOperandVal(2);
	MVT SrcVT = X.getSimpleValueType();
	unsigned SrcElts = SrcVT.getVectorNumElements();
	if (SrcVT != Y.getSimpleValueType() \|\| SrcVT.getSizeInBits() != 128 \|\|
	C1 != SrcElts \|\| C2 != SrcElts)
	return SDValue();

	return DAG.getBitcast(VT, DAG.getNode(ISD::CONCAT_VECTORS, DL,
	Ins1.getValueType(), X, Y));
	}
	case X86ISD::PSHUFD:
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	Mask = getPSHUFShuffleMask(N);
	assert(Mask.size() == 4);
	break;
	case X86ISD::MOVSD:
	case X86ISD::MOVSS: {
	SDValue N0 = N.getOperand(0);
	SDValue N1 = N.getOperand(1);

	// Canonicalize scalar FPOps:
	// MOVS(N0, OP(N0, N1)) --> MOVS(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
	// If commutable, allow OP(N1[0], N0[0]).
	unsigned Opcode1 = N1.getOpcode();
	if (Opcode1 == ISD::FADD \|\| Opcode1 == ISD::FMUL \|\| Opcode1 == ISD::FSUB \|\|
	Opcode1 == ISD::FDIV) {
	SDValue N10 = N1.getOperand(0);
	SDValue N11 = N1.getOperand(1);
	if (N10 == N0 \|\|
	(N11 == N0 && (Opcode1 == ISD::FADD \|\| Opcode1 == ISD::FMUL))) {
	if (N10 != N0)
	std::swap(N10, N11);
	MVT SVT = VT.getVectorElementType();
	SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
	N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
	N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
	SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
	SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
	return DAG.getNode(Opcode, DL, VT, N0, SclVec);
	}
	}

	return SDValue();
	}
	case X86ISD::INSERTPS: {
	assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
	SDValue Op0 = N.getOperand(0);
	SDValue Op1 = N.getOperand(1);
	unsigned InsertPSMask = N.getConstantOperandVal(2);
	unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
	unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
	unsigned ZeroMask = InsertPSMask & 0xF;

	// If we zero out all elements from Op0 then we don't need to reference it.
	if (((ZeroMask \| (1u << DstIdx)) == 0xF) && !Op0.isUndef())
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
	DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));

	// If we zero out the element from Op1 then we don't need to reference it.
	if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
	DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));

	// Attempt to merge insertps Op1 with an inner target shuffle node.
	SmallVector<int, 8> TargetMask1;
	SmallVector<SDValue, 2> Ops1;
	APInt KnownUndef1, KnownZero1;
	if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
	KnownZero1)) {
	if (KnownUndef1[SrcIdx] \|\| KnownZero1[SrcIdx]) {
	// Zero/UNDEF insertion - zero out element and remove dependency.
	InsertPSMask \|= (1u << DstIdx);
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
	DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
	}
	// Update insertps mask srcidx and reference the source input directly.
	int M = TargetMask1[SrcIdx];
	assert(0 <= M && M < 8 && "Shuffle index out of range");
	InsertPSMask = (InsertPSMask & 0x3f) \| ((M & 0x3) << 6);
	Op1 = Ops1[M < 4 ? 0 : 1];
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
	DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
	}

	// Attempt to merge insertps Op0 with an inner target shuffle node.
	SmallVector<int, 8> TargetMask0;
	SmallVector<SDValue, 2> Ops0;
	APInt KnownUndef0, KnownZero0;
	if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
	KnownZero0)) {
	bool Updated = false;
	bool UseInput00 = false;
	bool UseInput01 = false;
	for (int i = 0; i != 4; ++i) {
	if ((InsertPSMask & (1u << i)) \|\| (i == (int)DstIdx)) {
	// No change if element is already zero or the inserted element.
	continue;
	} else if (KnownUndef0[i] \|\| KnownZero0[i]) {
	// If the target mask is undef/zero then we must zero the element.
	InsertPSMask \|= (1u << i);
	Updated = true;
	continue;
	}

	// The input vector element must be inline.
	int M = TargetMask0[i];
	if (M != i && M != (i + 4))
	return SDValue();

	// Determine which inputs of the target shuffle we're using.
	UseInput00 \|= (0 <= M && M < 4);
	UseInput01 \|= (4 <= M);
	}

	// If we're not using both inputs of the target shuffle then use the
	// referenced input directly.
	if (UseInput00 && !UseInput01) {
	Updated = true;
	Op0 = Ops0[0];
	} else if (!UseInput00 && UseInput01) {
	Updated = true;
	Op0 = Ops0[1];
	}

	if (Updated)
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
	DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
	}

	// If we're inserting an element from a vbroadcast load, fold the
	// load into the X86insertps instruction. We need to convert the scalar
	// load to a vector and clear the source lane of the INSERTPS control.
	if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
	auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
	if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
	SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
	MemIntr->getBasePtr(),
	MemIntr->getMemOperand());
	SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
	DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
	Load),
	DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
	DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
	return Insert;
	}
	}

	return SDValue();
	}
	default:
	return SDValue();
	}

	// Nuke no-op shuffles that show up after combining.
	if (isNoopShuffleMask(Mask))
	return N.getOperand(0);

	// Look for simplifications involving one or two shuffle instructions.
	SDValue V = N.getOperand(0);
	switch (N.getOpcode()) {
	default:
	break;
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

	// See if this reduces to a PSHUFD which is no more expensive and can
	// combine with more operations. Note that it has to at least flip the
	// dwords as otherwise it would have been removed as a no-op.
	if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
	int DMask[] = {0, 1, 2, 3};
	int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
	DMask[DOffset + 0] = DOffset + 1;
	DMask[DOffset + 1] = DOffset + 0;
	MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
	V = DAG.getBitcast(DVT, V);
	V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
	getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
	return DAG.getBitcast(VT, V);
	}

	// Look for shuffle patterns which can be implemented as a single unpack.
	// FIXME: This doesn't handle the location of the PSHUFD generically, and
	// only works when we have a PSHUFD followed by two half-shuffles.
	if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
	(V.getOpcode() == X86ISD::PSHUFLW \|\|
	V.getOpcode() == X86ISD::PSHUFHW) &&
	V.getOpcode() != N.getOpcode() &&
	V.hasOneUse() && V.getOperand(0).hasOneUse()) {
	SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
	if (D.getOpcode() == X86ISD::PSHUFD) {
	SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
	int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
	int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
	int WordMask[8];
	for (int i = 0; i < 4; ++i) {
	WordMask[i + NOffset] = Mask[i] + NOffset;
	WordMask[i + VOffset] = VMask[i] + VOffset;
	}
	// Map the word mask through the DWord mask.
	int MappedMask[8];
	for (int i = 0; i < 8; ++i)
	MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
	if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) \|\|
	makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
	// We can replace all three shuffles with an unpack.
	V = DAG.getBitcast(VT, D.getOperand(0));
	return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
	: X86ISD::UNPCKH,
	DL, VT, V, V);
	}
	}
	}

	break;

	case X86ISD::PSHUFD:
	if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
	return NewN;

	break;
	}

	return SDValue();
	}

	/// Returns true when \p Mask takes every lane from the same lane position of
	/// one of the two shuffle inputs, with all even lanes drawn from one input and
	/// all odd lanes drawn from the other.
	/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> both qualify.
	///
	/// Negative mask entries are skipped. On success \p Op0Even is set to true if
	/// the even lanes come from the first input; on failure it is left untouched.
	static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
	  const unsigned NumLanes = Mask.size();

	  // Which input (0 or 1) feeds the even / odd lanes; -1 while still unknown.
	  int EvenInput = -1;
	  int OddInput = -1;

	  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
	    const int Idx = Mask[Lane];
	    if (Idx < 0)
	      continue;

	    // The lane must read the matching lane of whichever input it selects.
	    if ((Idx % NumLanes) != Lane)
	      return false;

	    // All lanes of one parity have to agree on the input they read from.
	    int &ParityInput = (Lane % 2) ? OddInput : EvenInput;
	    const int Input = Idx / NumLanes;
	    if (ParityInput >= 0 && ParityInput != Input)
	      return false;
	    ParityInput = Input;
	  }

	  // Both parities must have been seen, and they must use different inputs.
	  if (EvenInput < 0 || OddInput < 0 || EvenInput == OddInput)
	    return false;

	  Op0Even = EvenInput == 0;
	  return true;
	}

	/// Determines whether the shuffle node \p N blends an FADD and an FSUB of the
	/// same two operands in the alternating-lane pattern of an ADDSUB (or SUBADD)
	/// operation.
	///
	/// On success returns true, writes the FSUB's operands (in FSUB order) to
	/// \p Opnd0 and \p Opnd1, and sets \p IsSubAdd to true when the even lanes are
	/// taken from the FADD. On failure none of the output parameters is written.
	///
	/// The match is performed directly on the target-independent vector shuffle
	/// node so that it is easy to recognize generically.
	static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
	                             bool &IsSubAdd) {
	  // Requires SSE3 and a legal floating-point vector type.
	  EVT VT = N->getValueType(0);
	  if (!Subtarget.hasSSE3())
	    return false;
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return false;
	  if (!VT.getSimpleVT().isFloatingPoint())
	    return false;

	  // We only handle target-independent shuffles.
	  // FIXME: It would be easy and harmless to use the target shuffle mask
	  // extraction tool to support more.
	  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
	    return false;

	  SDValue V1 = N->getOperand(0);
	  SDValue V2 = N->getOperand(1);

	  // One input has to be an FADD and the other an FSUB.
	  auto IsFAddOrFSub = [](unsigned Opc) {
	    return Opc == ISD::FADD || Opc == ISD::FSUB;
	  };
	  if (!IsFAddOrFSub(V1.getOpcode()) || !IsFAddOrFSub(V2.getOpcode()) ||
	      V1.getOpcode() == V2.getOpcode())
	    return false;

	  // Extra users of either arithmetic node would keep it alive after the fold.
	  if (!V1->hasOneUse() || !V2->hasOneUse())
	    return false;

	  // Take the operand order from the FSUB (it is not commutative) and accept
	  // the FADD with its operands in either order.
	  const bool V1IsSub = V1.getOpcode() == ISD::FSUB;
	  SDValue Sub = V1IsSub ? V1 : V2;
	  SDValue Add = V1IsSub ? V2 : V1;
	  assert(Sub.getOpcode() == ISD::FSUB && "Unexpected opcode");

	  SDValue LHS = Sub->getOperand(0);
	  SDValue RHS = Sub->getOperand(1);
	  const bool SameOrder =
	      Add->getOperand(0) == LHS && Add->getOperand(1) == RHS;
	  const bool Commuted =
	      Add->getOperand(0) == RHS && Add->getOperand(1) == LHS;
	  if (!SameOrder && !Commuted)
	    return false;

	  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
	  bool Op0Even;
	  if (!isAddSubOrSubAddMask(Mask, Op0Even))
	    return false;

	  // It's a subadd if the vector in the even parity is an FADD.
	  SDValue EvenSrc = Op0Even ? V1 : V2;
	  IsSubAdd = EvenSrc->getOpcode() == ISD::FADD;

	  Opnd0 = LHS;
	  Opnd1 = RHS;
	  return true;
	}

	/// Fold a shuffle that alternates lanes between fma(a, b, c) and
	/// X86ISD::FMSUB(a, b, c) into a single X86ISD::FMADDSUB or X86ISD::FMSUBADD
	/// node. Returns an empty SDValue when the pattern is not matched.
	static SDValue combineShuffleToFMAddSub(SDNode *N,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  // We only handle target-independent shuffles.
	  // FIXME: It would be easy and harmless to use the target shuffle mask
	  // extraction tool to support more.
	  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
	    return SDValue();

	  MVT VT = N->getSimpleValueType(0);
	  if (!Subtarget.hasAnyFMA())
	    return SDValue();
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  // The FMA and the FMSUB may appear as either shuffle input; if the second
	  // input is not the FMSUB, assume the first one is and verify below.
	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);
	  const bool SubIsOp1 = Op1.getOpcode() == X86ISD::FMSUB;
	  SDValue FMAdd = SubIsOp1 ? Op0 : Op1;
	  SDValue FMSub = SubIsOp1 ? Op1 : Op0;

	  if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB)
	    return SDValue();
	  if (!FMAdd.hasOneUse() || !FMSub.hasOneUse())
	    return SDValue();

	  // Both nodes must compute on the same (a, b, c) triple.
	  for (unsigned OpIdx = 0; OpIdx != 3; ++OpIdx)
	    if (FMAdd.getOperand(OpIdx) != FMSub.getOperand(OpIdx))
	      return SDValue();

	  // Check for correct shuffle mask.
	  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
	  bool Op0Even;
	  if (!isAddSubOrSubAddMask(Mask, Op0Even))
	    return SDValue();

	  // When the even lanes come from the FMA the result is FMSUBADD, otherwise
	  // it is FMADDSUB.
	  SDLoc DL(N);
	  SDValue EvenSrc = Op0Even ? Op0 : Op1;
	  const bool IsSubAdd = EvenSrc == FMAdd;
	  const unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
	  return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
	                     FMAdd.getOperand(2));
	}

	/// Try to turn a shuffle into a target-specific add-sub node
	/// (X86ISD::ADDSUB) or fused multiply add-sub node (X86ISD::FMADDSUB /
	/// X86ISD::FMSUBADD). Returns an empty SDValue if no such node can be formed.
	static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
	                                                const X86Subtarget &Subtarget,
	                                                SelectionDAG &DAG) {
	  // First preference: a shuffle of an FMA with an FMSUB.
	  if (SDValue FMAddSub = combineShuffleToFMAddSub(N, Subtarget, DAG))
	    return FMAddSub;

	  // Otherwise look for a shuffle of an FADD with an FSUB.
	  SDValue Opnd0, Opnd1;
	  bool IsSubAdd;
	  if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
	    return SDValue();

	  MVT VT = N->getSimpleValueType(0);
	  SDLoc DL(N);

	  // Try to generate X86ISD::FMADDSUB node here.
	  SDValue Opnd2;
	  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
	    return DAG.getNode(IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB, DL, VT,
	                       Opnd0, Opnd1, Opnd2);

	  // Only a plain ADDSUB is left to try. The sub-add form is not emitted here,
	  // and X86ISD::ADDSUB is not generated for 512-bit types even though the
	  // idiom was recognized: there are no known X86 targets with 512-bit ADDSUB
	  // instructions!
	  if (IsSubAdd || VT.is512BitVector())
	    return SDValue();

	  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
	}

	// Rewrite
	//   shuffle (concat_vectors t1, undef), (concat_vectors t2, undef), mask
	// as a single-source shuffle of (concat_vectors t1, t2). Both sources must be
	// concatenations with undef whose defined part is half the width of the
	// output. AVX2 has VPERMD/Q, so a single-source shuffle is preferable.
	// Returns an empty SDValue when the pattern or the subtarget does not match.
	static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
	                                           const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasAVX2())
	    return SDValue();
	  auto *SVOp = dyn_cast<ShuffleVectorSDNode>(N);
	  if (!SVOp)
	    return SDValue();

	  EVT VT = N->getValueType(0);

	  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
	  if (!VT.is128BitVector() && !VT.is256BitVector())
	    return SDValue();

	  EVT EltVT = VT.getVectorElementType();
	  if (EltVT != MVT::i32 && EltVT != MVT::i64 && EltVT != MVT::f32 &&
	      EltVT != MVT::f64)
	    return SDValue();

	  // Check that both sources are two-operand concats whose upper part is
	  // undef.
	  auto IsConcatWithUndefUpper = [](SDValue V) {
	    return V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2 &&
	           V.getOperand(1).isUndef();
	  };
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  if (!IsConcatWithUndefUpper(N0) || !IsConcatWithUndefUpper(N1))
	    return SDValue();

	  // Build the new mask: indices into the first source keep their value, while
	  // indices into the second source move down by half a vector because the
	  // undef half between the two defined halves disappears.
	  const int NumElts = VT.getVectorNumElements();
	  SmallVector<int, 8> NewMask;
	  for (int Idx : SVOp->getMask()) {
	    if (Idx >= NumElts)
	      Idx -= NumElts / 2;
	    NewMask.push_back(Idx);
	  }

	  SDLoc DL(N);
	  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
	                               N1.getOperand(0));
	  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), NewMask);
	}

	/// Eliminate a redundant shuffle of a horizontal math op.
	///
	/// \p N is considered only if it is an X86ISD::MOVDDUP, an X86ISD::VBROADCAST,
	/// or an ISD::VECTOR_SHUFFLE whose second operand is undef, and if its (possibly
	/// peeked-through) first operand is one of X86ISD::HADD, FHADD, HSUB or FHSUB.
	/// When the shuffle merely replicates halves of the horizontal op that are
	/// already identical, the horizontal op itself is returned (rebuilt with its
	/// defined operand duplicated if the other operand is undef). Otherwise an
	/// empty SDValue is returned.
	static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
	unsigned Opcode = N->getOpcode();
	// Reject everything except MOVDDUP, VBROADCAST and unary generic shuffles.
	if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
	if (Opcode != ISD::VECTOR_SHUFFLE \|\| !N->getOperand(1).isUndef())
	return SDValue();

	// For a broadcast, peek through an extract element of index 0 to find the
	// horizontal op: broadcast (ext_vec_elt HOp, 0)
	EVT VT = N->getValueType(0);
	if (Opcode == X86ISD::VBROADCAST) {
	SDValue SrcOp = N->getOperand(0);
	if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	SrcOp.getValueType() == MVT::f64 &&
	SrcOp.getOperand(0).getValueType() == VT &&
	isNullConstant(SrcOp.getOperand(1)))
	// From here on N refers to the extract, so N->getOperand(0) below is the
	// vector being extracted from. Opcode and VT still describe the broadcast.
	N = SrcOp.getNode();
	}

	SDValue HOp = N->getOperand(0);
	if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
	HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
	return SDValue();

	// 128-bit horizontal math instructions are defined to operate on adjacent
	// lanes of each operand as:
	// v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
	// ...similarly for v2f64 and v8i16.
	// Give up when both operands are defined and differ from each other: the two
	// halves of the result are then not known to be equal.
	if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
	HOp.getOperand(0) != HOp.getOperand(1))
	return SDValue();

	// The shuffle that we are eliminating may have allowed the horizontal op to
	// have an undemanded (undefined) operand. Duplicate the other (defined)
	// operand to ensure that the results are defined across all lanes without the
	// shuffle.
	// Returns HorizOp unchanged when neither of its operands is undef.
	auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) {
	SDValue X;
	if (HorizOp.getOperand(0).isUndef()) {
	assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op");
	X = HorizOp.getOperand(1);
	} else if (HorizOp.getOperand(1).isUndef()) {
	assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op");
	X = HorizOp.getOperand(0);
	} else {
	return HorizOp;
	}
	return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp),
	HorizOp.getValueType(), X, X);
	};

	// When the operands of a horizontal math op are identical, the low half of
	// the result is the same as the high half. If a target shuffle is also
	// replicating low and high halves (and without changing the type/length of
	// the vector), we don't need the shuffle.
	if (Opcode == X86ISD::MOVDDUP \|\| Opcode == X86ISD::VBROADCAST) {
	if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
	// movddup (hadd X, X) --> hadd X, X
	// broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
	assert((HOp.getValueType() == MVT::v2f64 \|\|
	HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op");
	return updateHOp(HOp, DAG);
	}
	return SDValue();
	}

	// Only the generic ISD::VECTOR_SHUFFLE case reaches this point, so N is
	// still the original shuffle node and the cast below is valid.
	// shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
	ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
	// TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
	// but this should be tied to whatever horizontal op matching and shuffle
	// canonicalization are producing.
	// 128-bit h-op: the mask repeats the low half of the vector in both halves.
	if (HOp.getValueSizeInBits() == 128 &&
	(isTargetShuffleEquivalent(Mask, {0, 0}) \|\|
	isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) \|\|
	isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
	return updateHOp(HOp, DAG);

	// 256-bit h-op: the same repetition, applied separately within each 128-bit
	// half of the vector.
	if (HOp.getValueSizeInBits() == 256 &&
	(isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) \|\|
	isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) \|\|
	isTargetShuffleEquivalent(
	Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
	return updateHOp(HOp, DAG);

	return SDValue();
	}

	/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
	/// low half of each source vector and does not set any high half elements in
	/// the destination vector, narrow the shuffle to half its original size.
	///
	/// \param Shuf the shuffle node to narrow.
	/// \param DAG  the DAG in which the replacement nodes are created.
	/// \returns the replacement produced by getShuffleHalfVectors, or an empty
	///          SDValue when the shuffle's type is not a simple 256/512-bit vector
	///          or its mask does not have the required shape.
	static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
	  // getSimpleValueType() below requires a simple type, so test that first.
	  if (!Shuf->getValueType(0).isSimple())
	    return SDValue();
	  MVT VT = Shuf->getSimpleValueType(0);
	  if (!VT.is256BitVector() && !VT.is512BitVector())
	    return SDValue();

	  // See if we can ignore all of the high elements of the shuffle.
	  ArrayRef<int> Mask = Shuf->getMask();
	  if (!isUndefUpperHalf(Mask))
	    return SDValue();

	  // Check if the shuffle mask accesses only the low half of each input vector
	  // (half-index output is 0 or 2).
	  int HalfIdx1, HalfIdx2;
	  SmallVector<int, 8> HalfMask(Mask.size() / 2);
	  if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
	      (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
	    return SDValue();

	  // Create a half-width shuffle to replace the unnecessarily wide shuffle.
	  // The trick is knowing that all of the insert/extract are actually free
	  // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
	  // of narrow inputs into a narrow output, and that is always cheaper than
	  // the wide shuffle that we started with.
	  return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
	                               Shuf->getOperand(1), HalfMask, HalfIdx1,
	                               HalfIdx2, false, DAG, /*UseConcat*/true);
	}

	/// Top-level combine entry point for shuffle nodes.
	///
	/// Tries the following in order and returns the first result obtained:
	///   1. narrowing a generic wide shuffle to half width (narrowShuffle);
	///   2. for legal types, fusing into ADDSUB/FMADDSUB/FMSUBADD and removing a
	///      redundant shuffle of a horizontal op;
	///   3. combining into a consecutive vector load/broadcast;
	///   4. merging two concat-with-undef sources into one (AVX2);
	///   5. for target shuffles, the opcode-specific combines, the recursive x86
	///      shuffle combine, and demanded-elements simplification of the operands;
	///   6. pulling an insert-into-undef through X86ISD::VZEXT_MOVL.
	///
	/// \returns the replacement value, SDValue(N, 0) when the node's operands were
	///          simplified in place in step 5, or an empty SDValue if nothing
	///          applied.
	static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	// Only generic ISD::VECTOR_SHUFFLE nodes (ShuffleVectorSDNode) are narrowed.
	if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
	if (SDValue V = narrowShuffle(Shuf, DAG))
	return V;

	// If we have legalized the vector types, look for blends of FADD and FSUB
	// nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
	SDLoc dl(N);
	EVT VT = N->getValueType(0);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (TLI.isTypeLegal(VT)) {
	if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
	return AddSub;

	if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
	return HAddSub;
	}

	// Attempt to combine into a vector load/broadcast.
	if (SDValue LD = combineToConsecutiveLoads(VT, SDValue(N, 0), dl, DAG,
	Subtarget, true))
	return LD;

	// For AVX2, we sometimes want to combine
	// (vector_shuffle <mask> (concat_vectors t1, undef)
	// (concat_vectors t2, undef))
	// Into:
	// (vector_shuffle <mask> (concat_vectors t1, t2), undef)
	// Since the latter can be efficiently lowered with VPERMD/VPERMQ
	if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
	return ShufConcat;

	if (isTargetShuffle(N->getOpcode())) {
	SDValue Op(N, 0);
	if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
	return Shuffle;

	// Try recursively combining arbitrary sequences of x86 shuffle
	// instructions into higher-order shuffles. We do this after combining
	// specific PSHUF instruction sequences into their minimal form so that we
	// can evaluate how many specialized shuffle instructions are involved in
	// a particular chain.
	if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
	return Res;

	// Simplify source operands based on shuffle mask.
	// TODO - merge this into combineX86ShufflesRecursively.
	// All result elements are treated as demanded; a true return means something
	// was simplified, which is reported by returning N itself.
	APInt KnownUndef, KnownZero;
	APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
	if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
	return SDValue(N, 0);
	}

	// Pull subvector inserts into undef through VZEXT_MOVL by making it an
	// insert into a zero vector. This helps get VZEXT_MOVL closer to
	// scalar_to_vectors where 256/512 are canonicalized to an insert and a
	// 128-bit scalar_to_vector. This reduces the number of isel patterns.
	if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
	N->getOperand(0).hasOneUse()) {
	SDValue V = peekThroughOneUseBitcasts(N->getOperand(0));

	// Match insert_subvector(undef, In, 0).
	if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
	V.getOperand(0).isUndef() && isNullConstant(V.getOperand(2))) {
	SDValue In = V.getOperand(1);
	// Subvector type with VT's element type and the same total size as In.
	MVT SubVT =
	MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
	In.getValueSizeInBits() / VT.getScalarSizeInBits());
	In = DAG.getBitcast(SubVT, In);
	SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, SubVT, In);
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
	getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
	Movl, V.getOperand(2));
	}
	}

	return SDValue();
	}

	// Simplify the variable shuffle mask operand (operand number MaskIndex of Op)
	// of a target shuffle, given which result elements are demanded.
	// Returns true if the mask was simplified or replaced, false otherwise.
	// TODO: Handle DemandedBits in mask indices as well?
	bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
	    SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
	    TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
	  // With every element demanded there is nothing in the mask to relax.
	  if (DemandedElts.isAllOnesValue())
	    return false;
	  const unsigned NumElts = DemandedElts.getBitWidth();

	  // The mask is rewritten in place, so it must not have other users.
	  SDValue Mask = Op.getOperand(MaskIndex);
	  if (!Mask.hasOneUse())
	    return false;

	  // Attempt to generically simplify the variable shuffle mask.
	  APInt MaskUndef, MaskZero;
	  if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
	                                 Depth + 1))
	    return true;

	  // Attempt to extract+simplify a (constant pool load) shuffle mask.
	  // TODO: Support other types from getTargetShuffleMaskIndices?
	  SDValue BC = peekThroughOneUseBitcasts(Mask);
	  EVT BCVT = BC.getValueType();
	  auto *MaskLoad = dyn_cast<LoadSDNode>(BC);
	  if (!MaskLoad)
	    return false;

	  const Constant *MaskCst = getTargetConstantFromNode(MaskLoad);
	  if (!MaskCst)
	    return false;

	  // The constant has to be a vector exactly as wide as the mask value.
	  Type *CstTy = MaskCst->getType();
	  if (!CstTy->isVectorTy())
	    return false;
	  if (CstTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
	    return false;

	  // Handle scaling for i64 elements on 32-bit targets: the constant may have
	  // either one or two elements per mask element.
	  const unsigned NumCstElts = cast<FixedVectorType>(CstTy)->getNumElements();
	  if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
	    return false;
	  const unsigned Scale = NumCstElts / NumElts;

	  // Rebuild the constant, turning every element that belongs to an undemanded
	  // mask element and is not already undef into undef.
	  bool Changed = false;
	  SmallVector<Constant *, 32> NewCstElts;
	  for (unsigned CstIdx = 0; CstIdx != NumCstElts; ++CstIdx) {
	    Constant *Elt = MaskCst->getAggregateElement(CstIdx);
	    if (!DemandedElts[CstIdx / Scale] && !isa<UndefValue>(Elt)) {
	      Elt = UndefValue::get(Elt->getType());
	      Changed = true;
	    }
	    NewCstElts.push_back(Elt);
	  }
	  if (!Changed)
	    return false;

	  // Generate new constant pool entry + legalize immediately for the load.
	  SelectionDAG &DAG = TLO.DAG;
	  SDLoc DL(Op);
	  SDValue CV = DAG.getConstantPool(ConstantVector::get(NewCstElts), BCVT);
	  SDValue LegalCV = LowerConstantPool(CV, DAG);
	  SDValue NewMask =
	      DAG.getLoad(BCVT, DL, DAG.getEntryNode(), LegalCV,
	                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	                  MaskLoad->getAlign());
	  return TLO.CombineTo(Mask, DAG.getBitcast(Mask.getValueType(), NewMask));
	}

	// Target hook: simplify the X86-specific node Op given that only the vector
	// elements set in DemandedElts are used.
	//
	// The function works in three stages:
	//  1. a per-opcode switch that simplifies operands and, for some opcodes,
	//     fills in KnownUndef/KnownZero;
	//  2. narrowing of 256/512-bit ops whose upper half (or upper three quarters)
	//     is not demanded;
	//  3. treating Op as a target/faux shuffle and simplifying its mask/inputs.
	//
	// Returns true when a recursive simplification reported a change, or the
	// result of TLO.CombineTo when Op itself is replaced; otherwise false.
	bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
	SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
	TargetLoweringOpt &TLO, unsigned Depth) const {
	int NumElts = DemandedElts.getBitWidth();
	unsigned Opc = Op.getOpcode();
	EVT VT = Op.getValueType();

	// Handle special case opcodes.
	switch (Opc) {
	case X86ISD::PMULDQ:
	case X86ISD::PMULUDQ: {
	// Both multiplicands are simplified against the same demanded elements.
	APInt LHSUndef, LHSZero;
	APInt RHSUndef, RHSZero;
	SDValue LHS = Op.getOperand(0);
	SDValue RHS = Op.getOperand(1);
	if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
	Depth + 1))
	return true;
	if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
	Depth + 1))
	return true;
	// Multiply by zero.
	KnownZero = LHSZero \| RHSZero;
	break;
	}
	case X86ISD::VSHL:
	case X86ISD::VSRL:
	case X86ISD::VSRA: {
	// We only need the bottom 64-bits of the (128-bit) shift amount.
	SDValue Amt = Op.getOperand(1);
	MVT AmtVT = Amt.getSimpleValueType();
	assert(AmtVT.is128BitVector() && "Unexpected value type");

	// If the shift amount is only ever used as the amount operand of sse
	// shifts then we know that only its bottom 64-bits are ever used.
	bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
	unsigned UseOpc = Use->getOpcode();
	return (UseOpc == X86ISD::VSHL \|\| UseOpc == X86ISD::VSRL \|\|
	UseOpc == X86ISD::VSRA) &&
	Use->getOperand(0) != Amt;
	});

	// Demand only the low half of the amount vector's elements.
	APInt AmtUndef, AmtZero;
	unsigned NumAmtElts = AmtVT.getVectorNumElements();
	APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
	if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
	Depth + 1, AssumeSingleUse))
	return true;
	// Continue into the shift-by-immediate case to simplify the shifted source.
	LLVM_FALLTHROUGH;
	}
	case X86ISD::VSHLI:
	case X86ISD::VSRLI:
	case X86ISD::VSRAI: {
	SDValue Src = Op.getOperand(0);
	APInt SrcUndef;
	if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
	Depth + 1))
	return true;
	// TODO convert SrcUndef to KnownUndef.
	break;
	}
	case X86ISD::KSHIFTL: {
	SDValue Src = Op.getOperand(0);
	auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
	assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
	unsigned ShiftAmt = Amt->getZExtValue();

	// A shift by zero is replaced by its source.
	if (ShiftAmt == 0)
	return TLO.CombineTo(Op, Src);

	// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
	// single shift. We can do this if the bottom bits (which are shifted
	// out) are never demanded.
	if (Src.getOpcode() == X86ISD::KSHIFTR) {
	if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
	unsigned C1 = Src.getConstantOperandVal(1);
	unsigned NewOpc = X86ISD::KSHIFTL;
	// A negative difference means the net shift goes the other way.
	int Diff = ShiftAmt - C1;
	if (Diff < 0) {
	Diff = -Diff;
	NewOpc = X86ISD::KSHIFTR;
	}

	SDLoc dl(Op);
	SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
	return TLO.CombineTo(
	Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
	}
	}

	// The source is queried with the demanded mask shifted right by ShiftAmt.
	APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
	if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
	Depth + 1))
	return true;

	// Shift the known masks back left; the vacated low elements are zero.
	KnownUndef <<= ShiftAmt;
	KnownZero <<= ShiftAmt;
	KnownZero.setLowBits(ShiftAmt);
	break;
	}
	case X86ISD::KSHIFTR: {
	SDValue Src = Op.getOperand(0);
	auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
	assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
	unsigned ShiftAmt = Amt->getZExtValue();

	// A shift by zero is replaced by its source.
	if (ShiftAmt == 0)
	return TLO.CombineTo(Op, Src);

	// If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
	// single shift. We can do this if the top bits (which are shifted
	// out) are never demanded.
	if (Src.getOpcode() == X86ISD::KSHIFTL) {
	if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
	unsigned C1 = Src.getConstantOperandVal(1);
	unsigned NewOpc = X86ISD::KSHIFTR;
	// A negative difference means the net shift goes the other way.
	int Diff = ShiftAmt - C1;
	if (Diff < 0) {
	Diff = -Diff;
	NewOpc = X86ISD::KSHIFTL;
	}

	SDLoc dl(Op);
	SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
	return TLO.CombineTo(
	Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
	}
	}

	// The source is queried with the demanded mask shifted left by ShiftAmt.
	APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
	if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
	Depth + 1))
	return true;

	// Shift the known masks back right; the vacated high elements are zero.
	KnownUndef.lshrInPlace(ShiftAmt);
	KnownZero.lshrInPlace(ShiftAmt);
	KnownZero.setHighBits(ShiftAmt);
	break;
	}
	case X86ISD::CVTSI2P:
	case X86ISD::CVTUI2P: {
	SDValue Src = Op.getOperand(0);
	MVT SrcVT = Src.getSimpleValueType();
	APInt SrcUndef, SrcZero;
	// Resize the demanded mask to the source's element count before recursing.
	APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
	if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
	Depth + 1))
	return true;
	break;
	}
	case X86ISD::PACKSS:
	case X86ISD::PACKUS: {
	SDValue N0 = Op.getOperand(0);
	SDValue N1 = Op.getOperand(1);

	// Split the demanded result elements into per-operand demanded masks.
	APInt DemandedLHS, DemandedRHS;
	getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);

	APInt SrcUndef, SrcZero;
	if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO,
	Depth + 1))
	return true;
	if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO,
	Depth + 1))
	return true;

	// Aggressively peek through ops to get at the demanded elts.
	// TODO - we should do this for all target/faux shuffles ops.
	if (!DemandedElts.isAllOnesValue()) {
	SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
	TLO.DAG, Depth + 1);
	SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
	TLO.DAG, Depth + 1);
	if (NewN0 \|\| NewN1) {
	// Keep the original operand wherever no simpler value was found.
	NewN0 = NewN0 ? NewN0 : N0;
	NewN1 = NewN1 ? NewN1 : N1;
	return TLO.CombineTo(Op,
	TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
	}
	}
	break;
	}
	case X86ISD::HADD:
	case X86ISD::HSUB:
	case X86ISD::FHADD:
	case X86ISD::FHSUB: {
	// Split the demanded result elements into per-operand demanded masks.
	APInt DemandedLHS, DemandedRHS;
	getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);

	APInt LHSUndef, LHSZero;
	if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
	LHSZero, TLO, Depth + 1))
	return true;
	APInt RHSUndef, RHSZero;
	if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
	RHSZero, TLO, Depth + 1))
	return true;
	break;
	}
	case X86ISD::VTRUNC:
	case X86ISD::VTRUNCS:
	case X86ISD::VTRUNCUS: {
	SDValue Src = Op.getOperand(0);
	MVT SrcVT = Src.getSimpleValueType();
	// Resize the demanded mask to the source's element count before recursing.
	APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
	APInt SrcUndef, SrcZero;
	if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
	Depth + 1))
	return true;
	// Resize the source's known masks back to the result's element count.
	KnownZero = SrcZero.zextOrTrunc(NumElts);
	KnownUndef = SrcUndef.zextOrTrunc(NumElts);
	break;
	}
	case X86ISD::BLENDV: {
	// Operand 0 is simplified first, then operands 1 and 2, all against the
	// same demanded elements.
	APInt SelUndef, SelZero;
	if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
	SelZero, TLO, Depth + 1))
	return true;

	// TODO: Use SelZero to adjust LHS/RHS DemandedElts.
	APInt LHSUndef, LHSZero;
	if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
	LHSZero, TLO, Depth + 1))
	return true;

	APInt RHSUndef, RHSZero;
	if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
	RHSZero, TLO, Depth + 1))
	return true;

	// An element is known zero/undef only when it is so in both operands 1
	// and 2.
	KnownZero = LHSZero & RHSZero;
	KnownUndef = LHSUndef & RHSUndef;
	break;
	}
	case X86ISD::VZEXT_MOVL: {
	// If upper demanded elements are already zero then we have nothing to do.
	SDValue Src = Op.getOperand(0);
	APInt DemandedUpperElts = DemandedElts;
	DemandedUpperElts.clearLowBits(1);
	if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
	return TLO.CombineTo(Op, Src);
	break;
	}
	case X86ISD::VBROADCAST: {
	SDValue Src = Op.getOperand(0);
	MVT SrcVT = Src.getSimpleValueType();
	// Non-vector broadcast sources are not simplified here.
	if (!SrcVT.isVector())
	return false;
	// Don't bother broadcasting if we just need the 0'th element.
	if (DemandedElts == 1) {
	if (Src.getValueType() != VT)
	Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
	SDLoc(Op));
	return TLO.CombineTo(Op, Src);
	}
	// Only element 0 of the source is demanded.
	APInt SrcUndef, SrcZero;
	APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
	if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
	Depth + 1))
	return true;
	// Aggressively peek through src to get at the demanded elt.
	// TODO - we should do this for all target/faux shuffles ops.
	if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
	Src, SrcElts, TLO.DAG, Depth + 1))
	return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
	break;
	}
	// Variable-mask shuffles: the third argument is the operand index that is
	// passed as MaskIndex to the shuffle-mask simplification helper.
	case X86ISD::VPERMV:
	if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
	Depth))
	return true;
	break;
	case X86ISD::PSHUFB:
	case X86ISD::VPERMV3:
	case X86ISD::VPERMILPV:
	if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
	Depth))
	return true;
	break;
	case X86ISD::VPPERM:
	case X86ISD::VPERMIL2:
	if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
	Depth))
	return true;
	break;
	}

	// For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
	// demand any of the high elements, then narrow the op to 128/256-bits: e.g.
	// (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
	if ((VT.is256BitVector() \|\| VT.is512BitVector()) &&
	DemandedElts.lshr(NumElts / 2) == 0) {
	// By default narrow to half of the original width.
	unsigned SizeInBits = VT.getSizeInBits();
	unsigned ExtSizeInBits = SizeInBits / 2;

	// See if 512-bit ops only use the bottom 128-bits.
	if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
	ExtSizeInBits = SizeInBits / 4;

	switch (Opc) {
	// Subvector broadcast.
	case X86ISD::SUBV_BROADCAST: {
	SDLoc DL(Op);
	SDValue Src = Op.getOperand(0);
	// Bring the source to exactly ExtSizeInBits: extract its low part if it is
	// wider, or broadcast it to a narrower vector type if it is smaller.
	if (Src.getValueSizeInBits() > ExtSizeInBits)
	Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
	else if (Src.getValueSizeInBits() < ExtSizeInBits) {
	MVT SrcSVT = Src.getSimpleValueType().getScalarType();
	MVT SrcVT =
	MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
	Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
	}
	return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
	TLO.DAG, DL, ExtSizeInBits));
	}
	// Byte shifts by immediate.
	case X86ISD::VSHLDQ:
	case X86ISD::VSRLDQ:
	// Shift by uniform.
	case X86ISD::VSHL:
	case X86ISD::VSRL:
	case X86ISD::VSRA:
	// Shift by immediate.
	case X86ISD::VSHLI:
	case X86ISD::VSRLI:
	case X86ISD::VSRAI: {
	// Narrow only operand 0; operand 1 (the shift amount) is reused as is.
	SDLoc DL(Op);
	SDValue Ext0 =
	extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
	SDValue ExtOp =
	TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
	SDValue UndefVec = TLO.DAG.getUNDEF(VT);
	SDValue Insert =
	insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
	return TLO.CombineTo(Op, Insert);
	}
	case X86ISD::VPERMI: {
	// Simplify PERMPD/PERMQ to extract_subvector.
	// TODO: This should be done in shuffle combining.
	if (VT == MVT::v4f64 \|\| VT == MVT::v4i64) {
	SmallVector<int, 4> Mask;
	DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
	// The low two result elements read source elements 2 and 3 (or are undef).
	if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
	SDLoc DL(Op);
	SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
	SDValue UndefVec = TLO.DAG.getUNDEF(VT);
	SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
	return TLO.CombineTo(Op, Insert);
	}
	}
	break;
	}
	// Zero upper elements.
	case X86ISD::VZEXT_MOVL:
	// Target unary shuffles by immediate:
	case X86ISD::PSHUFD:
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	case X86ISD::VPERMILPI:
	// (Non-Lane Crossing) Target Shuffles.
	case X86ISD::VPERMILPV:
	case X86ISD::VPERMIL2:
	case X86ISD::PSHUFB:
	case X86ISD::UNPCKL:
	case X86ISD::UNPCKH:
	case X86ISD::BLENDI:
	// Saturated Packs.
	case X86ISD::PACKSS:
	case X86ISD::PACKUS:
	// Horizontal Ops.
	case X86ISD::HADD:
	case X86ISD::HSUB:
	case X86ISD::FHADD:
	case X86ISD::FHSUB: {
	// Rebuild the op at the narrower width: every vector operand is replaced
	// by its low ExtSizeInBits, non-vector operands are passed through.
	SDLoc DL(Op);
	SmallVector<SDValue, 4> Ops;
	for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
	SDValue SrcOp = Op.getOperand(i);
	EVT SrcVT = SrcOp.getValueType();
	assert((!SrcVT.isVector() \|\| SrcVT.getSizeInBits() == SizeInBits) &&
	"Unsupported vector size");
	Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
	ExtSizeInBits)
	: SrcOp);
	}
	// Same scalar type as VT, with only as many elements as fit the new width.
	MVT ExtVT = VT.getSimpleVT();
	ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
	ExtSizeInBits / ExtVT.getScalarSizeInBits());
	SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
	SDValue UndefVec = TLO.DAG.getUNDEF(VT);
	SDValue Insert =
	insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
	return TLO.CombineTo(Op, Insert);
	}
	}
	}

	// Get target/faux shuffle mask.
	APInt OpUndef, OpZero;
	SmallVector<int, 64> OpMask;
	SmallVector<SDValue, 2> OpInputs;
	if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
	OpZero, TLO.DAG, Depth, false))
	return false;

	// Shuffle inputs must be the same size as the result.
	if (OpMask.size() != (unsigned)NumElts \|\|
	llvm::any_of(OpInputs, [VT](SDValue V) {
	return VT.getSizeInBits() != V.getValueSizeInBits() \|\|
	!V.getValueType().isVector();
	}))
	return false;

	KnownZero = OpZero;
	KnownUndef = OpUndef;

	// Check if shuffle mask can be simplified to undef/zero/identity.
	int NumSrcs = OpInputs.size();
	// Undemanded lanes are treated as undef in the mask.
	for (int i = 0; i != NumElts; ++i)
	if (!DemandedElts[i])
	OpMask[i] = SM_SentinelUndef;

	if (isUndefInRange(OpMask, 0, NumElts)) {
	KnownUndef.setAllBits();
	return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
	}
	if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
	KnownZero.setAllBits();
	return TLO.CombineTo(
	Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
	}
	// Identity: the mask is sequential (or undef) starting at one input's base
	// index, so that input can stand in for the whole shuffle.
	for (int Src = 0; Src != NumSrcs; ++Src)
	if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
	return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));

	// Attempt to simplify inputs.
	for (int Src = 0; Src != NumSrcs; ++Src) {
	// TODO: Support inputs of different types.
	if (OpInputs[Src].getValueType() != VT)
	continue;

	// Collect the elements of this input that the demanded lanes reference;
	// mask values for input Src lie in [Lo, Lo + NumElts).
	int Lo = Src * NumElts;
	APInt SrcElts = APInt::getNullValue(NumElts);
	for (int i = 0; i != NumElts; ++i)
	if (DemandedElts[i]) {
	int M = OpMask[i] - Lo;
	if (0 <= M && M < NumElts)
	SrcElts.setBit(M);
	}

	// TODO - Propagate input undef/zero elts.
	APInt SrcUndef, SrcZero;
	if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
	TLO, Depth + 1))
	return true;
	}

	// If we don't demand all elements, then attempt to combine to a simpler
	// shuffle.
	// TODO: Handle other depths, but first we need to handle the fact that
	// it might combine to the same shuffle.
	if (!DemandedElts.isAllOnesValue() && Depth == 0) {
	// Identity mask over the demanded lanes; every other lane is undef.
	SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
	for (int i = 0; i != NumElts; ++i)
	if (DemandedElts[i])
	DemandedMask[i] = i;

	// NOTE(review): the slash-delimited HasVarMask / AllowVarMask markers in the
	// call below look like argument-name comments whose '*' delimiters were
	// lost when this text was extracted - confirm against the upstream source.
	SDValue NewShuffle = combineX86ShufflesRecursively(
	{Op}, 0, Op, DemandedMask, {}, Depth, /HasVarMask/ false,
	/AllowVarMask/ true, TLO.DAG, Subtarget);
	if (NewShuffle)
	return TLO.CombineTo(Op, NewShuffle);
	}

	return false;
	}

	// Target hook: simplify the X86-specific node Op given that only the bits in
	// OriginalDemandedBits of the vector elements in OriginalDemandedElts are
	// used. Known receives known-bits information for the opcodes that compute
	// it below.
	//
	// Returns true when a recursive simplification reported a change, or the
	// result of TLO.CombineTo when Op itself is replaced. Cases that 'break' out
	// of the switch, and all unhandled opcodes, defer to
	// TargetLowering::SimplifyDemandedBitsForTargetNode.
	bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
	SDValue Op, const APInt &OriginalDemandedBits,
	const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
	unsigned Depth) const {
	EVT VT = Op.getValueType();
	unsigned BitWidth = OriginalDemandedBits.getBitWidth();
	unsigned Opc = Op.getOpcode();
	switch(Opc) {
	case X86ISD::VTRUNC: {
	KnownBits KnownOp;
	SDValue Src = Op.getOperand(0);
	MVT SrcVT = Src.getSimpleValueType();

	// Simplify the input, using demanded bit information.
	// The demanded bits are zero-extended to the source scalar width and the
	// demanded elements are truncated to the source element count.
	APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
	APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
	if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
	return true;
	break;
	}
	case X86ISD::PMULDQ:
	case X86ISD::PMULUDQ: {
	// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
	KnownBits KnownOp;
	SDValue LHS = Op.getOperand(0);
	SDValue RHS = Op.getOperand(1);
	// FIXME: Can we bound this better?
	APInt DemandedMask = APInt::getLowBitsSet(64, 32);
	if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
	TLO, Depth + 1))
	return true;
	if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
	TLO, Depth + 1))
	return true;

	// Aggressively peek through ops to get at the demanded low bits.
	SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
	LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
	SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
	RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
	if (DemandedLHS \|\| DemandedRHS) {
	// Keep the original operand wherever no simpler value was found.
	DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
	DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
	return TLO.CombineTo(
	Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
	}
	break;
	}
	case X86ISD::VSHLI: {
	SDValue Op0 = Op.getOperand(0);

	// Shift amounts of BitWidth or more are not simplified here.
	unsigned ShAmt = Op.getConstantOperandVal(1);
	if (ShAmt >= BitWidth)
	break;

	// Bits demanded from the source are the demanded result bits shifted right.
	APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);

	// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
	// single shift. We can do this if the bottom bits (which are shifted
	// out) are never demanded.
	if (Op0.getOpcode() == X86ISD::VSRLI &&
	OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
	unsigned Shift2Amt = Op0.getConstantOperandVal(1);
	if (Shift2Amt < BitWidth) {
	// Equal amounts cancel; otherwise the sign of Diff picks the direction of
	// the single remaining shift.
	int Diff = ShAmt - Shift2Amt;
	if (Diff == 0)
	return TLO.CombineTo(Op, Op0.getOperand(0));

	unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
	SDValue NewShift = TLO.DAG.getNode(
	NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
	TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
	return TLO.CombineTo(Op, NewShift);
	}
	}

	// If we are only demanding sign bits then we can use the shift source directly.
	unsigned NumSignBits =
	TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
	unsigned UpperDemandedBits =
	BitWidth - OriginalDemandedBits.countTrailingZeros();
	if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
	return TLO.CombineTo(Op, Op0);

	if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
	TLO, Depth + 1))
	return true;

	// Move the source's known bits to their post-shift positions.
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	Known.Zero <<= ShAmt;
	Known.One <<= ShAmt;

	// Low bits known zero.
	Known.Zero.setLowBits(ShAmt);
	break;
	}
	case X86ISD::VSRLI: {
	// Shift amounts of BitWidth or more are not simplified here.
	unsigned ShAmt = Op.getConstantOperandVal(1);
	if (ShAmt >= BitWidth)
	break;

	// Bits demanded from the source are the demanded result bits shifted left.
	APInt DemandedMask = OriginalDemandedBits << ShAmt;

	if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
	OriginalDemandedElts, Known, TLO, Depth + 1))
	return true;

	// Move the source's known bits to their post-shift positions.
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	Known.Zero.lshrInPlace(ShAmt);
	Known.One.lshrInPlace(ShAmt);

	// High bits known zero.
	Known.Zero.setHighBits(ShAmt);
	break;
	}
	case X86ISD::VSRAI: {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Shift amounts of BitWidth or more are not simplified here.
	unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
	if (ShAmt >= BitWidth)
	break;

	// Bits demanded from the source are the demanded result bits shifted left.
	APInt DemandedMask = OriginalDemandedBits << ShAmt;

	// If we just want the sign bit then we don't need to shift it.
	if (OriginalDemandedBits.isSignMask())
	return TLO.CombineTo(Op, Op0);

	// fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
	if (Op0.getOpcode() == X86ISD::VSHLI &&
	Op.getOperand(1) == Op0.getOperand(1)) {
	SDValue Op00 = Op0.getOperand(0);
	unsigned NumSignBits =
	TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
	if (ShAmt < NumSignBits)
	return TLO.CombineTo(Op, Op00);
	}

	// If any of the demanded bits are produced by the sign extension, we also
	// demand the input sign bit.
	if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
	DemandedMask.setSignBit();

	if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
	TLO, Depth + 1))
	return true;

	// Move the source's known bits to their post-shift positions; after this,
	// bit (BitWidth - ShAmt - 1) holds what is known about the input sign bit.
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	Known.Zero.lshrInPlace(ShAmt);
	Known.One.lshrInPlace(ShAmt);

	// If the input sign bit is known to be zero, or if none of the top bits
	// are demanded, turn this into an unsigned shift right.
	if (Known.Zero[BitWidth - ShAmt - 1] \|\|
	OriginalDemandedBits.countLeadingZeros() >= ShAmt)
	return TLO.CombineTo(
	Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));

	// High bits are known one.
	if (Known.One[BitWidth - ShAmt - 1])
	Known.One.setHighBits(ShAmt);
	break;
	}
	case X86ISD::PEXTRB:
	case X86ISD::PEXTRW: {
	SDValue Vec = Op.getOperand(0);
	auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	MVT VecVT = Vec.getSimpleValueType();
	unsigned NumVecElts = VecVT.getVectorNumElements();

	// Only constant, in-range extraction indices are handled.
	if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
	unsigned Idx = CIdx->getZExtValue();
	unsigned VecBitWidth = VecVT.getScalarSizeInBits();

	// If we demand no bits from the vector then we must have demanded
	// bits from the implicit zext - simplify to zero.
	APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
	if (DemandedVecBits == 0)
	return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));

	// Only the extracted element of the vector is demanded.
	APInt KnownUndef, KnownZero;
	APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
	if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
	KnownZero, TLO, Depth + 1))
	return true;

	KnownBits KnownVec;
	if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
	KnownVec, TLO, Depth + 1))
	return true;

	// Try to extract from a simpler value when the vector has other users.
	if (SDValue V = SimplifyMultipleUseDemandedBits(
	Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
	return TLO.CombineTo(
	Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));

	// Widen the element's known bits to the result width.
	Known = KnownVec.zext(BitWidth);
	return false;
	}
	break;
	}
	case X86ISD::PINSRB:
	case X86ISD::PINSRW: {
	SDValue Vec = Op.getOperand(0);
	SDValue Scl = Op.getOperand(1);
	auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
	MVT VecVT = Vec.getSimpleValueType();

	// Only constant, in-range insertion indices are handled.
	if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
	unsigned Idx = CIdx->getZExtValue();
	// If the inserted element is not demanded, use the base vector directly.
	if (!OriginalDemandedElts[Idx])
	return TLO.CombineTo(Op, Vec);

	// The base vector supplies every demanded element except the inserted one.
	KnownBits KnownVec;
	APInt DemandedVecElts(OriginalDemandedElts);
	DemandedVecElts.clearBit(Idx);
	if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
	KnownVec, TLO, Depth + 1))
	return true;

	// The demanded bits are zero-extended to the inserted scalar's width.
	KnownBits KnownScl;
	unsigned NumSclBits = Scl.getScalarValueSizeInBits();
	APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
	if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
	return true;

	// A bit is known in the result only when it is known, with the same value,
	// in both the base vector elements and the inserted scalar.
	KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
	Known.One = KnownVec.One & KnownScl.One;
	Known.Zero = KnownVec.Zero & KnownScl.Zero;
	return false;
	}
	break;
	}
	case X86ISD::PACKSS:
	// PACKSS saturates to MIN/MAX integer values. So if we just want the
	// sign bit then we can just ask for the source operands sign bit.
	// TODO - add known bits handling.
	if (OriginalDemandedBits.isSignMask()) {
	APInt DemandedLHS, DemandedRHS;
	getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);

	// The sign mask is built at twice the result element width.
	KnownBits KnownLHS, KnownRHS;
	APInt SignMask = APInt::getSignMask(BitWidth * 2);
	if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
	KnownLHS, TLO, Depth + 1))
	return true;
	if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
	KnownRHS, TLO, Depth + 1))
	return true;

	// Attempt to avoid multi-use ops if we don't need anything from them.
	SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
	Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
	SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
	Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
	if (DemandedOp0 \|\| DemandedOp1) {
	// Keep the original operand wherever no simpler value was found.
	SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
	SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
	return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
	}
	}
	// TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
	break;
	case X86ISD::PCMPGT:
	// icmp sgt(0, R) == ashr(R, BitWidth-1).
	// iff we only need the sign bit then we can use R directly.
	if (OriginalDemandedBits.isSignMask() &&
	ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
	return TLO.CombineTo(Op, Op.getOperand(1));
	break;
	case X86ISD::MOVMSK: {
	SDValue Src = Op.getOperand(0);
	MVT SrcVT = Src.getSimpleValueType();
	unsigned SrcBits = SrcVT.getScalarSizeInBits();
	unsigned NumElts = SrcVT.getVectorNumElements();

	// If we don't need the sign bits at all just return zero.
	if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
	return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));

	// Only demand the vector elements of the sign bits we need.
	APInt KnownUndef, KnownZero;
	APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
	if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
	TLO, Depth + 1))
	return true;

	// Known-zero source elements give known-zero result bits, and the result
	// bits at positions NumElts and above are marked zero as well.
	Known.Zero = KnownZero.zextOrSelf(BitWidth);
	Known.Zero.setHighBits(BitWidth - NumElts);

	// MOVMSK only uses the MSB from each vector element.
	KnownBits KnownSrc;
	APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
	if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
	Depth + 1))
	return true;

	// A known source sign bit fixes the low NumElts result bits.
	if (KnownSrc.One[SrcBits - 1])
	Known.One.setLowBits(NumElts);
	else if (KnownSrc.Zero[SrcBits - 1])
	Known.Zero.setLowBits(NumElts);

	// Attempt to avoid multi-use ops if we don't need anything from them.
	if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
	Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
	return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
	return false;
	}
	case X86ISD::BEXTR: {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Only bottom 16-bits of the control bits are required.
	if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
	// NOTE: SimplifyDemandedBits won't do this for constants.
	const APInt &Val1 = Cst1->getAPIntValue();
	APInt MaskedVal1 = Val1 & 0xFFFF;
	// Rebuild the node only when masking actually changes the constant.
	if (MaskedVal1 != Val1) {
	SDLoc DL(Op);
	return TLO.CombineTo(
	Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
	TLO.DAG.getConstant(MaskedVal1, DL, VT)));
	}
	}

	KnownBits Known1;
	APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
	if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
	return true;

	// If the length is 0, replace with 0.
	// The length is the 8-bit field of the control value starting at bit 8.
	KnownBits LengthBits = Known1.extractBits(8, 8);
	if (LengthBits.isZero())
	return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));

	break;
	}
	}

	// Anything not fully handled above goes to the generic implementation.
	return TargetLowering::SimplifyDemandedBitsForTargetNode(
	Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
	}

	// Target hook: given that only DemandedBits of the elements in DemandedElts
	// of Op are used, try to return an existing (or trivially constructed) value
	// that can be used instead of Op. Op itself is not modified. Opcodes that
	// are not handled here, and handled cases that find nothing, defer to
	// TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode.
	SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
	    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
	    SelectionDAG &DAG, unsigned Depth) const {
	  const unsigned Opcode = Op.getOpcode();
	  const EVT VT = Op.getValueType();
	  const int NumElts = DemandedElts.getBitWidth();

	  switch (Opcode) {
	  case X86ISD::PINSRB:
	  case X86ISD::PINSRW: {
	    // An insertion into an element nobody demands can be skipped: hand back
	    // the base vector. Only constant, in-range indices qualify.
	    SDValue BaseVec = Op.getOperand(0);
	    MVT BaseVT = BaseVec.getSimpleValueType();
	    if (auto *InsIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2)))
	      if (InsIdx->getAPIntValue().ult(BaseVT.getVectorNumElements()))
	        if (!DemandedElts[InsIdx->getZExtValue()])
	          return BaseVec;
	    break;
	  }
	  case X86ISD::VSHLI: {
	    // When every demanded bit is still one of the source's sign bits after
	    // the shift, the shift source can be used directly.
	    SDValue ShiftSrc = Op.getOperand(0);
	    const unsigned ShiftAmt = Op.getConstantOperandVal(1);
	    const unsigned SignBits =
	        DAG.ComputeNumSignBits(ShiftSrc, DemandedElts, Depth + 1);
	    const unsigned DemandedWidth =
	        DemandedBits.getBitWidth() - DemandedBits.countTrailingZeros();
	    if (SignBits > ShiftAmt && (SignBits - ShiftAmt) >= DemandedWidth)
	      return ShiftSrc;
	    break;
	  }
	  case X86ISD::VSRAI:
	    // iff we only need the sign bit then we can use the source directly.
	    // TODO: generalize where we only demand extended signbits.
	    if (!DemandedBits.isSignMask())
	      break;
	    return Op.getOperand(0);
	  case X86ISD::PCMPGT:
	    // icmp sgt(0, R) == ashr(R, BitWidth-1).
	    // iff we only need the sign bit then we can use R directly.
	    if (!DemandedBits.isSignMask())
	      break;
	    if (!ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
	      break;
	    return Op.getOperand(1);
	  }

	  // Treat Op as a target/faux shuffle. The mask must have one entry per
	  // result element and every input must be as wide as the result.
	  APInt UndefElts, ZeroElts;
	  SmallVector<int, 16> Mask;
	  SmallVector<SDValue, 2> Inputs;
	  if (getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, UndefElts,
	                             ZeroElts, DAG, Depth, false) &&
	      Mask.size() == (unsigned)NumElts &&
	      llvm::all_of(Inputs, [VT](SDValue V) {
	        return VT.getSizeInBits() == V.getValueSizeInBits();
	      })) {
	    // All demanded elements undef, or undef/zero: return a constant.
	    if (DemandedElts.isSubsetOf(UndefElts))
	      return DAG.getUNDEF(VT);
	    if (DemandedElts.isSubsetOf(UndefElts | ZeroElts))
	      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));

	    // Bitmask that indicates which ops have only been accessed 'inline',
	    // i.e. every demanded, non-undef result element reads the same-numbered
	    // element of that op.
	    const int NumInputs = Inputs.size();
	    APInt InlineInputs = APInt::getAllOnesValue(NumInputs);
	    for (int Elt = 0; Elt < NumElts; ++Elt) {
	      if (!DemandedElts[Elt])
	        continue;
	      if (UndefElts[Elt])
	        continue;

	      const int M = Mask[Elt];
	      if (M < 0) {
	        InlineInputs.clearAllBits();
	        break;
	      }
	      if (M % NumElts != Elt) {
	        InlineInputs.clearAllBits();
	        break;
	      }

	      InlineInputs &= APInt::getOneBitSet(NumInputs, M / NumElts);
	      if (InlineInputs == 0)
	        break;
	    }
	    assert(InlineInputs.countPopulation() <= 1 &&
	           "Multiple identity shuffles detected");

	    // Exactly one surviving input: the shuffle is an identity of it.
	    if (InlineInputs != 0)
	      return DAG.getBitcast(VT, Inputs[InlineInputs.countTrailingZeros()]);
	  }

	  return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
	      Op, DemandedBits, DemandedElts, DAG, Depth);
	}

	// Returns true if Src is a SETCC whose first operand is exactly Size bits
	// wide, or an AND/XOR/OR tree whose leaves are all such SETCCs. Any other
	// opcode yields false.
	// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
	static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
	  const unsigned Opcode = Src.getOpcode();

	  // Leaf: the compared vector must have the requested size.
	  if (Opcode == ISD::SETCC)
	    return Src.getOperand(0).getValueSizeInBits() == Size;

	  // Only bitwise logic ops are looked through.
	  if (Opcode != ISD::AND && Opcode != ISD::XOR && Opcode != ISD::OR)
	    return false;

	  // Both sides must qualify; the right side is only checked if the left did.
	  if (!checkBitcastSrcVectorSize(Src.getOperand(0), Size))
	    return false;
	  return checkBitcastSrcVectorSize(Src.getOperand(1), Size);
	}

	// Maps an integer bitwise opcode (ISD::AND, ISD::OR, ISD::XOR or
	// X86ISD::ANDNP) to its X86ISD floating-point counterpart. Any other opcode
	// is a programming error and hits llvm_unreachable.
	static unsigned getAltBitOpcode(unsigned Opcode) {
	  if (Opcode == ISD::AND)
	    return X86ISD::FAND;
	  if (Opcode == ISD::OR)
	    return X86ISD::FOR;
	  if (Opcode == ISD::XOR)
	    return X86ISD::FXOR;
	  if (Opcode == X86ISD::ANDNP)
	    return X86ISD::FANDN;
	  llvm_unreachable("Unknown bitwise opcode");
	}

	// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
	// Given a v4i1 value built from (setcc v4i32 X, zero, setlt) leaves combined
	// with AND/XOR/OR, returns an equivalent v4f32 expression, or an empty
	// SDValue when Src does not have that shape. A SETCC leaf is only accepted
	// when X is a normal load or a bitcast from v4f32.
	static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
	                                          const SDLoc &DL) {
	  if (Src.getValueType() != MVT::v4i1)
	    return SDValue();

	  const unsigned Opcode = Src.getOpcode();

	  if (Opcode == ISD::SETCC) {
	    // Must be a "v4i32 X < 0" comparison.
	    SDValue Cmp0 = Src.getOperand(0);
	    if (Cmp0.getValueType() != MVT::v4i32)
	      return SDValue();
	    if (!ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()))
	      return SDValue();
	    if (cast<CondCodeSDNode>(Src.getOperand(2))->get() != ISD::SETLT)
	      return SDValue();

	    // Reinterpret a loaded value as v4f32, or strip an existing bitcast
	    // from v4f32.
	    if (ISD::isNormalLoad(Cmp0.getNode()))
	      return DAG.getBitcast(MVT::v4f32, Cmp0);
	    if (Cmp0.getOpcode() == ISD::BITCAST &&
	        Cmp0.getOperand(0).getValueType() == MVT::v4f32)
	      return Cmp0.getOperand(0);
	    return SDValue();
	  }

	  if (Opcode != ISD::AND && Opcode != ISD::XOR && Opcode != ISD::OR)
	    return SDValue();

	  // Convert both sides before testing either result, then combine them with
	  // the floating-point form of the same logic op.
	  SDValue NewLHS = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
	  SDValue NewRHS = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
	  if (!NewLHS)
	    return SDValue();
	  if (!NewRHS)
	    return SDValue();
	  return DAG.getNode(getAltBitOpcode(Opcode), DL, MVT::v4f32, NewLHS, NewRHS);
	}

	// Sign-extends a vXi1 expression to SExtVT by extending each SETCC leaf and
	// re-emitting the AND/OR/XOR nodes above it in SExtVT. Src must consist only
	// of those opcodes; anything else is a caller error.
	static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
	                                          SDValue Src, const SDLoc &DL) {
	  const unsigned Opc = Src.getOpcode();

	  // Leaf: extend the compare result itself.
	  if (Opc == ISD::SETCC)
	    return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);

	  if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR)
	    llvm_unreachable("Unexpected node type for vXi1 sign extension");

	  // Extend both inputs, then redo the logic op in the wide type.
	  SDValue LHS = signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL);
	  SDValue RHS = signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL);
	  return DAG.getNode(Opc, DL, SExtVT, LHS, RHS);
	}

	// Try to match patterns such as
	// (i16 bitcast (v16i1 x))
	// ->
	// (i16 movmsk (16i8 sext (v16i1 x)))
	// before the illegal vector is scalarized on subtargets that don't have legal
	// vxi1 types.
	//
	// Src is the vXi1 value being bitcast and VT is the type of the bitcast
	// result. Returns the replacement value, or an empty SDValue when Src is not
	// a simple vXi1 vector or none of the MOVMSK forms below is used.
	static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
	const SDLoc &DL,
	const X86Subtarget &Subtarget) {
	EVT SrcVT = Src.getValueType();
	// Only simple vectors with i1 elements are handled.
	if (!SrcVT.isSimple() \|\| SrcVT.getScalarType() != MVT::i1)
	return SDValue();

	// Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
	// legalization destroys the v4i32 type.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
	if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
	V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
	DAG.getBitcast(MVT::v4f32, V));
	return DAG.getZExtOrTrunc(V, DL, VT);
	}
	}

	// If the input is a single-use truncate from v16i8, v32i8 or v64i8 go ahead
	// and use a movmskb even with avx512. This will be better than truncating to
	// vXi1 and using a kmov. This can especially help KNL if the input is a
	// v16i8/v32i8 vpcmpeqb/vpcmpgtb.
	bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
	(Src.getOperand(0).getValueType() == MVT::v16i8 \|\|
	Src.getOperand(0).getValueType() == MVT::v32i8 \|\|
	Src.getOperand(0).getValueType() == MVT::v64i8);

	// Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
	// directly with vpmovmskb/vmovmskps/vmovmskpd.
	if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
	cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
	ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
	EVT CmpVT = Src.getOperand(0).getValueType();
	EVT EltVT = CmpVT.getVectorElementType();
	// Limited to compares of at most 256 bits with i8, i32 or i64 elements.
	if (CmpVT.getSizeInBits() <= 256 &&
	(EltVT == MVT::i8 \|\| EltVT == MVT::i32 \|\| EltVT == MVT::i64))
	PreferMovMsk = true;
	}

	// With AVX512 vxi1 types are legal and we prefer using k-regs.
	// MOVMSK is supported in SSE2 or later.
	if (!Subtarget.hasSSE2() \|\| (Subtarget.hasAVX512() && !PreferMovMsk))
	return SDValue();

	// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
	// v8f64. So all legal 128-bit and 256-bit vectors are covered except for
	// v8i16 and v16i16.
	// For these two cases, we can shuffle the upper element bytes to a
	// consecutive sequence at the start of the vector and treat the results as
	// v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
	// for v16i16 this is not the case, because the shuffle is expensive, so we
	// avoid sign-extending to this type entirely.
	// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
	// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
	//
	// SExtVT is the vector type the i1 elements are sign-extended to.
	// PropagateSExt requests that the extension be pushed through AND/OR/XOR
	// down to the SETCC leaves (via signExtendBitcastSrcVector) instead of
	// sign-extending Src itself.
	MVT SExtVT;
	bool PropagateSExt = false;
	switch (SrcVT.getSimpleVT().SimpleTy) {
	default:
	return SDValue();
	case MVT::v2i1:
	SExtVT = MVT::v2i64;
	break;
	case MVT::v4i1:
	SExtVT = MVT::v4i32;
	// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
	// sign-extend to a 256-bit operation to avoid truncation.
	if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
	SExtVT = MVT::v4i64;
	PropagateSExt = true;
	}
	break;
	case MVT::v8i1:
	SExtVT = MVT::v8i16;
	// For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
	// sign-extend to a 256-bit operation to match the compare.
	// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
	// 256-bit because the shuffle is cheaper than sign extending the result of
	// the compare.
	if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) \|\|
	checkBitcastSrcVectorSize(Src, 512))) {
	SExtVT = MVT::v8i32;
	PropagateSExt = true;
	}
	break;
	case MVT::v16i1:
	SExtVT = MVT::v16i8;
	// For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
	// it is not profitable to sign-extend to 256-bit because this will
	// require an extra cross-lane shuffle which is more expensive than
	// truncating the result of the compare to 128-bits.
	break;
	case MVT::v32i1:
	SExtVT = MVT::v32i8;
	break;
	case MVT::v64i1:
	// If we have AVX512F, but not AVX512BW and the input is truncated from
	// v64i8 checked earlier. Then split the input and make two pmovmskbs.
	if (Subtarget.hasAVX512()) {
	if (Subtarget.hasBWI())
	return SDValue();
	SExtVT = MVT::v64i8;
	break;
	}
	// Split if this is a <64 x i8> comparison result.
	if (checkBitcastSrcVectorSize(Src, 512)) {
	SExtVT = MVT::v64i8;
	break;
	}
	return SDValue();
	};

	// Sign-extend the mask so each element's sign bit carries the i1 value.
	SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
	: DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);

	// Byte vectors go through getPMOVMSKB; everything else uses MOVMSK directly,
	// with v8i16 first packed down to v16i8 using PACKSS.
	if (SExtVT == MVT::v16i8 \|\| SExtVT == MVT::v32i8 \|\| SExtVT == MVT::v64i8) {
	V = getPMOVMSKB(DL, V, DAG, Subtarget);
	} else {
	if (SExtVT == MVT::v8i16)
	V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
	DAG.getUNDEF(MVT::v8i16));
	V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
	}

	// Resize the mask to an integer with one bit per source element, then
	// bitcast it to the requested result type.
	EVT IntVT =
	EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
	V = DAG.getZExtOrTrunc(V, DL, IntVT);
	return DAG.getBitcast(VT, V);
	}

	// Folds a constant vXi1 BUILD_VECTOR into a scalar integer constant with one
	// bit per vector element: bit Idx is set when operand Idx is a constant whose
	// low bit is set. Undef operands contribute a zero bit.
	static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
	  EVT SrcVT = Op.getValueType();
	  assert(SrcVT.getVectorElementType() == MVT::i1 &&
	         "Expected a vXi1 vector");
	  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
	         "Expected a constant build vector");

	  APInt MaskBits(SrcVT.getVectorNumElements(), 0);
	  const unsigned NumOps = Op.getNumOperands();
	  for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
	    SDValue Elt = Op.getOperand(Idx);
	    // Undef elements are treated as zero.
	    if (Elt.isUndef())
	      continue;
	    if (cast<ConstantSDNode>(Elt)->getZExtValue() & 0x1)
	      MaskBits.setBit(Idx);
	  }

	  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), MaskBits.getBitWidth());
	  return DAG.getConstant(MaskBits, SDLoc(Op), IntVT);
	}

	// For a bitcast N between a vXi1 mask type and a scalar integer whose operand
	// is a single-use AND/OR/XOR, tries to perform the logic op in the bitcast's
	// destination type instead. This is done when one logic operand is itself a
	// single-use bitcast from the destination type, or when the RHS is a constant
	// build vector. Only runs before operation legalization on AVX512 subtargets;
	// returns an empty SDValue when nothing is changed.
	static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
	                                           TargetLowering::DAGCombinerInfo &DCI,
	                                           const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");

	  if (!DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // Only do this if we have k-registers.
	  if (!Subtarget.hasAVX512())
	    return SDValue();

	  SDValue Logic = N->getOperand(0);
	  if (!Logic.hasOneUse())
	    return SDValue();

	  // Look for logic ops.
	  const unsigned LogicOpc = Logic.getOpcode();
	  switch (LogicOpc) {
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	    break;
	  default:
	    return SDValue();
	  }

	  // Make sure we have a bitcast between mask registers and a scalar type.
	  EVT DstVT = N->getValueType(0);
	  EVT SrcVT = Logic.getValueType();
	  const bool MaskToScalar = SrcVT.isVector() &&
	                            SrcVT.getVectorElementType() == MVT::i1 &&
	                            DstVT.isScalarInteger();
	  const bool ScalarToMask = DstVT.isVector() &&
	                            DstVT.getVectorElementType() == MVT::i1 &&
	                            SrcVT.isScalarInteger();
	  if (!MaskToScalar && !ScalarToMask)
	    return SDValue();

	  SDLoc DL(N);
	  SDValue LHS = Logic.getOperand(0);
	  SDValue RHS = Logic.getOperand(1);

	  // LHS already comes from the destination type: cast only the RHS.
	  if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST) {
	    SDValue Inner = LHS.getOperand(0);
	    if (Inner.getValueType() == DstVT) {
	      SDValue CastRHS = DAG.getBitcast(DstVT, RHS);
	      return DAG.getNode(LogicOpc, DL, DstVT, Inner, CastRHS);
	    }
	  }

	  // RHS already comes from the destination type: cast only the LHS.
	  if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST) {
	    SDValue Inner = RHS.getOperand(0);
	    if (Inner.getValueType() == DstVT) {
	      SDValue CastLHS = DAG.getBitcast(DstVT, LHS);
	      return DAG.getNode(LogicOpc, DL, DstVT, CastLHS, Inner);
	    }
	  }

	  // If the RHS is a vXi1 build vector, this is a good reason to flip too.
	  // Most of these have to move a constant from the scalar domain anyway.
	  if (!ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
	    return SDValue();

	  SDValue Imm = combinevXi1ConstantToInteger(RHS, DAG);
	  SDValue CastLHS = DAG.getBitcast(DstVT, LHS);
	  return DAG.getNode(LogicOpc, DL, DstVT, CastLHS, Imm);
	}

	// Builds an x86mmx value from the operands of the build vector BV. Each
	// operand is first moved into the bottom of its own MMX value. A splat is
	// then broadcast with (PUNPCKLBW+)PSHUFW when SSE1 is available; otherwise
	// the per-element values are merged pairwise with a tree of PUNPCKL
	// intrinsics.
	static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  SDLoc DL(BV);
	  const unsigned NumElts = BV->getNumOperands();

	  // Build MMX element from integer GPR or SSE float values.
	  auto ToMMXElement = [&](SDValue Elt) {
	    if (Elt.isUndef())
	      return DAG.getUNDEF(MVT::x86mmx);

	    if (!Elt.getValueType().isFloatingPoint()) {
	      Elt = DAG.getAnyExtOrTrunc(Elt, DL, MVT::i32);
	    } else if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(Elt)) {
	      // Non-constant float with SSE1: go through an SSE register.
	      SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Elt);
	      Vec = DAG.getBitcast(MVT::v2i64, Vec);
	      return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, Vec);
	    } else {
	      Elt = DAG.getBitcast(MVT::i32, Elt);
	    }
	    return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, Elt);
	  };

	  // Convert build vector ops to MMX data in the bottom elements.
	  SmallVector<SDValue, 8> Ops;

	  if (SDValue Splat = BV->getSplatValue()) {
	    // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
	    if (Splat.isUndef())
	      return DAG.getUNDEF(MVT::x86mmx);

	    Splat = ToMMXElement(Splat);

	    if (!Subtarget.hasSSE1()) {
	      // No PSHUFW here: fall through to the generic unpack tree below.
	      Ops.append(NumElts, Splat);
	    } else {
	      // Unpack v8i8 to splat i8 elements to lowest 16-bits.
	      if (NumElts == 8) {
	        SDValue UnpackId =
	            DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32);
	        Splat = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, UnpackId,
	                            Splat, Splat);
	      }

	      // Use PSHUFW to repeat 16-bit elements.
	      const unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
	      SDValue ShufId =
	          DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32);
	      SDValue ShufImm = DAG.getTargetConstant(ShufMask, DL, MVT::i8);
	      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, ShufId,
	                         Splat, ShufImm);
	    }
	  } else {
	    for (unsigned Idx = 0; Idx != NumElts; ++Idx)
	      Ops.push_back(ToMMXElement(BV->getOperand(Idx)));
	  }

	  // Use tree of PUNPCKLs to build up general MMX vector.
	  while (Ops.size() > 1) {
	    const unsigned NumOps = Ops.size();

	    // The unpack granularity depends on how many values are still separate.
	    unsigned IntrinOp;
	    switch (NumOps) {
	    case 2:
	      IntrinOp = Intrinsic::x86_mmx_punpckldq;
	      break;
	    case 4:
	      IntrinOp = Intrinsic::x86_mmx_punpcklwd;
	      break;
	    default:
	      IntrinOp = Intrinsic::x86_mmx_punpcklbw;
	      break;
	    }
	    SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);

	    // Merge neighbouring values in place, then drop the unused tail.
	    for (unsigned Lo = 0; Lo != NumOps; Lo += 2)
	      Ops[Lo / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
	                                Intrin, Ops[Lo], Ops[Lo + 1]);
	    Ops.resize(NumOps / 2);
	  }

	  return Ops[0];
	}

	// Recursive function that attempts to find if a bool vector node was originally
	// a vector/float/double that got truncated/extended/bitcast to/from a scalar
	// integer. If so, replace the scalar ops with bool vector equivalents back down
	// the chain:
	//   bitcast        -> bitcast of the original source to VT
	//   truncate       -> EXTRACT_SUBVECTOR at index 0 of the wider bool vector
	//   any/zero ext   -> INSERT_SUBVECTOR at index 0 into undef/zero
	//   or             -> OR of the two converted operands
	//   shl by const   -> X86ISD::KSHIFTL by the same amount
	//
	// VT is the bool vector type to produce for the scalar value V. Returns an
	// empty SDValue if V does not lead back to a suitable source.
	//
	// DL is taken by const reference so that the recursion does not copy the
	// location at every level.
	static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
	                                          SelectionDAG &DAG,
	                                          const X86Subtarget &Subtarget) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  unsigned Opc = V.getOpcode();
	  switch (Opc) {
	  case ISD::BITCAST: {
	    // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
	    SDValue Src = V.getOperand(0);
	    EVT SrcVT = Src.getValueType();
	    if (!SrcVT.isVector() && !SrcVT.isFloatingPoint())
	      break;
	    return DAG.getBitcast(VT, Src);
	  }
	  case ISD::TRUNCATE: {
	    // If we find a suitable source, a truncated scalar becomes a subvector.
	    SDValue Src = V.getOperand(0);
	    // One i1 element per bit of the wider scalar.
	    EVT NewSrcVT =
	        EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
	    if (TLI.isTypeLegal(NewSrcVT))
	      if (SDValue N0 =
	              combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
	        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
	                           DAG.getIntPtrConstant(0, DL));
	    break;
	  }
	  case ISD::ANY_EXTEND:
	  case ISD::ZERO_EXTEND: {
	    // If we find a suitable source, an extended scalar becomes a subvector.
	    SDValue Src = V.getOperand(0);
	    // One i1 element per bit of the narrower scalar.
	    EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
	                                    Src.getScalarValueSizeInBits());
	    if (TLI.isTypeLegal(NewSrcVT))
	      if (SDValue N0 =
	              combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
	        // The upper elements are undef for any-extend and zero for
	        // zero-extend.
	        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
	                           Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
	                                                  : DAG.getConstant(0, DL, VT),
	                           N0, DAG.getIntPtrConstant(0, DL));
	    break;
	  }
	  case ISD::OR: {
	    // If we find suitable sources, we can just move an OR to the vector domain.
	    SDValue Src0 = V.getOperand(0);
	    SDValue Src1 = V.getOperand(1);
	    if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
	      if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
	        return DAG.getNode(Opc, DL, VT, N0, N1);
	    break;
	  }
	  case ISD::SHL: {
	    // If we find a suitable source, a SHL becomes a KSHIFTL.
	    // Only constant shift amounts are handled.
	    SDValue Src0 = V.getOperand(0);
	    if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
	      if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
	        return DAG.getNode(
	            X86ISD::KSHIFTL, DL, VT, N0,
	            DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
	    break;
	  }
	  }
	  return SDValue();
	}

	// Target combine for a bitcast node N (operand 0 is the value being cast; N
	// is expected to be an ISD::BITCAST, which combineCastedMaskArithmetic, called
	// below, asserts). The transforms are tried in the order written, and the
	// first one that applies returns the replacement value:
	//  - vXi1 <-> scalar integer bitcasts (MOVMSK forms, widening of v2i1/v4i1,
	//    recovery of an original bool vector, constants, MOVMSK feeding a vXi1),
	//  - bitcast of a VBROADCAST_LOAD across the integer/fp domains,
	//  - bitcasts to x86mmx,
	//  - mask arithmetic wrapped in bitcasts,
	//  - integer logic ops with a bitcasted f32/f64 operand.
	// Returns an empty SDValue when nothing applies.
	static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	EVT SrcVT = N0.getValueType();

	// Try to match patterns such as
	// (i16 bitcast (v16i1 x))
	// ->
	// (i16 movmsk (16i8 sext (v16i1 x)))
	// before the setcc result is scalarized on subtargets that don't have legal
	// vxi1 types.
	if (DCI.isBeforeLegalize()) {
	SDLoc dl(N);
	if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
	return V;

	// If this is a bitcast from an illegal integer type to MVT::v4i1/v2i1,
	// widen both sides (any-extend to i8, bitcast to v8i1, extract the low
	// subvector) to avoid a trip through memory.
	if ((VT == MVT::v4i1 \|\| VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
	Subtarget.hasAVX512()) {
	N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
	N0 = DAG.getBitcast(MVT::v8i1, N0);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
	DAG.getIntPtrConstant(0, dl));
	}

	// If this is a bitcast from MVT::v4i1/v2i1 to an illegal integer type,
	// widen both sides (concat to v8i1, bitcast to i8, truncate) to avoid a
	// trip through memory.
	if ((SrcVT == MVT::v4i1 \|\| SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
	Subtarget.hasAVX512()) {
	// Use zeros for the widening if we already have some zeroes. This can
	// allow SimplifyDemandedBits to remove scalar ANDs that may be down
	// stream of this.
	// FIXME: It might make sense to detect a concat_vectors with a mix of
	// zeroes and undef and turn it into insert_subvector for i1 vectors as
	// a separate combine. What we can't do is canonicalize the operands of
	// such a concat or we'll get into a loop with SimplifyDemandedBits.
	if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
	SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
	if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
	// Re-concatenate the existing pieces and pad with zero vectors of the
	// piece type up to 8 elements.
	SrcVT = LastOp.getValueType();
	unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
	SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
	Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
	N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
	N0 = DAG.getBitcast(MVT::i8, N0);
	return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
	}
	}

	// Otherwise pad N0 with undef vectors up to 8 elements.
	unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
	SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
	Ops[0] = N0;
	N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
	N0 = DAG.getBitcast(MVT::i8, N0);
	return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
	}
	} else {
	// If we're bitcasting from iX to vXi1, see if the integer originally
	// began as a vXi1 and whether we can remove the bitcast entirely.
	if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
	SrcVT.isScalarInteger() &&
	DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
	if (SDValue V =
	combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
	return V;
	}
	}

	// Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
	// replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
	// due to insert_subvector legalization on KNL. By promoting the copy to i16
	// we can help with known bits propagation from the vXi1 domain to the
	// scalar domain.
	if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
	!Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	N0.getOperand(0).getValueType() == MVT::v16i1 &&
	isNullConstant(N0.getOperand(1)))
	return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
	DAG.getBitcast(MVT::i16, N0.getOperand(0)));

	// Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
	// and the vbroadcast_load are both integer or both fp. In some cases this
	// will remove the bitcast entirely.
	if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
	VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
	auto *BCast = cast<MemIntrinsicSDNode>(N0);
	unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
	unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
	// Don't swap i8/i16 since we don't have fp types of that size.
	if (MemSize >= 32) {
	// Pick memory and result element types in the same domain (fp or integer)
	// as VT, keeping the original element sizes and element count.
	MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
	: MVT::getIntegerVT(MemSize);
	MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
	: MVT::getIntegerVT(SrcVTSize);
	LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());

	SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
	SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
	SDValue ResNode =
	DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
	MemVT, BCast->getMemOperand());
	// Redirect users of the old load's chain result to the new load.
	DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
	return DAG.getBitcast(VT, ResNode);
	}
	}

	// Since MMX types are special and don't usually play with other vector types,
	// it's better to handle them early to be sure we emit efficient code by
	// avoiding store-load conversions.
	if (VT == MVT::x86mmx) {
	// Detect MMX constant vectors.
	APInt UndefElts;
	SmallVector<APInt, 1> EltBits;
	if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
	SDLoc DL(N0);
	// Handle zero-extension of i32 with MOVD.
	if (EltBits[0].countLeadingZeros() >= 32)
	return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
	DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
	// Else, bitcast to a double.
	// TODO - investigate supporting sext 32-bit immediates on x86_64.
	APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
	return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
	}

	// Detect bitcasts to x86mmx low word.
	if (N0.getOpcode() == ISD::BUILD_VECTOR &&
	(SrcVT == MVT::v2i32 \|\| SrcVT == MVT::v4i16 \|\| SrcVT == MVT::v8i8) &&
	N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
	// LowUndef: every element after the first in the lower half is undef.
	// AllUndefOrZero: every element after the first is undef or zero.
	bool LowUndef = true, AllUndefOrZero = true;
	for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
	SDValue Op = N0.getOperand(i);
	LowUndef &= Op.isUndef() \|\| (i >= e/2);
	AllUndefOrZero &= (Op.isUndef() \|\| isNullConstant(Op));
	}
	if (AllUndefOrZero) {
	SDValue N00 = N0.getOperand(0);
	SDLoc dl(N00);
	// Any-extend when the rest of the low half is undef, otherwise
	// zero-extend so the zero elements are preserved.
	N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
	: DAG.getZExtOrTrunc(N00, dl, MVT::i32);
	return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
	}
	}

	// Detect bitcasts of 64-bit build vectors and convert to a
	// MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
	// lowest element.
	if (N0.getOpcode() == ISD::BUILD_VECTOR &&
	(SrcVT == MVT::v2f32 \|\| SrcVT == MVT::v2i32 \|\| SrcVT == MVT::v4i16 \|\|
	SrcVT == MVT::v8i8))
	return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);

	// Detect bitcasts between element or subvector extraction to x86mmx.
	if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT \|\|
	N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
	isNullConstant(N0.getOperand(1))) {
	SDValue N00 = N0.getOperand(0);
	if (N00.getValueType().is128BitVector())
	return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
	DAG.getBitcast(MVT::v2i64, N00));
	}

	// Detect bitcasts from FP_TO_SINT to x86mmx.
	if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
	SDLoc DL(N0);
	// Widen to v4i32 with an undef upper half, then move the low 64 bits.
	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
	DAG.getUNDEF(MVT::v2i32));
	return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
	DAG.getBitcast(MVT::v2i64, Res));
	}
	}

	// Try to remove a bitcast of constant vXi1 vector. We have to legalize
	// most of these to scalar anyway.
	if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
	SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
	ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
	return combinevXi1ConstantToInteger(N0, DAG);
	}

	// Fold a bitcast of an all-ones or all-zeros integer constant to vXi1 into
	// the corresponding vector constant.
	if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
	VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
	isa<ConstantSDNode>(N0)) {
	auto *C = cast<ConstantSDNode>(N0);
	if (C->isAllOnesValue())
	return DAG.getConstant(1, SDLoc(N0), VT);
	if (C->isNullValue())
	return DAG.getConstant(0, SDLoc(N0), VT);
	}

	// Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
	// Turn it into a sign bit compare that produces a k-register. This avoids
	// a trip through a GPR.
	if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
	VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
	isPowerOf2_32(VT.getVectorNumElements())) {
	unsigned NumElts = VT.getVectorNumElements();
	SDValue Src = N0;

	// Peek through truncate.
	if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
	Src = N0.getOperand(0);

	if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
	SDValue MovmskIn = Src.getOperand(0);
	MVT MovmskVT = MovmskIn.getSimpleValueType();
	unsigned MovMskElts = MovmskVT.getVectorNumElements();

	// We allow extra bits of the movmsk to be used since they are known zero.
	// We can't convert a VPMOVMSKB without avx512bw.
	if (MovMskElts <= NumElts &&
	(Subtarget.hasBWI() \|\| MovmskVT.getVectorElementType() != MVT::i8)) {
	// Compare the MOVMSK input, viewed as an integer vector, against zero
	// with SETLT to test each element's sign bit.
	EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
	MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
	SDLoc dl(N);
	MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
	SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
	DAG.getConstant(0, dl, IntVT), ISD::SETLT);
	if (EVT(CmpVT) == VT)
	return Cmp;

	// Pad with zeroes up to original VT to replace the zeroes that were
	// being used from the MOVMSK.
	unsigned NumConcats = NumElts / MovMskElts;
	SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
	Ops[0] = Cmp;
	return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
	}
	}
	}

	// Try to remove bitcasts from input and output of mask arithmetic to
	// remove GPR<->K-register crossings.
	if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
	return V;

	// Convert a bitcasted integer logic operation that has one bitcasted
	// floating-point operand into a floating-point logic operation. This may
	// create a load of a constant, but that is cheaper than materializing the
	// constant in an integer register and transferring it to an SSE register or
	// transferring the SSE operand to integer register and back.
	unsigned FPOpcode;
	switch (N0.getOpcode()) {
	case ISD::AND: FPOpcode = X86ISD::FAND; break;
	case ISD::OR: FPOpcode = X86ISD::FOR; break;
	case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
	default: return SDValue();
	}

	// Only f32 (with SSE1) and f64 (with SSE2) results are handled.
	if (!((Subtarget.hasSSE1() && VT == MVT::f32) \|\|
	(Subtarget.hasSSE2() && VT == MVT::f64)))
	return SDValue();

	SDValue LogicOp0 = N0.getOperand(0);
	SDValue LogicOp1 = N0.getOperand(1);
	SDLoc DL0(N0);

	// bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
	if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
	LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
	!isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
	SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
	return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
	}
	// bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
	// Note that the new node is built with Y as its first operand.
	if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
	LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
	!isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
	SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
	return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
	}

	return SDValue();
	}

	// Given a ABS node, detect the following pattern:
	// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
	// This is useful as it is the input into a SAD pattern.
	static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
	SDValue AbsOp1 = Abs->getOperand(0);
	if (AbsOp1.getOpcode() != ISD::SUB)
	return false;

	Op0 = AbsOp1.getOperand(0);
	Op1 = AbsOp1.getOperand(1);

	// Check if the operands of the sub are zero-extended from vectors of i8.
	if (Op0.getOpcode() != ISD::ZERO_EXTEND \|\|
	Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 \|\|
	Op1.getOpcode() != ISD::ZERO_EXTEND \|\|
	Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
	return false;

	return true;
	}

	// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
	// to these zexts. Inputs narrower than 128 bits are padded with zero elements
	// up to 128 bits, and the PSADBW itself is emitted through SplitOpsAndApply.
	static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
	                            const SDValue &Zext1, const SDLoc &DL,
	                            const X86Subtarget &Subtarget) {
	  SDValue In0 = Zext0.getOperand(0);
	  SDValue In1 = Zext1.getOperand(0);

	  // Find the appropriate width for the PSADBW: at least 128 bits.
	  EVT InVT = In0.getValueType();
	  const unsigned InBits = static_cast<unsigned>(InVT.getSizeInBits());
	  const unsigned RegSize = std::max(128u, InBits);
	  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);

	  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
	  // fill in the missing vector elements with 0.
	  const unsigned NumConcat = RegSize / InBits;
	  SmallVector<SDValue, 16> Parts(NumConcat, DAG.getConstant(0, DL, InVT));
	  auto PadWithZeros = [&](SDValue In) {
	    Parts[0] = In;
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Parts);
	  };
	  SDValue SadOp0 = PadWithZeros(In0);
	  SDValue SadOp1 = PadWithZeros(In1);

	  // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
	  auto BuildSAD = [](SelectionDAG &DAG, const SDLoc &DL,
	                     ArrayRef<SDValue> Ops) {
	    // One i64 result element per 64 bits of input.
	    MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
	    return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
	  };
	  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
	  return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
	                          BuildSAD);
	}

	// Attempt to replace a min/max v8i16/v16i8 horizontal reduction, whose scalar
	// result is read by Extract, with PHMINPOSUW. Requires SSE4.1, an i16 or i8
	// result, and a source vector whose size is a multiple of 128 bits; returns
	// an empty SDValue otherwise.
	static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
	                                             const X86Subtarget &Subtarget) {
	  // Bail without SSE41.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  EVT ExtractVT = Extract->getValueType(0);
	  if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
	    return SDValue();

	  // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
	  ISD::NodeType BinOp;
	  SDValue Src = DAG.matchBinOpReduction(
	      Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
	  if (!Src)
	    return SDValue();

	  // The element type must match the extracted type, and the vector must be
	  // made of whole 128-bit pieces.
	  EVT SrcVT = Src.getValueType();
	  if (SrcVT.getScalarType() != ExtractVT)
	    return SDValue();
	  if ((SrcVT.getSizeInBits() % 128) != 0)
	    return SDValue();

	  SDLoc DL(Extract);
	  SDValue MinPos = Src;

	  // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
	  while (SrcVT.getSizeInBits() > 128) {
	    SDValue Lo, Hi;
	    std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
	    SrcVT = Lo.getValueType();
	    MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
	  }
	  // ExtractVT is i16 or i8 here, so exactly one of the two types is expected.
	  assert((ExtractVT == MVT::i16 ? SrcVT == MVT::v8i16
	                                : SrcVT == MVT::v16i8) &&
	         "Unexpected value type");

	  // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
	  // to flip the value accordingly. UMIN leaves Mask empty.
	  const unsigned EltBits = ExtractVT.getSizeInBits();
	  SDValue Mask;
	  switch (BinOp) {
	  case ISD::SMAX:
	    Mask = DAG.getConstant(APInt::getSignedMaxValue(EltBits), DL, SrcVT);
	    break;
	  case ISD::SMIN:
	    Mask = DAG.getConstant(APInt::getSignedMinValue(EltBits), DL, SrcVT);
	    break;
	  case ISD::UMAX:
	    Mask = DAG.getConstant(APInt::getAllOnesValue(EltBits), DL, SrcVT);
	    break;
	  default:
	    break;
	  }

	  // XORs V with the mask when one is needed; applied before and after the
	  // PHMINPOS.
	  auto FlipWithMask = [&](SDValue V) {
	    return Mask ? DAG.getNode(ISD::XOR, DL, SrcVT, Mask, V) : V;
	  };
	  MinPos = FlipWithMask(MinPos);

	  // For v16i8 cases we need to perform UMIN on pairs of byte elements,
	  // shuffling each upper element down and insert zeros. This means that the
	  // v16i8 UMIN will leave the upper element as zero, performing zero-extension
	  // ready for the PHMINPOS.
	  if (ExtractVT == MVT::i8) {
	    SDValue Zero = DAG.getConstant(0, DL, MVT::v16i8);
	    SDValue Upper = DAG.getVectorShuffle(
	        SrcVT, DL, MinPos, Zero,
	        {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
	    MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
	  }

	  // Perform the PHMINPOS on a v8i16 vector, then return to the source type.
	  SDValue AsV8I16 = DAG.getBitcast(MVT::v8i16, MinPos);
	  SDValue PHMin = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, AsV8I16);
	  MinPos = DAG.getBitcast(SrcVT, PHMin);

	  MinPos = FlipWithMask(MinPos);

	  // The reduced value is read from element 0.
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
	                     DAG.getIntPtrConstant(0, DL));
	}

	// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
	static SDValue combineHorizontalPredicateResult(SDNode *Extract,
	SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// Bail without SSE2.
	if (!Subtarget.hasSSE2())
	return SDValue();

	EVT ExtractVT = Extract->getValueType(0);
	unsigned BitWidth = ExtractVT.getSizeInBits();
	if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
	ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
	return SDValue();

	// Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
	ISD::NodeType BinOp;
	SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
	if (!Match && ExtractVT == MVT::i1)
	Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
	if (!Match)
	return SDValue();

	// EXTRACT_VECTOR_ELT can require implicit extension of the vector element
	// which we can't support here for now.
	if (Match.getScalarValueSizeInBits() != BitWidth)
	return SDValue();

	SDValue Movmsk;
	SDLoc DL(Extract);
	EVT MatchVT = Match.getValueType();
	unsigned NumElts = MatchVT.getVectorNumElements();
	unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	if (ExtractVT == MVT::i1) {
	// Special case for (pre-legalization) vXi1 reductions.
	if (NumElts > 64 \|\| !isPowerOf2_32(NumElts))
	return SDValue();
	if (TLI.isTypeLegal(MatchVT)) {
	// If this is a legal AVX512 predicate type then we can just bitcast.
	EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
	Movmsk = DAG.getBitcast(MovmskVT, Match);
	} else {
	// For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
	// PCMPEQQ (SSE41+), use PCMPEQD instead.
	if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
	Match.getOpcode() == ISD::SETCC &&
	ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
	cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
	ISD::CondCode::SETEQ) {
	SDValue Vec = Match.getOperand(0);
	if (Vec.getValueType().getScalarType() == MVT::i64 &&
	(2 * NumElts) <= MaxElts) {
	NumElts *= 2;
	EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
	MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
	Match = DAG.getSetCC(
	DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
	DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
	}
	}

	// Use combineBitcastvxi1 to create the MOVMSK.
	while (NumElts > MaxElts) {
	SDValue Lo, Hi;
	std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
	Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
	NumElts /= 2;
	}
	EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
	Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
	}
	if (!Movmsk)
	return SDValue();
	Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
	} else {
	// FIXME: Better handling of k-registers or 512-bit vectors?
	unsigned MatchSizeInBits = Match.getValueSizeInBits();
	if (!(MatchSizeInBits == 128 \|\|
	(MatchSizeInBits == 256 && Subtarget.hasAVX())))
	return SDValue();

	// Make sure this isn't a vector of 1 element. The perf win from using
	// MOVMSK diminishes with less elements in the reduction, but it is
	// generally better to get the comparison over to the GPRs as soon as
	// possible to reduce the number of vector ops.
	if (Match.getValueType().getVectorNumElements() < 2)
	return SDValue();

	// Check that we are extracting a reduction of all sign bits.
	if (DAG.ComputeNumSignBits(Match) != BitWidth)
	return SDValue();

	if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
	SDValue Lo, Hi;
	std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
	Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
	MatchSizeInBits = Match.getValueSizeInBits();
	}

	// For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
	MVT MaskSrcVT;
	if (64 == BitWidth \|\| 32 == BitWidth)
	MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
	MatchSizeInBits / BitWidth);
	else
	MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);

	SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
	Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
	NumElts = MaskSrcVT.getVectorNumElements();
	}
	assert((NumElts <= 32 \|\| NumElts == 64) &&
	"Not expecting more than 64 elements");

	MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
	if (BinOp == ISD::XOR) {
	// parity -> (AND (CTPOP(MOVMSK X)), 1)
	SDValue Mask = DAG.getConstant(1, DL, CmpVT);
	SDValue Result = DAG.getNode(ISD::CTPOP, DL, CmpVT, Movmsk);
	Result = DAG.getNode(ISD::AND, DL, CmpVT, Result, Mask);
	return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
	}

	SDValue CmpC;
	ISD::CondCode CondCode;
	if (BinOp == ISD::OR) {
	// any_of -> MOVMSK != 0
	CmpC = DAG.getConstant(0, DL, CmpVT);
	CondCode = ISD::CondCode::SETNE;
	} else {
	// all_of -> MOVMSK == ((1 << NumElts) - 1)
	CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
	DL, CmpVT);
	CondCode = ISD::CondCode::SETEQ;
	}

	// The setcc produces an i8 of 0/1, so extend that to the result width and
	// negate to get the final 0/-1 mask value.
	EVT SetccVT =
	TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
	SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
	SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
	SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
	return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
	}

	/// Try to replace an ADD reduction rooted at \p Extract with a PSADBW-based
	/// sum-of-absolute-differences sequence.
	///
	/// The extract must produce i32 or i64 from a vector with a power-of-2 number
	/// of elements, the reduction matched by matchBinOpReduction must be an ADD
	/// pyramid, and its root (after skipping one optional any/zero/sign extend)
	/// must be an ISD::ABS node accepted by detectZextAbsDiff.
	///
	/// \returns the replacement scalar, or an empty SDValue if the pattern does
	/// not match.
	static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  // PSADBW is only supported on SSE2 and up.
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  EVT ExtractVT = Extract->getValueType(0);
	  // Verify the type we're extracting is either i32 or i64.
	  // FIXME: Could support other types, but this is what we have coverage for.
	  if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
	    return SDValue();

	  EVT VT = Extract->getOperand(0).getValueType();
	  if (!isPowerOf2_32(VT.getVectorNumElements()))
	    return SDValue();

	  // Match shuffle + add pyramid.
	  ISD::NodeType BinOp;
	  SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});

	  // The operand is expected to be zero extended from i8
	  // (verified in detectZextAbsDiff).
	  // In order to convert to i64 and above, additional any/zero/sign
	  // extend is expected.
	  // The zero extend from 32 bit has no mathematical effect on the result.
	  // Also the sign extend is basically zero extend
	  // (extends the sign bit which is zero).
	  // So it is correct to skip the sign/zero extend instruction.
	  if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
	               Root.getOpcode() == ISD::ZERO_EXTEND ||
	               Root.getOpcode() == ISD::ANY_EXTEND))
	    Root = Root.getOperand(0);

	  // If there was a match, we want Root to be an ABS node that is the root of
	  // an abs-diff pattern.
	  if (!Root || Root.getOpcode() != ISD::ABS)
	    return SDValue();

	  // Check whether we have an abs-diff pattern rooted at the ABS node; on
	  // success Zext0/Zext1 receive the two operands to hand to PSADBW.
	  SDValue Zext0, Zext1;
	  if (!detectZextAbsDiff(Root, Zext0, Zext1))
	    return SDValue();

	  // Create the SAD instruction.
	  SDLoc DL(Extract);
	  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);

	  // If the original vector was wider than 8 elements, sum over the results
	  // in the SAD vector.
	  unsigned Stages = Log2_32(VT.getVectorNumElements());
	  EVT SadVT = SAD.getValueType();
	  if (Stages > 3) {
	    unsigned SadElems = SadVT.getVectorNumElements();

	    // Each iteration moves the upper half of the still-live low elements down
	    // (all other mask lanes are undef) and adds it onto the lower half.
	    for (unsigned i = Stages - 3; i > 0; --i) {
	      SmallVector<int, 16> Mask(SadElems, -1);
	      for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
	        Mask[j] = MaskEnd + j;

	      SDValue Shuffle =
	          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
	      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
	    }
	  }

	  unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
	  // Return the lowest ExtractSizeInBits bits.
	  EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
	                               SadVT.getSizeInBits() / ExtractSizeInBits);
	  SAD = DAG.getBitcast(ResVT, SAD);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
	                     Extract->getOperand(1));
	}

	/// Attempt to peek through a target shuffle and extract the scalar from the
	/// source.
	///
	/// Bails out while \p DCI reports that we are still before operation
	/// legalization, for i1 element vectors, and for non-constant or out-of-range
	/// indices. Before resolving a real shuffle mask, a few simpler sources are
	/// handled directly: VBROADCAST of a scalar integer, a single-use
	/// VBROADCAST_LOAD, SCALAR_TO_VECTOR of an integer, and element 0 of a TRUNCATE
	/// that produces a 128-bit vector.
	///
	/// \returns the replacement value, or an empty SDValue if nothing applies.
	static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
	                                         TargetLowering::DAGCombinerInfo &DCI,
	                                         const X86Subtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDLoc dl(N);
	  SDValue Src = N->getOperand(0);
	  SDValue Idx = N->getOperand(1);

	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  EVT SrcSVT = SrcVT.getVectorElementType();
	  unsigned SrcEltBits = SrcSVT.getSizeInBits();
	  unsigned NumSrcElts = SrcVT.getVectorNumElements();

	  // Don't attempt this for boolean mask vectors or unknown extraction indices.
	  if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
	    return SDValue();

	  const APInt &IdxC = N->getConstantOperandAPInt(1);
	  if (IdxC.uge(NumSrcElts))
	    return SDValue();

	  SDValue SrcBC = peekThroughBitcasts(Src);

	  // Handle extract(bitcast(broadcast(scalar_value))).
	  if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
	    SDValue SrcOp = SrcBC.getOperand(0);
	    EVT SrcOpVT = SrcOp.getValueType();
	    if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
	        (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
	      // Scale is how many source elements one broadcast scalar spans; Offset
	      // is the bit position of the requested element inside that scalar.
	      unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
	      unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
	      // TODO support non-zero offsets.
	      if (Offset == 0) {
	        SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
	        SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
	        return SrcOp;
	      }
	    }
	  }

	  // If we're extracting a single element from a broadcast load and there are
	  // no other users, just create a single load.
	  if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
	    auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
	    unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
	    if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
	        VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
	      SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
	                                 MemIntr->getBasePtr(),
	                                 MemIntr->getPointerInfo(),
	                                 MemIntr->getOriginalAlign(),
	                                 MemIntr->getMemOperand()->getFlags());
	      // Redirect users of the broadcast load's chain result to the new load.
	      DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
	      return Load;
	    }
	  }

	  // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
	  // TODO: Move to DAGCombine?
	  if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
	      SrcBC.getValueType().isInteger() &&
	      (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
	      SrcBC.getScalarValueSizeInBits() ==
	          SrcBC.getOperand(0).getValueSizeInBits()) {
	    unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
	    // Only indices that land inside the inserted scalar are handled.
	    if (IdxC.ult(Scale)) {
	      unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
	      SDValue Scl = SrcBC.getOperand(0);
	      EVT SclVT = Scl.getValueType();
	      if (Offset) {
	        Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
	                          DAG.getShiftAmountConstant(Offset, SclVT, dl));
	      }
	      Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
	      Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
	      return Scl;
	    }
	  }

	  // Handle extract(truncate(x)) for 0'th index.
	  // TODO: Treat this as a faux shuffle?
	  // TODO: When can we use this for general indices?
	  if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && IdxC == 0) {
	    Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
	    Src = DAG.getBitcast(SrcVT, Src);
	    return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx);
	  }

	  // Resolve the target shuffle inputs and mask.
	  SmallVector<int, 16> Mask;
	  SmallVector<SDValue, 2> Ops;
	  if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
	    return SDValue();

	  // Shuffle inputs must be the same size as the result.
	  if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
	        return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
	      }))
	    return SDValue();

	  // Attempt to narrow/widen the shuffle mask to the correct size.
	  if (Mask.size() != NumSrcElts) {
	    if ((NumSrcElts % Mask.size()) == 0) {
	      SmallVector<int, 16> ScaledMask;
	      int Scale = NumSrcElts / Mask.size();
	      narrowShuffleMaskElts(Scale, Mask, ScaledMask);
	      Mask = std::move(ScaledMask);
	    } else if ((Mask.size() % NumSrcElts) == 0) {
	      // Simplify Mask based on demanded element: every mask entry outside the
	      // [Lo, Hi) window covering the extracted element becomes undef.
	      int ExtractIdx = (int)N->getConstantOperandVal(1);
	      int Scale = Mask.size() / NumSrcElts;
	      int Lo = Scale * ExtractIdx;
	      int Hi = Scale * (ExtractIdx + 1);
	      for (int i = 0, e = (int)Mask.size(); i != e; ++i)
	        if (i < Lo || Hi <= i)
	          Mask[i] = SM_SentinelUndef;

	      SmallVector<int, 16> WidenedMask;
	      while (Mask.size() > NumSrcElts &&
	             canWidenShuffleElements(Mask, WidenedMask))
	        Mask = std::move(WidenedMask);
	      // TODO - investigate support for wider shuffle masks with known upper
	      // undef/zero elements for implicit zero-extension.
	    }
	  }

	  // Check if narrowing/widening failed.
	  if (Mask.size() != NumSrcElts)
	    return SDValue();

	  int SrcIdx = Mask[IdxC.getZExtValue()];

	  // If the shuffle source element is undef/zero then we can just accept it.
	  if (SrcIdx == SM_SentinelUndef)
	    return DAG.getUNDEF(VT);

	  if (SrcIdx == SM_SentinelZero)
	    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
	                                : DAG.getConstant(0, dl, VT);

	  // Split the mask value into (which shuffle input, element within it).
	  SDValue SrcOp = Ops[SrcIdx / Mask.size()];
	  SrcIdx = SrcIdx % Mask.size();

	  // We can only extract other elements from 128-bit vectors and in certain
	  // circumstances, depending on SSE-level.
	  // TODO: Investigate using extract_subvector for larger vectors.
	  // TODO: Investigate float/double extraction if it will be just stored.
	  if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
	      ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
	    assert(SrcSVT == VT && "Unexpected extraction type");
	    SrcOp = DAG.getBitcast(SrcVT, SrcOp);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
	                       DAG.getIntPtrConstant(SrcIdx, dl));
	  }

	  if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
	      (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
	    assert(VT.getSizeInBits() >= SrcEltBits && "Unexpected extraction type");
	    unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
	    SrcOp = DAG.getBitcast(SrcVT, SrcOp);
	    // The PEXTR node is created with an i32 result, then resized to VT.
	    SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
	                                DAG.getIntPtrConstant(SrcIdx, dl));
	    return DAG.getZExtOrTrunc(ExtOp, dl, VT);
	  }

	  return SDValue();
	}

	/// Extracting a scalar FP value from vector element 0 is free, so extract each
	/// operand first, then perform the math as a scalar op.
	///
	/// Applies only when the vector has a single use, the index is the constant 0
	/// and the vector's scalar type equals the extracted type. Handles, in order:
	/// a SETCC on f32/f64 operands producing i1, a VSELECT whose condition is an
	/// i1-element SETCC, and the FP opcodes listed in the switch below.
	///
	/// \returns the scalarized node, or an empty SDValue if nothing applies.
	static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
	  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
	  SDValue Vec = ExtElt->getOperand(0);
	  SDValue Index = ExtElt->getOperand(1);
	  EVT VT = ExtElt->getValueType(0);
	  EVT VecVT = Vec.getValueType();

	  // TODO: If this is a unary/expensive/expand op, allow extraction from a
	  // non-zero element because the shuffle+scalar op will be cheaper?
	  if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
	    return SDValue();

	  // Vector FP compares don't fit the pattern of FP math ops (propagate, not
	  // extract, the condition code), so deal with those as a special-case.
	  if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
	    EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
	    if (OpVT != MVT::f32 && OpVT != MVT::f64)
	      return SDValue();

	    // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
	    SDLoc DL(ExtElt);
	    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
	                               Vec.getOperand(0), Index);
	    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
	                               Vec.getOperand(1), Index);
	    return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
	  }

	  if (VT != MVT::f32 && VT != MVT::f64)
	    return SDValue();

	  // Vector FP selects don't fit the pattern of FP math ops (because the
	  // condition has a different type and we have to change the opcode), so deal
	  // with those here.
	  // FIXME: This is restricted to pre type legalization by ensuring the setcc
	  // has i1 elements. If we loosen this we need to convert vector bool to a
	  // scalar bool.
	  if (Vec.getOpcode() == ISD::VSELECT &&
	      Vec.getOperand(0).getOpcode() == ISD::SETCC &&
	      Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
	      Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
	    // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
	    SDLoc DL(ExtElt);
	    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
	                               Vec.getOperand(0).getValueType().getScalarType(),
	                               Vec.getOperand(0), Index);
	    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
	                               Vec.getOperand(1), Index);
	    SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
	                               Vec.getOperand(2), Index);
	    return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
	  }

	  // TODO: This switch could include FNEG and the x86-specific FP logic ops
	  // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
	  // missed load folding and fma+fneg combining.
	  switch (Vec.getOpcode()) {
	  case ISD::FMA: // Begin 3 operands
	  case ISD::FMAD:
	  case ISD::FADD: // Begin 2 operands
	  case ISD::FSUB:
	  case ISD::FMUL:
	  case ISD::FDIV:
	  case ISD::FREM:
	  case ISD::FCOPYSIGN:
	  case ISD::FMINNUM:
	  case ISD::FMAXNUM:
	  case ISD::FMINNUM_IEEE:
	  case ISD::FMAXNUM_IEEE:
	  case ISD::FMAXIMUM:
	  case ISD::FMINIMUM:
	  case X86ISD::FMAX:
	  case X86ISD::FMIN:
	  case ISD::FABS: // Begin 1 operand
	  case ISD::FSQRT:
	  case ISD::FRINT:
	  case ISD::FCEIL:
	  case ISD::FTRUNC:
	  case ISD::FNEARBYINT:
	  case ISD::FROUND:
	  case ISD::FFLOOR:
	  case X86ISD::FRCP:
	  case X86ISD::FRSQRT: {
	    // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
	    SDLoc DL(ExtElt);
	    SmallVector<SDValue, 4> ExtOps;
	    for (SDValue Op : Vec->ops())
	      ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
	    return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
	  }
	  default:
	    return SDValue();
	  }
	  llvm_unreachable("All opcodes should return within switch");
	}

	/// Try to convert a vector reduction sequence composed of binops and shuffles
	/// into horizontal ops.
	///
	/// Matches ADD/FADD reductions ending in an extract of element 0. i8-element
	/// reductions are summed with PSADBW against a zero vector; other element
	/// types use HADD/FHADD when shouldUseHorizontalOp allows it and the subtarget
	/// has the required SSE3/SSSE3 support.
	///
	/// \returns the replacement extract, or an empty SDValue if nothing applies.
	static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
	                                            const X86Subtarget &Subtarget) {
	  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");

	  // We need at least SSE2 to do anything here.
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  ISD::NodeType Opc;
	  SDValue Rdx =
	      DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
	  if (!Rdx)
	    return SDValue();

	  SDValue Index = ExtElt->getOperand(1);
	  assert(isNullConstant(Index) &&
	         "Reduction doesn't end in an extract from index 0");

	  EVT VT = ExtElt->getValueType(0);
	  EVT VecVT = Rdx.getValueType();
	  if (VecVT.getScalarType() != VT)
	    return SDValue();

	  SDLoc DL(ExtElt);

	  // vXi8 reduction - sub 128-bit vector.
	  if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
	    if (VecVT == MVT::v4i8) {
	      // Pad with zero.
	      if (Subtarget.hasSSE41()) {
	        Rdx = DAG.getBitcast(MVT::i32, Rdx);
	        Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
	                          DAG.getConstant(0, DL, MVT::v4i32), Rdx,
	                          DAG.getIntPtrConstant(0, DL));
	        Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
	      } else {
	        Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
	                          DAG.getConstant(0, DL, VecVT));
	      }
	    }
	    // Reached for an original v8i8 input and for the non-SSE41 v4i8 path above.
	    if (Rdx.getValueType() == MVT::v8i8) {
	      // Pad with undef.
	      Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
	                        DAG.getUNDEF(MVT::v8i8));
	    }
	    Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
	                      DAG.getConstant(0, DL, MVT::v16i8));
	    Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
	  }

	  // Must be a >=128-bit vector with pow2 elements.
	  if ((VecVT.getSizeInBits() % 128) != 0 ||
	      !isPowerOf2_32(VecVT.getVectorNumElements()))
	    return SDValue();

	  // vXi8 reduction - sum lo/hi halves then use PSADBW.
	  if (VT == MVT::i8) {
	    while (Rdx.getValueSizeInBits() > 128) {
	      SDValue Lo, Hi;
	      std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
	      VecVT = Lo.getValueType();
	      Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
	    }
	    assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");

	    // Add the upper 8 bytes onto the lower 8 so a single PSADBW lane holds
	    // the complete sum.
	    SDValue Hi = DAG.getVectorShuffle(
	        MVT::v16i8, DL, Rdx, Rdx,
	        {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
	    Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
	    Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
	                      getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
	    Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
	  }

	  // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
	  if (!shouldUseHorizontalOp(true, DAG, Subtarget))
	    return SDValue();

	  unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;

	  // 256-bit horizontal instructions operate on 128-bit chunks rather than
	  // across the whole vector, so we need an extract + hop preliminary stage.
	  // This is the only step where the operands of the hop are not the same value.
	  // TODO: We could extend this to handle 512-bit or even longer vectors.
	  if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
	      ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
	    unsigned NumElts = VecVT.getVectorNumElements();
	    SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
	    SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
	    Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
	    VecVT = Rdx.getValueType();
	  }
	  if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
	      !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
	    return SDValue();

	  // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
	  unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
	  for (unsigned i = 0; i != ReductionSteps; ++i)
	    Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);

	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
	}

	/// Combine a vector element extract node.
	///
	/// \p N is either ISD::EXTRACT_VECTOR_ELT or another extract opcode (handled
	/// via IsPextr; the comments below refer to X86ISD::PEXTRW/PEXTRB). The folds
	/// are tried in this order: extract through a shuffle, out-of-range constant
	/// index, integer constant folding, PEXTR-specific simplifications, MMX
	/// bitcast extractions, the SAD pattern, horizontal predicate and min/max
	/// reductions, horizontal add reductions, FP scalarization, and finally a
	/// shared MOVMSK for multiple constant-index i1 extracts of the same vector.
	///
	/// \returns the replacement value, SDValue(N, 0) when N or its users were
	/// updated in place, or an empty SDValue if nothing applies.
	static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       const X86Subtarget &Subtarget) {
	  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
	    return NewOp;

	  SDValue InputVector = N->getOperand(0);
	  SDValue EltIdx = N->getOperand(1);
	  auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);

	  EVT SrcVT = InputVector.getValueType();
	  EVT VT = N->getValueType(0);
	  SDLoc dl(InputVector);
	  bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
	  unsigned NumSrcElts = SrcVT.getVectorNumElements();

	  // An out-of-range constant index folds to zero for the PEXTR forms and to
	  // undef for the generic extract.
	  if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
	    return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);

	  // Integer Constant Folding.
	  if (CIdx && VT.isInteger()) {
	    APInt UndefVecElts;
	    SmallVector<APInt, 16> EltBits;
	    unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
	    if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
	                                      EltBits, true, false)) {
	      uint64_t Idx = CIdx->getZExtValue();
	      if (UndefVecElts[Idx])
	        return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
	      return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
	                             dl, VT);
	    }
	  }

	  if (IsPextr) {
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    if (TLI.SimplifyDemandedBits(
	            SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
	      return SDValue(N, 0);

	    // PEXTR(PINSR(v, s, c), c) -> s (with implicit zext handling).
	    if ((InputVector.getOpcode() == X86ISD::PINSRB ||
	         InputVector.getOpcode() == X86ISD::PINSRW) &&
	        InputVector.getOperand(2) == EltIdx) {
	      assert(SrcVT == InputVector.getOperand(0).getValueType() &&
	             "Vector type mismatch");
	      SDValue Scl = InputVector.getOperand(1);
	      Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
	      return DAG.getZExtOrTrunc(Scl, dl, VT);
	    }

	    // TODO - Remove this once we can handle the implicit zero-extension of
	    // X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
	    // combineBasicSADPattern.
	    return SDValue();
	  }

	  // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
	  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
	      VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
	    SDValue MMXSrc = InputVector.getOperand(0);

	    // The bitcast source is a direct mmx result.
	    if (MMXSrc.getValueType() == MVT::x86mmx)
	      return DAG.getBitcast(VT, InputVector);
	  }

	  // Detect mmx to i32 conversion through a v2i32 elt extract.
	  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
	      VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
	    SDValue MMXSrc = InputVector.getOperand(0);

	    // The bitcast source is a direct mmx result.
	    if (MMXSrc.getValueType() == MVT::x86mmx)
	      return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
	  }

	  // Check whether this extract is the root of a sum of absolute differences
	  // pattern. This has to be done here because we really want it to happen
	  // pre-legalization,
	  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
	    return SAD;

	  // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
	  if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
	    return Cmp;

	  // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
	  if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
	    return MinMax;

	  if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
	    return V;

	  if (SDValue V = scalarizeExtEltFP(N, DAG))
	    return V;

	  // Attempt to extract a i1 element by using MOVMSK to extract the signbits
	  // and then testing the relevant element.
	  //
	  // Note that we only combine extracts on the same result number, i.e.
	  //   t0 = merge_values a0, a1, a2, a3
	  //   i1 = extract_vector_elt t0, Constant:i64<2>
	  //   i1 = extract_vector_elt t0, Constant:i64<3>
	  // but not
	  //   i1 = extract_vector_elt t0:1, Constant:i64<2>
	  // since the latter would need its own MOVMSK.
	  if (CIdx && SrcVT.getScalarType() == MVT::i1) {
	    SmallVector<SDNode *, 16> BoolExtracts;
	    unsigned ResNo = InputVector.getResNo();
	    // Accepts a use only if it is a constant-index i1 extract of the same
	    // result number, recording it in BoolExtracts as a side effect.
	    auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
	      if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	          isa<ConstantSDNode>(Use->getOperand(1)) &&
	          Use->getOperand(0).getResNo() == ResNo &&
	          Use->getValueType(0) == MVT::i1) {
	        BoolExtracts.push_back(Use);
	        return true;
	      }
	      return false;
	    };
	    if (all_of(InputVector->uses(), IsBoolExtract) &&
	        BoolExtracts.size() > 1) {
	      EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
	      if (SDValue BC =
	              combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
	        for (SDNode *Use : BoolExtracts) {
	          // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
	          unsigned MaskIdx = Use->getConstantOperandVal(1);
	          APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
	          SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
	          SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
	          Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
	          DCI.CombineTo(Use, Res);
	        }
	        return SDValue(N, 0);
	      }
	    }
	  }

	  return SDValue();
	}

	/// If a vector select has an operand that is -1 or 0, try to simplify the
	/// select to a bitwise logic operation.
	///
	/// Only ISD::VSELECT nodes are handled. The condition must have elements of
	/// the same width as the select elements and must be a sign splat (every bit
	/// of an element equals its sign bit). When neither the true value is all-ones
	/// nor the false value is all-zeros, a single-use SETCC condition may be
	/// inverted and the arms swapped to expose one of the foldable forms.
	///
	/// \returns the logic-op replacement, or an empty SDValue if nothing applies.
	/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
	static SDValue
	combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const X86Subtarget &Subtarget) {
	  SDValue Cond = N->getOperand(0);
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  EVT VT = LHS.getValueType();
	  EVT CondVT = Cond.getValueType();
	  SDLoc DL(N);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  if (N->getOpcode() != ISD::VSELECT)
	    return SDValue();

	  assert(CondVT.isVector() && "Vector select expects a vector selector!");

	  // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
	  // TODO: Can we assert that both operands are not zeros (because that should
	  //       get simplified at node creation time)?
	  bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
	  bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

	  // If both inputs are 0/undef, create a complete zero vector.
	  // FIXME: As noted above this should be handled by DAGCombiner/getNode.
	  if (TValIsAllZeros && FValIsAllZeros) {
	    if (VT.isFloatingPoint())
	      return DAG.getConstantFP(0.0, DL, VT);
	    return DAG.getConstant(0, DL, VT);
	  }

	  // To use the condition operand as a bitwise mask, it must have elements that
	  // are the same size as the select elements. Ie, the condition operand must
	  // have already been promoted from the IR select condition type <N x i1>.
	  // Don't check if the types themselves are equal because that excludes
	  // vector floating-point selects.
	  if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
	    return SDValue();

	  // Try to invert the condition if true value is not all 1s and false value is
	  // not all 0s. Only do this if the condition has one use.
	  bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
	  if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
	      // Check if the selector will be produced by CMPP*/PCMP*.
	      Cond.getOpcode() == ISD::SETCC &&
	      // Check if SETCC has already been promoted.
	      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
	          CondVT) {
	    bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());

	    if (TValIsAllZeros || FValIsAllOnes) {
	      SDValue CC = Cond.getOperand(2);
	      ISD::CondCode NewCC = ISD::getSetCCInverse(
	          cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
	      Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
	                          NewCC);
	      // The arms trade places, so the cached true/false flags do as well.
	      std::swap(LHS, RHS);
	      TValIsAllOnes = FValIsAllOnes;
	      FValIsAllZeros = TValIsAllZeros;
	    }
	  }

	  // Cond value must be 'sign splat' to be converted to a logical op.
	  if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
	    return SDValue();

	  // vselect Cond, 111..., 000... -> Cond
	  if (TValIsAllOnes && FValIsAllZeros)
	    return DAG.getBitcast(VT, Cond);

	  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
	    return SDValue();

	  // vselect Cond, 111..., X -> or Cond, X
	  if (TValIsAllOnes) {
	    SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
	    SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
	    return DAG.getBitcast(VT, Or);
	  }

	  // vselect Cond, X, 000... -> and Cond, X
	  if (FValIsAllZeros) {
	    SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
	    SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
	    return DAG.getBitcast(VT, And);
	  }

	  // vselect Cond, 000..., X -> andn Cond, X
	  if (TValIsAllZeros) {
	    // The ANDNP node is built on a vXi64 view of both operands.
	    MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
	    SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
	    SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
	    SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
	    return DAG.getBitcast(VT, AndN);
	  }

	  return SDValue();
	}

	/// If both arms of a vector select are concatenated vectors, split the select,
	/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
	/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
	/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
	///
	/// Applies to ISD::VSELECT and X86ISD::BLENDV nodes with a 256-bit result whose
	/// true and false operands each have a single use and are recognized by
	/// collectConcatOps.
	///
	/// \returns the narrowed select, or an empty SDValue if nothing applies.
	static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  unsigned Opcode = N->getOpcode();
	  if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
	    return SDValue();

	  // TODO: Split 512-bit vectors too?
	  EVT VT = N->getValueType(0);
	  if (!VT.is256BitVector())
	    return SDValue();

	  // TODO: Split as long as any 2 of the 3 operands are concatenated?
	  SDValue Cond = N->getOperand(0);
	  SDValue TVal = N->getOperand(1);
	  SDValue FVal = N->getOperand(2);
	  SmallVector<SDValue, 4> CatOpsT, CatOpsF;
	  if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
	      !collectConcatOps(TVal.getNode(), CatOpsT) ||
	      !collectConcatOps(FVal.getNode(), CatOpsF))
	    return SDValue();

	  // Rebuilds the same select opcode on each narrower operand triple; the
	  // result type is taken from the (split) true-value operand.
	  auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
	                            ArrayRef<SDValue> Ops) {
	    return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
	  };
	  return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
	                          makeBlend, /*CheckBWI*/ false);
	}

	/// Try to turn a select between two integer constants into arithmetic on the
	/// zero-extended condition:
	///   select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
	///
	/// Requires a legal result type, an i1 condition, a signed difference that
	/// does not overflow, and an absolute difference that is a power of 2 (or 3,
	/// 5 or 9 for i32/i64). When TC < FC the condition is inverted and the
	/// constants swapped so the multiplier stays positive.
	///
	/// \returns the arithmetic replacement, or an empty SDValue if nothing applies.
	static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
	  SDValue Cond = N->getOperand(0);
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  SDLoc DL(N);

	  auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
	  auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
	  if (!TrueC || !FalseC)
	    return SDValue();

	  // Don't do this for crazy integer types.
	  EVT VT = N->getValueType(0);
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  // We're going to use the condition bit in math or logic ops. We could allow
	  // this with a wider condition value (post-legalization it becomes an i8),
	  // but if nothing is creating selects that late, it doesn't matter.
	  if (Cond.getValueType() != MVT::i1)
	    return SDValue();

	  // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
	  // 3, 5, or 9 with i32/i64, so those get transformed too.
	  // TODO: For constants that overflow or do not differ by power-of-2 or small
	  // multiplier, convert to 'and' + 'add'.
	  const APInt &TrueVal = TrueC->getAPIntValue();
	  const APInt &FalseVal = FalseC->getAPIntValue();
	  bool OV;
	  APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
	  if (OV)
	    return SDValue();

	  APInt AbsDiff = Diff.abs();
	  if (AbsDiff.isPowerOf2() ||
	      ((VT == MVT::i32 || VT == MVT::i64) &&
	       (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {

	    // We need a positive multiplier constant for shift/LEA codegen. The 'not'
	    // of the condition can usually be folded into a compare predicate, but even
	    // without that, the sequence should be cheaper than a CMOV alternative.
	    if (TrueVal.slt(FalseVal)) {
	      Cond = DAG.getNOT(DL, Cond, MVT::i1);
	      // After the swap FalseC is the smaller constant, used as the base below.
	      std::swap(TrueC, FalseC);
	    }

	    // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
	    SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);

	    // Multiply condition by the difference if non-one.
	    if (!AbsDiff.isOneValue())
	      R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));

	    // Add the base if non-zero.
	    if (!FalseC->isNullValue())
	      R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));

	    return R;
	  }

	  return SDValue();
	}

	/// If this is a dynamic select (non-constant condition) and we can match
	/// this node with one of the variable blend instructions, restructure the
	/// condition so that blends can use the high (sign) bit of each element.
	/// This function will also call SimplifyDemandedBits on already created
	/// BLENDV to perform additional simplifications.
	///
	/// \param N         The VSELECT or X86ISD::BLENDV node; operand 0 is the
	///                  condition, operands 1 and 2 are the selected values.
	/// \param DAG       The DAG that owns \p N.
	/// \param DCI       Combiner state; used to query the legalization phase, to
	///                  queue rewritten users and to commit the simplification.
	/// \param Subtarget Used to check which blend instructions are available.
	/// \returns SDValue(N, 0) when the condition was simplified in place and its
	///          users were rewritten, a new BLENDV node when only a multi-use
	///          simplification of the condition was possible, or an empty
	///          SDValue when nothing changed.
	static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
	                                      TargetLowering::DAGCombinerInfo &DCI,
	                                      const X86Subtarget &Subtarget) {
	  SDValue Cond = N->getOperand(0);
	  if ((N->getOpcode() != ISD::VSELECT &&
	       N->getOpcode() != X86ISD::BLENDV) ||
	      ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
	    return SDValue();

	  // Don't optimize before the condition has been transformed to a legal type
	  // and don't ever optimize vector selects that map to AVX512 mask-registers.
	  unsigned BitWidth = Cond.getScalarValueSizeInBits();
	  if (BitWidth < 8 || BitWidth > 64)
	    return SDValue();

	  // We can only handle the cases where VSELECT is directly legal on the
	  // subtarget. We custom lower VSELECT nodes with constant conditions and
	  // this makes it hard to see whether a dynamic VSELECT will correctly
	  // lower, so we both check the operation's status and explicitly handle the
	  // cases where a dynamic blend will fail even though a constant-condition
	  // blend could be custom lowered.
	  // FIXME: We should find a better way to handle this class of problems.
	  // Potentially, we should combine constant-condition vselect nodes
	  // pre-legalization into shuffles and not mark as many types as custom
	  // lowered.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT VT = N->getValueType(0);
	  if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
	    return SDValue();
	  // FIXME: We don't support i16-element blends currently. We could and
	  // should support them by making *all* the bits in the condition be set
	  // rather than just the high bit and using an i8-element blend.
	  if (VT.getVectorElementType() == MVT::i16)
	    return SDValue();
	  // Dynamic blending was only available from SSE4.1 onward.
	  if (VT.is128BitVector() && !Subtarget.hasSSE41())
	    return SDValue();
	  // Byte blends are only available in AVX2
	  if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
	    return SDValue();
	  // There are no 512-bit blend instructions that use sign bits.
	  if (VT.is512BitVector())
	    return SDValue();

	  // True when every user of the condition is a VSELECT/BLENDV that consumes
	  // it as operand 0 (i.e. as the select condition and nothing else).
	  auto OnlyUsedAsSelectCond = [](SDValue Cond) {
	    for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
	         UI != UE; ++UI)
	      if ((UI->getOpcode() != ISD::VSELECT &&
	           UI->getOpcode() != X86ISD::BLENDV) ||
	          UI.getOperandNo() != 0)
	        return false;

	    return true;
	  };

	  // Only the sign bit of each condition element is demanded.
	  APInt DemandedBits(APInt::getSignMask(BitWidth));

	  if (OnlyUsedAsSelectCond(Cond)) {
	    KnownBits Known;
	    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	                                          !DCI.isBeforeLegalizeOps());
	    if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
	      return SDValue();

	    // If we changed the computation somewhere in the DAG, this change will
	    // affect all users of Cond. Update all the nodes so that we do not use
	    // the generic VSELECT anymore. Otherwise, we may perform wrong
	    // optimizations as we messed with the actual expectation for the vector
	    // boolean values.
	    for (SDNode *U : Cond->uses()) {
	      if (U->getOpcode() == X86ISD::BLENDV)
	        continue;

	      SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
	                               Cond, U->getOperand(1), U->getOperand(2));
	      DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
	      DCI.AddToWorklist(U);
	    }
	    DCI.CommitTargetLoweringOpt(TLO);
	    return SDValue(N, 0);
	  }

	  // Otherwise we can still at least try to simplify multiple use bits.
	  if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
	    return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
	                       N->getOperand(1), N->getOperand(2));

	  return SDValue();
	}

	// Try to match:
	//   (or (and (M, (sub 0, X)), (pandn M, X)))
	// which is a special case of:
	//   (select M, (sub 0, X), X)
	// Per:
	// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
	// We know that, if fNegate is 0 or 1:
	//   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
	//
	// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
	//   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
	//   (   M    ? -X : X) == ((X ^     M   ) + (M & 1))
	// This lets us transform our vselect to:
	//   (add (xor X, M), (and M, 1))
	// And further to:
	//   (sub (xor X, M), M)
	//
	// Parameters:
	//   VT        - result type; the computed value is bitcast to it.
	//   Mask      - integer mask whose elements must be all-zeros or all-ones
	//               (asserted below).
	//   X, Y      - the two blended values; one must be (sub 0, other) and both
	//               must have the mask's type, otherwise nothing is matched.
	//   DL        - debug location for the created nodes.
	//   Subtarget - not used by this function.
	// Returns the replacement value, or an empty SDValue if the pattern does not
	// match or SUB is not legal for the mask type.
	static SDValue combineLogicBlendIntoConditionalNegate(
	    EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
	    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
	  EVT MaskVT = Mask.getValueType();
	  assert(MaskVT.isInteger() &&
	         DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
	         "Mask must be zero/all-bits");

	  if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
	    return SDValue();
	  if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
	    return SDValue();

	  // True when N computes (sub <all-zeros build vector>, V), i.e. -V.
	  auto IsNegV = [](SDNode *N, SDValue V) {
	    return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
	           ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
	  };

	  // V is the un-negated value; the other operand is its negation.
	  SDValue V;
	  if (IsNegV(Y.getNode(), X))
	    V = X;
	  else if (IsNegV(X.getNode(), Y))
	    V = Y;
	  else
	    return SDValue();

	  SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
	  SDValue SubOp2 = Mask;

	  // If the negate was on the false side of the select, then
	  // the operands of the SUB need to be swapped. PR 27251.
	  // This is because the pattern being matched above is
	  // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
	  // but if the pattern matched was
	  // (vselect M, X, (sub 0, X)), that is really negation of the pattern
	  // above, -(vselect M, (sub 0, X), X), and therefore the replacement
	  // pattern also needs to be a negation of the replacement pattern above.
	  // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
	  // sub accomplishes the negation of the replacement pattern.
	  if (V == Y)
	    std::swap(SubOp1, SubOp2);

	  SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
	  return DAG.getBitcast(VT, Res);
	}

	/// Do target-specific dag combines on SELECT and VSELECT nodes.
	///
	/// The folds below are tried in order and the first one that applies wins:
	/// generic select simplification, conditional negate, constant-condition
	/// vselect to shuffle, FP min/max formation, the AVX512 (X & 1) == 0 scalar
	/// select, widening of vXi1 conditions for i8/i16 elements without BWI,
	/// AVX512 select-with-zero over an extracted target shuffle, select of two
	/// constants, min/max predicate canonicalization, unsigned saturating
	/// sub/add, AVX512 zero-masking operand swap, and (for legal types only)
	/// all-ones/zeros selects, BLENDV formation, narrowing, inverted-condition
	/// removal, vXi1 scalar selects and the single-bit mask test bit-hack.
	///
	/// \param N         The select-like node: operand 0 is the condition,
	///                  operands 1 and 2 are the true and false values.
	/// \param DAG       The DAG that owns \p N.
	/// \param DCI       Combiner state (legalization phase queries; forwarded
	///                  to the helper combines).
	/// \param Subtarget Used to gate the individual folds on available features.
	/// \returns the replacement value, or an empty SDValue if no fold applies.
	static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI,
	                             const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  SDValue Cond = N->getOperand(0);
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);

	  // Try simplification again because we use this function to optimize
	  // BLENDV nodes that are not handled by the generic combiner.
	  if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
	    return V;

	  EVT VT = LHS.getValueType();
	  EVT CondVT = Cond.getValueType();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());

	  // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
	  // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
	  // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
	  if (CondVT.isVector() && CondVT.isInteger() &&
	      CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
	      (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
	      DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
	    if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
	                                                           DL, DAG, Subtarget))
	      return V;

	  // Convert vselects with constant condition into shuffles.
	  if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
	    SmallVector<int, 64> Mask;
	    if (createShuffleMaskFromVSELECT(Mask, Cond))
	      return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
	  }

	  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
	  // instructions match the semantics of the common C idiom x<y?x:y but not
	  // x<=y?x:y, because of how they handle negative zero (which can be
	  // ignored in unsafe-math mode).
	  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
	  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
	      VT != MVT::f80 && VT != MVT::f128 &&
	      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
	      (Subtarget.hasSSE2() ||
	       (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	    unsigned Opcode = 0;
	    // Check for x CC y ? x : y.
	    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
	        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
	      switch (CC) {
	      default: break;
	      case ISD::SETULT:
	        // Converting this to a min would handle NaNs incorrectly, and swapping
	        // the operands would cause it to handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
	          if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
	              !(DAG.isKnownNeverZeroFloat(LHS) ||
	                DAG.isKnownNeverZeroFloat(RHS)))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETOLE:
	        // Converting this to a min would handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
	            !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
	          break;
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETULE:
	        // Converting this to a min would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOLT:
	      case ISD::SETLT:
	      case ISD::SETLE:
	        Opcode = X86ISD::FMIN;
	        break;

	      case ISD::SETOGE:
	        // Converting this to a max would handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
	            !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
	          break;
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETUGT:
	        // Converting this to a max would handle NaNs incorrectly, and swapping
	        // the operands would cause it to handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
	          if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
	              !(DAG.isKnownNeverZeroFloat(LHS) ||
	                DAG.isKnownNeverZeroFloat(RHS)))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETUGE:
	        // Converting this to a max would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOGT:
	      case ISD::SETGT:
	      case ISD::SETGE:
	        Opcode = X86ISD::FMAX;
	        break;
	      }
	    // Check for x CC y ? y : x -- a min/max with reversed arms.
	    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
	               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
	      switch (CC) {
	      default: break;
	      case ISD::SETOGE:
	        // Converting this to a min would handle comparisons between positive
	        // and negative zero incorrectly, and swapping the operands would
	        // cause it to handle NaNs incorrectly.
	        if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
	            !(DAG.isKnownNeverZeroFloat(LHS) ||
	              DAG.isKnownNeverZeroFloat(RHS))) {
	          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETUGT:
	        // Converting this to a min would handle NaNs incorrectly.
	        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
	          break;
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETUGE:
	        // Converting this to a min would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOGT:
	      case ISD::SETGT:
	      case ISD::SETGE:
	        Opcode = X86ISD::FMIN;
	        break;

	      case ISD::SETULT:
	        // Converting this to a max would handle NaNs incorrectly.
	        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
	          break;
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETOLE:
	        // Converting this to a max would handle comparisons between positive
	        // and negative zero incorrectly, and swapping the operands would
	        // cause it to handle NaNs incorrectly.
	        if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
	            !DAG.isKnownNeverZeroFloat(LHS) &&
	            !DAG.isKnownNeverZeroFloat(RHS)) {
	          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETULE:
	        // Converting this to a max would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOLT:
	      case ISD::SETLT:
	      case ISD::SETLE:
	        Opcode = X86ISD::FMAX;
	        break;
	      }
	    }

	    if (Opcode)
	      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
	  }

	  // Some mask scalar intrinsics rely on checking if only one bit is set
	  // and implement it in C code like this:
	  // A[0] = (U & 1) ? A[0] : W[0];
	  // This creates some redundant instructions that break pattern matching.
	  // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
	  if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
	      Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
	    SDValue AndNode = Cond.getOperand(0);
	    if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
	        isNullConstant(Cond.getOperand(1)) &&
	        isOneConstant(AndNode.getOperand(1))) {
	      // LHS and RHS swapped due to
	      // setcc outputting 1 when AND resulted in 0 and vice versa.
	      AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
	      return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
	    }
	  }

	  // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
	  // lowering on KNL. In this case we convert it to
	  // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
	  // The same situation applies to all vectors of i8 and i16 without BWI.
	  // Make sure we extend these even before type legalization gets a chance to
	  // split wide vectors.
	  // Since SKX these selects have a proper lowering.
	  if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
	      CondVT.getVectorElementType() == MVT::i1 &&
	      (VT.getVectorElementType() == MVT::i8 ||
	       VT.getVectorElementType() == MVT::i16)) {
	    Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
	    return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
	  }

	  // AVX512 - Extend select with zero to merge with target shuffle.
	  // select(mask, extract_subvector(shuffle(x)), zero) -->
	  // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
	  // TODO - support non target shuffles as well.
	  if (Subtarget.hasAVX512() && CondVT.isVector() &&
	      CondVT.getVectorElementType() == MVT::i1) {
	    // An operand qualifies when it is a single-use extract of the low
	    // subvector of a single-use target shuffle with a legal type.
	    auto SelectableOp = [&TLI](SDValue Op) {
	      return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	             isTargetShuffle(Op.getOperand(0).getOpcode()) &&
	             isNullConstant(Op.getOperand(1)) &&
	             TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
	             Op.hasOneUse() && Op.getOperand(0).hasOneUse();
	    };

	    bool SelectableLHS = SelectableOp(LHS);
	    bool SelectableRHS = SelectableOp(RHS);
	    bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
	    bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());

	    if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
	      EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
	                                : RHS.getOperand(0).getValueType();
	      unsigned NumSrcElts = SrcVT.getVectorNumElements();
	      EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
	      LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
	                            VT.getSizeInBits());
	      RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
	                            VT.getSizeInBits());
	      Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
	                         DAG.getUNDEF(SrcCondVT), Cond,
	                         DAG.getIntPtrConstant(0, DL));
	      SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
	      return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
	    }
	  }

	  if (SDValue V = combineSelectOfTwoConstants(N, DAG))
	    return V;

	  // Canonicalize max and min:
	  // (x > y) ? x : y -> (x >= y) ? x : y
	  // (x < y) ? x : y -> (x <= y) ? x : y
	  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
	  // the need for an extra compare
	  // against zero. e.g.
	  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
	  // subl   %esi, %edi
	  // testl  %edi, %edi
	  // movl   $0, %eax
	  // cmovgl %edi, %eax
	  // =>
	  // xorl   %eax, %eax
	  // subl   %esi, %edi
	  // cmovsl %eax, %edi
	  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
	      Cond.hasOneUse() &&
	      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
	      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
	    switch (CC) {
	    default: break;
	    case ISD::SETLT:
	    case ISD::SETGT: {
	      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
	      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
	                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
	      return DAG.getSelect(DL, VT, Cond, LHS, RHS);
	    }
	    }
	  }

	  // Match VSELECTs into subs with unsigned saturation.
	  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
	      // psubus is available in SSE2 for i8 and i16 vectors.
	      Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
	      isPowerOf2_32(VT.getVectorNumElements()) &&
	      (VT.getVectorElementType() == MVT::i8 ||
	       VT.getVectorElementType() == MVT::i16)) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
	    // left side invert the predicate to simplify logic below.
	    SDValue Other;
	    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
	      Other = RHS;
	      CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
	    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
	      Other = LHS;
	    }

	    if (Other.getNode() && Other->getNumOperands() == 2 &&
	        Other->getOperand(0) == Cond.getOperand(0)) {
	      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
	      SDValue CondRHS = Cond->getOperand(1);

	      // Look for a general sub with unsigned saturation first.
	      // x >= y ? x-y : 0 --> subus x, y
	      // x >  y ? x-y : 0 --> subus x, y
	      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
	          Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)
	        return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);

	      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {
	        if (isa<BuildVectorSDNode>(CondRHS)) {
	          // If the RHS is a constant we have to reverse the const
	          // canonicalization.
	          // x > C-1 ? x+-C : 0 --> subus x, C
	          // Either operand may be null here because undef elements are
	          // allowed in the match below; two nulls count as a match.
	          auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
	            return (!Op && !Cond) ||
	                   (Op && Cond &&
	                    Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
	          };
	          if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
	              ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
	                                        /*AllowUndefs*/ true)) {
	            OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
	                                OpRHS);
	            return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
	          }

	          // Another special case: If C was a sign bit, the sub has been
	          // canonicalized into a xor.
	          // FIXME: Would it be better to use computeKnownBits to determine
	          //        whether it's safe to decanonicalize the xor?
	          // x s< 0 ? x^C : 0 --> subus x, C
	          if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
	            if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
	                ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
	                OpRHSConst->getAPIntValue().isSignMask()) {
	              // Note that we have to rebuild the RHS constant here to ensure we
	              // don't rely on particular values of undef lanes.
	              OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
	              return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
	            }
	          }
	        }
	      }
	    }
	  }

	  // Match VSELECTs into add with unsigned saturation.
	  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
	      // paddus is available in SSE2 for i8 and i16 vectors.
	      Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
	      isPowerOf2_32(VT.getVectorNumElements()) &&
	      (VT.getVectorElementType() == MVT::i8 ||
	       VT.getVectorElementType() == MVT::i16)) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	    SDValue CondLHS = Cond->getOperand(0);
	    SDValue CondRHS = Cond->getOperand(1);

	    // Check if one of the arms of the VSELECT is vector with all bits set.
	    // If it's on the left side invert the predicate to simplify logic below.
	    SDValue Other;
	    if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
	      Other = RHS;
	      CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
	    } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
	      Other = LHS;
	    }

	    if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
	      SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);

	      // Canonicalize condition operands.
	      if (CC == ISD::SETUGE) {
	        std::swap(CondLHS, CondRHS);
	        CC = ISD::SETULE;
	      }

	      // We can test against either of the addition operands.
	      // x <= x+y ? x+y : ~0 --> addus x, y
	      // x+y >= x ? x+y : ~0 --> addus x, y
	      if (CC == ISD::SETULE && Other == CondRHS &&
	          (OpLHS == CondLHS || OpRHS == CondLHS))
	        return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);

	      if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
	          CondLHS == OpLHS) {
	        // If the RHS is a constant we have to reverse the const
	        // canonicalization.
	        // x > ~C ? x+C : ~0 --> addus x, C
	        auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
	          return Cond->getAPIntValue() == ~Op->getAPIntValue();
	        };
	        if (CC == ISD::SETULE &&
	            ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
	          return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
	      }
	    }
	  }

	  // Check if the first operand is all zeros and Cond type is vXi1.
	  // If this an avx512 target we can improve the use of zero masking by
	  // swapping the operands and inverting the condition.
	  if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
	      Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
	      ISD::isBuildVectorAllZeros(LHS.getNode()) &&
	      !ISD::isBuildVectorAllZeros(RHS.getNode())) {
	    // Invert the cond to not(cond) : xor(op,allones)=not(op)
	    SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
	    // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
	    return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
	  }

	  // Early exit check
	  if (!TLI.isTypeLegal(VT))
	    return SDValue();

	  if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
	    return V;

	  if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
	    return V;

	  if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
	    return V;

	  // select(~Cond, X, Y) -> select(Cond, Y, X)
	  if (CondVT.getScalarType() != MVT::i1)
	    if (SDValue CondNot = IsNOT(Cond, DAG))
	      return DAG.getNode(N->getOpcode(), DL, VT,
	                         DAG.getBitcast(CondVT, CondNot), RHS, LHS);

	  // Try to optimize vXi1 selects if both operands are either all constants or
	  // bitcasts from scalar integer type. In that case we can convert the operands
	  // to integer and use an integer select which will be converted to a CMOV.
	  // We need to take a little bit of care to avoid creating an i64 type after
	  // type legalization.
	  if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
	      VT.getVectorElementType() == MVT::i1 &&
	      (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
	    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
	    bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
	    bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());

	    if ((LHSIsConst ||
	         (LHS.getOpcode() == ISD::BITCAST &&
	          LHS.getOperand(0).getValueType() == IntVT)) &&
	        (RHSIsConst ||
	         (RHS.getOpcode() == ISD::BITCAST &&
	          RHS.getOperand(0).getValueType() == IntVT))) {
	      if (LHSIsConst)
	        LHS = combinevXi1ConstantToInteger(LHS, DAG);
	      else
	        LHS = LHS.getOperand(0);

	      if (RHSIsConst)
	        RHS = combinevXi1ConstantToInteger(RHS, DAG);
	      else
	        RHS = RHS.getOperand(0);

	      SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
	      return DAG.getBitcast(VT, Select);
	    }
	  }

	  // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
	  // single bits, then invert the predicate and swap the select operands.
	  // This can lower using a vector shift bit-hack rather than mask and compare.
	  if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
	      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
	      Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
	      Cond.getOperand(0).getOpcode() == ISD::AND &&
	      isNullOrNullSplat(Cond.getOperand(1)) &&
	      cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
	      Cond.getOperand(0).getValueType() == VT) {
	    // The 'and' mask must be composed of power-of-2 constants.
	    SDValue And = Cond.getOperand(0);
	    auto *C = isConstOrConstSplat(And.getOperand(1));
	    if (C && C->getAPIntValue().isPowerOf2()) {
	      // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
	      SDValue NotCond =
	          DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
	      return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
	    }

	    // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
	    // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
	    // 16-bit lacks a proper blendv.
	    unsigned EltBitWidth = VT.getScalarSizeInBits();
	    bool CanShiftBlend =
	        TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
	                                (Subtarget.hasAVX2() && EltBitWidth == 64) ||
	                                (Subtarget.hasXOP()));
	    if (CanShiftBlend &&
	        ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
	          return C->getAPIntValue().isPowerOf2();
	        })) {
	      // Create a left-shift constant to get the mask bits over to the sign-bit.
	      SDValue Mask = And.getOperand(1);
	      SmallVector<int, 32> ShlVals;
	      for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
	        auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
	        ShlVals.push_back(EltBitWidth - 1 -
	                          MaskVal->getAPIntValue().exactLogBase2());
	      }
	      // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
	      SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
	      SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
	      SDValue NewCond =
	          DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
	      return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
	    }
	  }

	  return SDValue();
	}

	/// Combine:
	///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
	/// to:
	///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
	/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
	/// Note that this is only legal for some op/cc combinations.
	///
	/// \param Cmp       The flag-producing node: an X86ISD::CMP, or an
	///                  X86ISD::SUB whose result 0 has no uses.
	/// \param CC        In/out condition code. It is rewritten only on the
	///                  compare-with-zero path; it is left untouched when the
	///                  addend is the negation of the compared constant.
	/// \param DAG       The DAG that owns \p Cmp.
	/// \param Subtarget Forwarded to lowerAtomicArithWithLOCK.
	/// \returns the value produced by lowerAtomicArithWithLOCK (to be used in
	///          place of \p Cmp), or an empty SDValue if the combine does not
	///          apply.
	static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
	                                       SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget) {
	  // This combine only operates on CMP-like nodes.
	  if (!(Cmp.getOpcode() == X86ISD::CMP ||
	        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
	    return SDValue();

	  // Can't replace the cmp if it has more uses than the one we're looking at.
	  // FIXME: We would like to be able to handle this, but would need to make sure
	  // all uses were updated.
	  if (!Cmp.hasOneUse())
	    return SDValue();

	  // This only applies to variations of the common case:
	  //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
	  //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
	  //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
	  //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
	  // Using the proper condcodes (see below), overflow is checked for.

	  // FIXME: We can generalize both constraints:
	  // - XOR/OR/AND (if they were made to survive AtomicExpand)
	  // - LHS != 1
	  // if the result is compared.

	  SDValue CmpLHS = Cmp.getOperand(0);
	  SDValue CmpRHS = Cmp.getOperand(1);

	  if (!CmpLHS.hasOneUse())
	    return SDValue();

	  unsigned Opc = CmpLHS.getOpcode();
	  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
	    return SDValue();

	  SDValue OpRHS = CmpLHS.getOperand(2);
	  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
	  if (!OpRHSC)
	    return SDValue();

	  // Normalize to an addend: an atomic sub of C adds -C.
	  APInt Addend = OpRHSC->getAPIntValue();
	  if (Opc == ISD::ATOMIC_LOAD_SUB)
	    Addend = -Addend;

	  auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
	  if (!CmpRHSC)
	    return SDValue();

	  APInt Comparison = CmpRHSC->getAPIntValue();

	  // Lower Atomic with lowerAtomicArithWithLOCK and retire the original atomic
	  // node: its loaded value (result 0) becomes undef and its result 1 is
	  // redirected to result 1 of the lowered node.
	  auto EmitLockedOp = [&](SDValue Atomic) {
	    SDValue LockOp = lowerAtomicArithWithLOCK(Atomic, DAG, Subtarget);
	    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
	                                  DAG.getUNDEF(CmpLHS.getValueType()));
	    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
	    return LockOp;
	  };

	  // If the addend is the negation of the comparison value, then we can do
	  // a full comparison by emitting the atomic arithmetic as a locked sub.
	  if (Comparison == -Addend) {
	    // The CC is fine, but we need to rewrite the LHS of the comparison as an
	    // atomic sub.
	    auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
	    auto AtomicSub = DAG.getAtomic(
	        ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
	        /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
	        /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
	        AN->getMemOperand());
	    return EmitLockedOp(AtomicSub);
	  }

	  // We can handle comparisons with zero in a number of cases by manipulating
	  // the CC used.
	  if (!Comparison.isNullValue())
	    return SDValue();

	  if (CC == X86::COND_S && Addend == 1)
	    CC = X86::COND_LE;
	  else if (CC == X86::COND_NS && Addend == 1)
	    CC = X86::COND_G;
	  else if (CC == X86::COND_G && Addend == -1)
	    CC = X86::COND_GE;
	  else if (CC == X86::COND_LE && Addend == -1)
	    CC = X86::COND_L;
	  else
	    return SDValue();

	  return EmitLockedOp(CmpLHS);
	}

	// Check whether a boolean test is testing a boolean value generated by
	// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
	// code.
	//
	// Simplify the following patterns:
	// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
	// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
	// to (Op EFLAGS Cond)
	//
	// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
	// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
	// to (Op EFLAGS !Cond)
	//
	// where Op could be BRCOND or CMOV.
	//
	// Parameters:
	//   Cmp - the flag-producing node: an X86ISD::CMP, or an X86ISD::SUB whose
	//         result 0 has no uses.
	//   CC  - in/out condition code. Must be COND_E or COND_NE on entry; it is
	//         overwritten with the (possibly inverted) condition of the matched
	//         SETCC/SETCC_CARRY/CMOV only when a value is returned.
	// Returns the flags operand of the matched node, or an empty SDValue if the
	// pattern does not match.
	static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
	  // This combine only operates on CMP-like nodes.
	  if (!(Cmp.getOpcode() == X86ISD::CMP ||
	        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
	    return SDValue();

	  // Quit if not used as a boolean value.
	  if (CC != X86::COND_E && CC != X86::COND_NE)
	    return SDValue();

	  // Check CMP operands. One of them should be 0 or 1 and the other should be
	  // an SetCC or extended from it.
	  SDValue Op1 = Cmp.getOperand(0);
	  SDValue Op2 = Cmp.getOperand(1);

	  SDValue SetCC;
	  const ConstantSDNode *C = nullptr;
	  bool needOppositeCond = (CC == X86::COND_E);
	  bool checkAgainstTrue = false; // Is it a comparison against 1?

	  if ((C = dyn_cast<ConstantSDNode>(Op1)))
	    SetCC = Op2;
	  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
	    SetCC = Op1;
	  else // Quit if neither operand is a constant.
	    return SDValue();

	  if (C->getZExtValue() == 1) {
	    needOppositeCond = !needOppositeCond;
	    checkAgainstTrue = true;
	  } else if (C->getZExtValue() != 0)
	    // Quit if the constant is neither 0 or 1.
	    return SDValue();

	  bool truncatedToBoolWithAnd = false;
	  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
	  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
	         SetCC.getOpcode() == ISD::TRUNCATE ||
	         SetCC.getOpcode() == ISD::AND) {
	    if (SetCC.getOpcode() == ISD::AND) {
	      // Look through the AND only if one operand is the constant 1; OpIdx
	      // selects the other (non-constant) operand.
	      int OpIdx = -1;
	      if (isOneConstant(SetCC.getOperand(0)))
	        OpIdx = 1;
	      if (isOneConstant(SetCC.getOperand(1)))
	        OpIdx = 0;
	      if (OpIdx < 0)
	        break;
	      SetCC = SetCC.getOperand(OpIdx);
	      truncatedToBoolWithAnd = true;
	    } else
	      SetCC = SetCC.getOperand(0);
	  }

	  switch (SetCC.getOpcode()) {
	  case X86ISD::SETCC_CARRY:
	    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
	    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
	    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
	    // truncated to i1 using 'and'.
	    if (checkAgainstTrue && !truncatedToBoolWithAnd)
	      break;
	    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
	           "Invalid use of SETCC_CARRY!");
	    LLVM_FALLTHROUGH;
	  case X86ISD::SETCC:
	    // Set the condition code or opposite one if necessary.
	    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
	    if (needOppositeCond)
	      CC = X86::GetOppositeBranchCondition(CC);
	    return SetCC.getOperand(1);
	  case X86ISD::CMOV: {
	    // Check whether false/true value has canonical one, i.e. 0 or 1.
	    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
	    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
	    // Quit if true value is not a constant.
	    if (!TVal)
	      return SDValue();
	    // Quit if false value is not a constant.
	    if (!FVal) {
	      SDValue Op = SetCC.getOperand(0);
	      // Skip 'zext' or 'trunc' node.
	      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
	          Op.getOpcode() == ISD::TRUNCATE)
	        Op = Op.getOperand(0);
	      // A special case for rdrand/rdseed, where 0 is set if false cond is
	      // found.
	      if ((Op.getOpcode() != X86ISD::RDRAND &&
	           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
	        return SDValue();
	    }
	    // Quit if false value is not the constant 0 or 1.
	    bool FValIsFalse = true;
	    if (FVal && FVal->getZExtValue() != 0) {
	      if (FVal->getZExtValue() != 1)
	        return SDValue();
	      // If FVal is 1, opposite cond is needed.
	      needOppositeCond = !needOppositeCond;
	      FValIsFalse = false;
	    }
	    // Quit if TVal is not the constant opposite of FVal.
	    if (FValIsFalse && TVal->getZExtValue() != 1)
	      return SDValue();
	    if (!FValIsFalse && TVal->getZExtValue() != 0)
	      return SDValue();
	    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
	    if (needOppositeCond)
	      CC = X86::GetOppositeBranchCondition(CC);
	    return SetCC.getOperand(3);
	  }
	  }

	  return SDValue();
	}

	/// Check whether \p Cond is an AND/OR of two X86ISD::SETCC nodes that read the
	/// same EFLAGS value.
	/// Match:
	///   (X86or (X86setcc) (X86setcc))
	///   (X86cmp (and (X86setcc) (X86setcc)), 0)
	///
	/// On success returns true and fills in:
	///   \p CC0, \p CC1 - the condition codes of the two SETCC nodes,
	///   \p Flags       - the flags operand shared by both SETCC nodes,
	///   \p isAnd       - true for an AND combination, false for an OR.
	/// On failure returns false and leaves \p CC0, \p CC1 and \p Flags untouched
	/// (\p isAnd may already have been written).
	static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
	                                           X86::CondCode &CC1, SDValue &Flags,
	                                           bool &isAnd) {
	  // Look through a compare against zero to reach the logic op underneath.
	  if (Cond->getOpcode() == X86ISD::CMP) {
	    if (!isNullConstant(Cond->getOperand(1)))
	      return false;

	    Cond = Cond->getOperand(0);
	  }

	  isAnd = false;

	  SDValue SetCC0, SetCC1;
	  switch (Cond->getOpcode()) {
	  default:
	    return false;
	  case ISD::AND:
	  case X86ISD::AND:
	    isAnd = true;
	    LLVM_FALLTHROUGH;
	  case ISD::OR:
	  case X86ISD::OR:
	    SetCC0 = Cond->getOperand(0);
	    SetCC1 = Cond->getOperand(1);
	    break;
	  }

	  // Make sure we have SETCC nodes, using the same flags value.
	  if (SetCC0.getOpcode() != X86ISD::SETCC ||
	      SetCC1.getOpcode() != X86ISD::SETCC ||
	      SetCC0->getOperand(1) != SetCC1->getOperand(1))
	    return false;

	  CC0 = static_cast<X86::CondCode>(SetCC0->getConstantOperandVal(0));
	  CC1 = static_cast<X86::CondCode>(SetCC1->getConstantOperandVal(0));
	  Flags = SetCC0->getOperand(1);
	  return true;
	}

	/// When legalizing carry, we create carries via add X, -1.
	/// If X comes from an actual carry, via setcc, we use the carry directly.
	///
	/// \p EFLAGS is the flags producer being tested for carry. Returns a flags
	/// value that can be used in its place, or an empty SDValue when the pattern
	/// does not match.
	static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
	  // Only (X86ISD::ADD X, -1) is of interest.
	  if (EFLAGS.getOpcode() != X86ISD::ADD ||
	      !isAllOnesConstant(EFLAGS.getOperand(1)))
	    return SDValue();

	  // Strip truncates, extensions and (and X, 1) wrapped around the carry bit.
	  SDValue Carry = EFLAGS.getOperand(0);
	  while (Carry.getOpcode() == ISD::TRUNCATE ||
	         Carry.getOpcode() == ISD::ZERO_EXTEND ||
	         Carry.getOpcode() == ISD::SIGN_EXTEND ||
	         Carry.getOpcode() == ISD::ANY_EXTEND ||
	         (Carry.getOpcode() == ISD::AND &&
	          isOneConstant(Carry.getOperand(1))))
	    Carry = Carry.getOperand(0);

	  if (Carry.getOpcode() != X86ISD::SETCC &&
	      Carry.getOpcode() != X86ISD::SETCC_CARRY)
	    return SDValue();

	  // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
	  uint64_t CarryCC = Carry.getConstantOperandVal(0);
	  SDValue CarryOp1 = Carry.getOperand(1);

	  // The setcc already tests the carry flag: reuse its flags operand.
	  if (CarryCC == X86::COND_B)
	    return CarryOp1;

	  if (CarryCC == X86::COND_A) {
	    // Try to convert COND_A into COND_B in an attempt to facilitate
	    // materializing "setb reg".
	    //
	    // Do not flip "e > c", where "c" is a constant, because Cmp
	    // instruction cannot take an immediate as its first operand.
	    //
	    if (CarryOp1.getOpcode() == X86ISD::SUB &&
	        CarryOp1.getNode()->hasOneUse() &&
	        CarryOp1.getValueType().isInteger() &&
	        !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
	      // Rebuild the SUB with its operands swapped and hand back the same
	      // result number of the new node.
	      SDValue SubCommute =
	          DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
	                      CarryOp1.getOperand(1), CarryOp1.getOperand(0));
	      return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
	    }
	  }

	  // If this is a check of the z flag of an add with 1, switch to the
	  // C flag.
	  if (CarryCC == X86::COND_E &&
	      CarryOp1.getOpcode() == X86ISD::ADD &&
	      isOneConstant(CarryOp1.getOperand(1)))
	    return CarryOp1;

	  return SDValue();
	}

	/// Simplify an X86ISD::PTEST / X86ISD::TESTP flags producer that is consumed
	/// with condition code \p CC.
	///
	/// If one of the operands is an inversion, adjust \p CC so that the inversion
	/// can be dropped. For eq/ne tests a few further rewrites are tried (looking
	/// through AND/ANDN of identical operands, using MOVMSK on all-sign-bit
	/// vectors, and replacing an all-ones operand).
	///
	/// \returns the replacement flags node (with \p CC updated where required),
	/// or an empty SDValue if nothing matched. \p CC is only written on paths
	/// that return a replacement.
	static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
	                              SelectionDAG &DAG,
	                              const X86Subtarget &Subtarget) {
	  // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
	  if (EFLAGS.getOpcode() != X86ISD::PTEST &&
	      EFLAGS.getOpcode() != X86ISD::TESTP)
	    return SDValue();

	  // PTEST/TESTP sets EFLAGS as:
	  // TESTZ: ZF = (Op0 & Op1) == 0
	  // TESTC: CF = (~Op0 & Op1) == 0
	  // TESTNZC: ZF == 0 && CF == 0
	  EVT VT = EFLAGS.getValueType();
	  SDValue Op0 = EFLAGS.getOperand(0);
	  SDValue Op1 = EFLAGS.getOperand(1);
	  EVT OpVT = Op0.getValueType();

	  // TEST(~X,Y) == TEST(X,Y) once the testz/testc roles of CC are swapped.
	  if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
	    X86::CondCode InvCC;
	    switch (CC) {
	    case X86::COND_B:
	      // testc -> testz.
	      InvCC = X86::COND_E;
	      break;
	    case X86::COND_AE:
	      // !testc -> !testz.
	      InvCC = X86::COND_NE;
	      break;
	    case X86::COND_E:
	      // testz -> testc.
	      InvCC = X86::COND_B;
	      break;
	    case X86::COND_NE:
	      // !testz -> !testc.
	      InvCC = X86::COND_AE;
	      break;
	    case X86::COND_A:
	    case X86::COND_BE:
	      // testnzc -> testnzc (no change).
	      InvCC = CC;
	      break;
	    default:
	      InvCC = X86::COND_INVALID;
	      break;
	    }

	    if (InvCC != X86::COND_INVALID) {
	      CC = InvCC;
	      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
	                         DAG.getBitcast(OpVT, NotOp0), Op1);
	    }
	  }

	  if (CC == X86::COND_E || CC == X86::COND_NE) {
	    // TESTZ(X,~Y) == TESTC(Y,X)
	    if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
	      CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
	      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
	                         DAG.getBitcast(OpVT, NotOp1), Op0);
	    }

	    if (Op0 == Op1) {
	      SDValue BC = peekThroughBitcasts(Op0);
	      EVT BCVT = BC.getValueType();
	      assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
	             "Unexpected vector type");

	      // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
	      if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
	        return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
	                           DAG.getBitcast(OpVT, BC.getOperand(0)),
	                           DAG.getBitcast(OpVT, BC.getOperand(1)));
	      }

	      // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
	      if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
	        CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
	        return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
	                           DAG.getBitcast(OpVT, BC.getOperand(0)),
	                           DAG.getBitcast(OpVT, BC.getOperand(1)));
	      }

	      // If every element is an all-sign value, see if we can use MOVMSK to
	      // more efficiently extract the sign bits and compare that.
	      // TODO: Handle TESTC with comparison inversion.
	      // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
	      // MOVMSK combines to make sure its never worse than PTEST?
	      unsigned EltBits = BCVT.getScalarSizeInBits();
	      if (DAG.ComputeNumSignBits(BC) == EltBits) {
	        assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
	        APInt SignMask = APInt::getSignMask(EltBits);
	        const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	        if (SDValue Res =
	                TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
	          // For vXi16 cases we need to use pmovmksb and extract every other
	          // sign bit.
	          SDLoc DL(EFLAGS);
	          if (EltBits == 16) {
	            MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
	            Res = DAG.getBitcast(MovmskVT, Res);
	            Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
	            Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
	                              DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
	          } else {
	            Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
	          }
	          return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
	                             DAG.getConstant(0, DL, MVT::i32));
	        }
	      }
	    }

	    // TESTZ(-1,X) == TESTZ(X,X)
	    if (ISD::isBuildVectorAllOnes(Op0.getNode()))
	      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);

	    // TESTZ(X,-1) == TESTZ(X,X)
	    if (ISD::isBuildVectorAllOnes(Op1.getNode()))
	      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
	  }

	  return SDValue();
	}

	/// Attempt to simplify the MOVMSK input based on the comparison type.
	///
	/// Only eq/ne tests (\p CC is COND_E or COND_NE) of an i32 X86ISD::CMP or
	/// X86ISD::SUB against a constant are handled, where the compared value is an
	/// (optionally truncated) X86ISD::MOVMSK:
	///   - CMP against zero is treated as an "any_of" test,
	///   - SUB against a low-bit mask covering every element is treated as an
	///     "all_of" test.
	///
	/// \p CC is only inspected here, never modified.
	/// \returns a replacement flags node, or an empty SDValue if nothing matched.
	static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
	                                  SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  // Handle eq/ne against zero (any_of).
	  // Handle eq/ne against -1 (all_of).
	  if (!(CC == X86::COND_E || CC == X86::COND_NE))
	    return SDValue();
	  if (EFLAGS.getValueType() != MVT::i32)
	    return SDValue();
	  unsigned CmpOpcode = EFLAGS.getOpcode();
	  if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
	    return SDValue();
	  auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
	  if (!CmpConstant)
	    return SDValue();
	  const APInt &CmpVal = CmpConstant->getAPIntValue();

	  SDValue CmpOp = EFLAGS.getOperand(0);
	  unsigned CmpBits = CmpOp.getValueSizeInBits();
	  assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");

	  // Peek through any truncate.
	  if (CmpOp.getOpcode() == ISD::TRUNCATE)
	    CmpOp = CmpOp.getOperand(0);

	  // Bail if we don't find a MOVMSK.
	  if (CmpOp.getOpcode() != X86ISD::MOVMSK)
	    return SDValue();

	  SDValue Vec = CmpOp.getOperand(0);
	  MVT VecVT = Vec.getSimpleValueType();
	  assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
	         "Unexpected MOVMSK operand");
	  unsigned NumElts = VecVT.getVectorNumElements();
	  unsigned NumEltBits = VecVT.getScalarSizeInBits();

	  // Classify the comparison; anything else is not handled.
	  bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
	  bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
	                 CmpVal.isMask(NumElts);
	  if (!IsAnyOf && !IsAllOf)
	    return SDValue();

	  // See if we can peek through to a vector with a wider element type, if the
	  // signbits extend down to all the sub-elements as well.
	  // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
	  // potential SimplifyDemandedBits/Elts cases.
	  if (Vec.getOpcode() == ISD::BITCAST) {
	    SDValue BC = peekThroughBitcasts(Vec);
	    MVT BCVT = BC.getSimpleValueType();
	    unsigned BCNumElts = BCVT.getVectorNumElements();
	    unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
	    if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
	        BCNumEltBits > NumEltBits &&
	        DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
	      SDLoc DL(EFLAGS);
	      // The all_of mask shrinks to one bit per wide element.
	      unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
	      return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
	                         DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
	                         DAG.getConstant(CmpMask, DL, MVT::i32));
	    }
	  }

	  // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
	  // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
	  if (IsAllOf && Subtarget.hasSSE41()) {
	    SDValue BC = peekThroughBitcasts(Vec);
	    if (BC.getOpcode() == X86ISD::PCMPEQ &&
	        ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
	      MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
	      SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
	      return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
	    }
	  }

	  // See if we can avoid a PACKSS by calling MOVMSK on the sources.
	  // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
	  // sign bits prior to the comparison with zero unless we know that
	  // the vXi16 splats the sign bit down to the lower i8 half.
	  // TODO: Handle all_of patterns.
	  if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
	    SDValue VecOp0 = Vec.getOperand(0);
	    SDValue VecOp1 = Vec.getOperand(1);
	    bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
	    bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
	    // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
	    if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
	      SDLoc DL(EFLAGS);
	      SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
	      Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
	      Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
	      if (!SignExt0) {
	        Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
	                             DAG.getConstant(0xAAAA, DL, MVT::i16));
	      }
	      return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
	                         DAG.getConstant(0, DL, MVT::i16));
	    }
	    // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
	    // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
	    if (CmpBits == 16 && Subtarget.hasInt256() &&
	        VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	        VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	        VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
	        VecOp0.getConstantOperandAPInt(1) == 0 &&
	        VecOp1.getConstantOperandAPInt(1) == 8 &&
	        (IsAnyOf || (SignExt0 && SignExt1))) {
	      SDLoc DL(EFLAGS);
	      SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
	      Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
	      unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
	      if (!SignExt0 || !SignExt1) {
	        assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
	        Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
	                             DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
	      }
	      return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
	                         DAG.getConstant(CmpMask, DL, MVT::i32));
	    }
	  }

	  // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
	  SmallVector<int, 32> ShuffleMask;
	  SmallVector<SDValue, 2> ShuffleInputs;
	  if (NumElts == CmpBits &&
	      getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
	                             ShuffleMask, DAG) &&
	      ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
	      ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
	    // Record which source elements the shuffle actually reads.
	    unsigned NumShuffleElts = ShuffleMask.size();
	    APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
	    for (int M : ShuffleMask) {
	      assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
	      DemandedElts.setBit(M);
	    }
	    if (DemandedElts.isAllOnesValue()) {
	      SDLoc DL(EFLAGS);
	      SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
	      Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
	      Result =
	          DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
	      return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
	                         EFLAGS.getOperand(1));
	    }
	  }

	  return SDValue();
	}

	/// Try to replace the flags producer \p EFLAGS, as consumed under condition
	/// code \p CC, with a simpler flags value. The individual combines are tried
	/// in a fixed order and the first one that succeeds wins; a successful
	/// combine may also rewrite \p CC.
	///
	/// \returns the simplified flags value, or an empty SDValue if no combine
	/// applied.
	static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
	                                  SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  // A carry test may be able to read the carry straight from its producer.
	  if (CC == X86::COND_B) {
	    SDValue CarryFlags = combineCarryThroughADD(EFLAGS, DAG);
	    if (CarryFlags)
	      return CarryFlags;
	  }

	  // Fall through the remaining combines until one produces a result.
	  SDValue Simplified = checkBoolTestSetCCCombine(EFLAGS, CC);
	  if (!Simplified)
	    Simplified = combinePTESTCC(EFLAGS, CC, DAG, Subtarget);
	  if (!Simplified)
	    Simplified = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget);
	  if (!Simplified)
	    Simplified = combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
	  return Simplified;
	}

	/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
	///
	/// Operand layout of \p N as read below: operand 0 is the value selected when
	/// the condition is false, operand 1 the value selected when it is true,
	/// operand 2 the X86 condition code and operand 3 the flags value.
	///
	/// \returns a replacement value for the CMOV, or an empty SDValue if no
	/// combine applied.
	static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDLoc DL(N);

	  SDValue FalseOp = N->getOperand(0);
	  SDValue TrueOp = N->getOperand(1);
	  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
	  SDValue Cond = N->getOperand(3);

	  // cmov X, X, ?, ? --> X
	  if (TrueOp == FalseOp)
	    return TrueOp;

	  // Try to simplify the EFLAGS and condition code operands.
	  // We can't always do this as FCMOV only supports a subset of X86 cond.
	  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
	    if (!(FalseOp.getValueType() == MVT::f80 ||
	          (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
	          (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
	        !Subtarget.hasCMov() || hasFPCMov(CC)) {
	      SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
	                       Flags};
	      return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	    }
	  }

	  // If this is a select between two integer constants, try to do some
	  // optimizations. Note that the operands are ordered the opposite of SELECT
	  // operands.
	  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
	    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
	      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
	      // larger than FalseC (the false value).
	      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
	        CC = X86::GetOppositeBranchCondition(CC);
	        std::swap(TrueC, FalseC);
	        std::swap(TrueOp, FalseOp);
	      }

	      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
	      // This is efficient for any integer data type (including i8/i16) and
	      // shift amount.
	      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
	        Cond = getSETCC(CC, Cond, DL, DAG);

	        // Zero extend the condition if needed.
	        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

	        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
	        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
	                           DAG.getConstant(ShAmt, DL, MVT::i8));
	        return Cond;
	      }

	      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
	      // for any integer data type, including i8/i16.
	      if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
	        Cond = getSETCC(CC, Cond, DL, DAG);

	        // Zero extend the condition if needed.
	        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
	                           FalseC->getValueType(0), Cond);
	        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
	                           SDValue(FalseC, 0));
	        return Cond;
	      }

	      // Optimize cases that will turn into an LEA instruction. This requires
	      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
	      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
	        APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
	        assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
	               "Implicit constant truncation");

	        bool isFastMultiplier = false;
	        if (Diff.ult(10)) {
	          switch (Diff.getZExtValue()) {
	          default: break;
	          case 1: // result = add base, cond
	          case 2: // result = lea base( , cond*2)
	          case 3: // result = lea base(cond, cond*2)
	          case 4: // result = lea base( , cond*4)
	          case 5: // result = lea base(cond, cond*4)
	          case 8: // result = lea base( , cond*8)
	          case 9: // result = lea base(cond, cond*8)
	            isFastMultiplier = true;
	            break;
	          }
	        }

	        if (isFastMultiplier) {
	          Cond = getSETCC(CC, Cond, DL, DAG);
	          // Zero extend the condition if needed.
	          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
	                             Cond);
	          // Scale the condition by the difference.
	          if (Diff != 1)
	            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
	                               DAG.getConstant(Diff, DL, Cond.getValueType()));

	          // Add the base if non-zero.
	          if (FalseC->getAPIntValue() != 0)
	            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
	                               SDValue(FalseC, 0));
	          return Cond;
	        }
	      }
	    }
	  }

	  // Handle these cases:
	  // (select (x != c), e, c) -> select (x != c), e, x),
	  // (select (x == c), c, e) -> select (x == c), x, e)
	  // where the c is an integer constant, and the "select" is the combination
	  // of CMOV and CMP.
	  //
	  // The rationale for this change is that the conditional-move from a constant
	  // needs two instructions, however, conditional-move from a register needs
	  // only one instruction.
	  //
	  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
	  // some instruction-combining opportunities. This opt needs to be
	  // postponed as late as possible.
	  //
	  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
	    // the DCI.xxxx conditions are provided to postpone the optimization as
	    // late as possible.

	    ConstantSDNode *CmpAgainst = nullptr;
	    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
	        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
	        !isa<ConstantSDNode>(Cond.getOperand(0))) {

	      // Canonicalize the "!=" form to the "==" form by swapping the arms.
	      if (CC == X86::COND_NE &&
	          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
	        CC = X86::GetOppositeBranchCondition(CC);
	        std::swap(TrueOp, FalseOp);
	      }

	      if (CC == X86::COND_E &&
	          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
	        SDValue Ops[] = {FalseOp, Cond.getOperand(0),
	                         DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
	        return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	      }
	    }
	  }

	  // Fold and/or of setcc's to double CMOV:
	  // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
	  // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
	  //
	  // This combine lets us generate:
	  // cmovcc1 (jcc1 if we don't have CMOV)
	  // cmovcc2 (same)
	  // instead of:
	  // setcc1
	  // setcc2
	  // and/or
	  // cmovne (jne if we don't have CMOV)
	  // When we can't use the CMOV instruction, it might increase branch
	  // mispredicts.
	  // When we can use CMOV, or when there is no mispredict, this improves
	  // throughput and reduces register pressure.
	  //
	  if (CC == X86::COND_NE) {
	    SDValue Flags;
	    X86::CondCode CC0, CC1;
	    bool isAndSetCC;
	    if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
	      if (isAndSetCC) {
	        std::swap(FalseOp, TrueOp);
	        CC0 = X86::GetOppositeBranchCondition(CC0);
	        CC1 = X86::GetOppositeBranchCondition(CC1);
	      }

	      SDValue LOps[] = {FalseOp, TrueOp,
	                        DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
	      SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
	      SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
	                       Flags};
	      SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	      return CMOV;
	    }
	  }

	  // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
	  // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
	  // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
	  // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
	  if ((CC == X86::COND_NE || CC == X86::COND_E) &&
	      Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
	    SDValue Add = TrueOp;
	    SDValue Const = FalseOp;
	    // Canonicalize the condition code for easier matching and output.
	    if (CC == X86::COND_E)
	      std::swap(Add, Const);

	    // We might have replaced the constant in the cmov with the LHS of the
	    // compare. If so change it to the RHS of the compare.
	    if (Const == Cond.getOperand(0))
	      Const = Cond.getOperand(1);

	    // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
	    if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
	        Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
	        (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
	         Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
	        Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
	      EVT VT = N->getValueType(0);
	      // This should constant fold.
	      SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
	      SDValue CMov =
	          DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
	                      DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
	      return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
	    }
	  }

	  return SDValue();
	}

	/// Different mul shrinking modes, selected by canReduceVMulWidth from the
	/// value range of the multiply operands:
	///   MULS8  - operands in the signed 8-bit range (-128 ~ 127),
	///   MULU8  - operands in the unsigned 8-bit range (0 ~ 255),
	///   MULS16 - operands in the signed 16-bit range (-32768 ~ 32767),
	///   MULU16 - operands in the unsigned 16-bit range (0 ~ 65535).
	enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };

	/// Decide whether the two-operand multiply \p N, whose elements are 32 bits
	/// wide, has operands narrow enough to be computed with 8- or 16-bit element
	/// arithmetic.
	///
	/// \returns true and stores the chosen mode in \p Mode when shrinking is
	/// possible; otherwise returns false and leaves \p Mode untouched.
	static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
	  if (N->getOperand(0).getValueType().getScalarSizeInBits() != 32)
	    return false;

	  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");

	  // Gather sign-bit information for both operands.
	  SDValue LHS = N->getOperand(0);
	  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
	  bool LHSNonNegative = DAG.SignBitIsZero(LHS);

	  SDValue RHS = N->getOperand(1);
	  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
	  bool RHSNonNegative = DAG.SignBitIsZero(RHS);

	  const bool BothNonNegative = LHSNonNegative && RHSNonNegative;
	  const unsigned FewestSignBits = std::min(LHSSignBits, RHSSignBits);

	  // Ranges -128 ~ 127: MULS8.
	  if (FewestSignBits >= 25) {
	    Mode = ShrinkMode::MULS8;
	    return true;
	  }
	  // Ranges 0 ~ 255: MULU8.
	  if (BothNonNegative && FewestSignBits >= 24) {
	    Mode = ShrinkMode::MULU8;
	    return true;
	  }
	  // Ranges -32768 ~ 32767: MULS16.
	  if (FewestSignBits >= 17) {
	    Mode = ShrinkMode::MULS16;
	    return true;
	  }
	  // Ranges 0 ~ 65535: MULU16.
	  if (BothNonNegative && FewestSignBits >= 16) {
	    Mode = ShrinkMode::MULU16;
	    return true;
	  }

	  return false;
	}

	/// When the operands of vector mul are extended from smaller size values,
	/// like i8 and i16, the type of mul may be shrunk to generate more
	/// efficient code. Two typical patterns are handled:
	/// Pattern1:
	///     %2 = sext/zext <N x i8> %1 to <N x i32>
	///     %4 = sext/zext <N x i8> %3 to <N x i32>
	///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
	///     %5 = mul <N x i32> %2, %4
	///
	/// Pattern2:
	///     %2 = zext/sext <N x i16> %1 to <N x i32>
	///     %4 = zext/sext <N x i16> %3 to <N x i32>
	///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
	///     %5 = mul <N x i32> %2, %4
	///
	/// There are four mul shrinking modes:
	/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
	/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
	/// generate pmullw+sext32 for it (MULS8 mode).
	/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
	/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
	/// generate pmullw+zext32 for it (MULU8 mode).
	/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
	/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
	/// generate pmullw+pmulhw for it (MULS16 mode).
	/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
	/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
	/// generate pmullw+pmulhuw for it (MULU16 mode).
	///
	/// \returns the replacement value, or an empty SDValue when the transform is
	/// not legal, not profitable, or the operands are too wide.
	static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  // Check for legality
	  // pmullw/pmulhw are not supported by SSE.
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  // Check for profitability
	  // pmulld is supported since SSE41. It is better to use pmulld
	  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
	  // the expansion.
	  bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
	  if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
	    return SDValue();

	  ShrinkMode Mode;
	  if (!canReduceVMulWidth(N, DAG, Mode))
	    return SDValue();

	  SDLoc DL(N);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getOperand(0).getValueType();
	  unsigned NumElts = VT.getVectorNumElements();
	  // The repacking below splits the result into two halves.
	  if ((NumElts % 2) != 0)
	    return SDValue();

	  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);

	  // Shrink the operands of mul.
	  SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
	  SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);

	  // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
	  // lower part is needed.
	  SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
	  if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
	    return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
	                                                   : ISD::SIGN_EXTEND,
	                       DL, VT, MulLo);

	  EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
	  // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
	  // the higher part is also needed.
	  SDValue MulHi =
	      DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
	                  ReducedVT, NewN0, NewN1);

	  // Repack the lower part and higher part result of mul into a wider
	  // result.
	  // Generate shuffle functioning as punpcklwd.
	  SmallVector<int, 16> ShuffleMask(NumElts);
	  for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
	    ShuffleMask[2 * i] = i;
	    ShuffleMask[2 * i + 1] = i + NumElts;
	  }
	  SDValue ResLo =
	      DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
	  ResLo = DAG.getBitcast(ResVT, ResLo);
	  // Generate shuffle functioning as punpckhwd.
	  for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
	    ShuffleMask[2 * i] = i + NumElts / 2;
	    ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
	  }
	  SDValue ResHi =
	      DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
	  ResHi = DAG.getBitcast(ResVT, ResHi);
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
	}

	/// Expand a multiply of N's first operand by the constant \p MulAmt into a
	/// short sequence of X86ISD::MUL_IMM / SHL / ADD / SUB nodes for a fixed set
	/// of special multipliers, or into two shifts and an add when \p MulAmt has
	/// exactly two bits set and the lower one is bit 1, 2 or 3.
	///
	/// \returns the expanded value of type \p VT, or an empty SDValue if
	/// \p MulAmt is not one of the handled constants.
	static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
	                                 EVT VT, const SDLoc &DL) {

	  // Builds ((x * Mult) << Shift) +/- x.
	  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
	    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
	                                 DAG.getConstant(Mult, DL, VT));
	    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
	                         DAG.getConstant(Shift, DL, MVT::i8));
	    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
	                         N->getOperand(0));
	    return Result;
	  };

	  // Builds ((x * Mul1) * Mul2) +/- x.
	  auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
	    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
	                                 DAG.getConstant(Mul1, DL, VT));
	    Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
	                         DAG.getConstant(Mul2, DL, VT));
	    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
	                         N->getOperand(0));
	    return Result;
	  };

	  switch (MulAmt) {
	  default:
	    break;
	  case 11:
	    // mul x, 11 => add ((shl (mul x, 5), 1), x)
	    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
	  case 21:
	    // mul x, 21 => add ((shl (mul x, 5), 2), x)
	    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
	  case 41:
	    // mul x, 41 => add ((shl (mul x, 5), 3), x)
	    return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
	  case 22:
	    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
	    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
	                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
	  case 19:
	    // mul x, 19 => add ((shl (mul x, 9), 1), x)
	    return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
	  case 37:
	    // mul x, 37 => add ((shl (mul x, 9), 2), x)
	    return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
	  case 73:
	    // mul x, 73 => add ((shl (mul x, 9), 3), x)
	    return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
	  case 13:
	    // mul x, 13 => add ((shl (mul x, 3), 2), x)
	    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
	  case 23:
	    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
	    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
	  case 26:
	    // mul x, 26 => add ((mul (mul x, 5), 5), x)
	    return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
	  case 28:
	    // mul x, 28 => add ((mul (mul x, 9), 3), x)
	    return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
	  case 29:
	    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
	    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
	                       combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
	  }

	  // Another trick. If this is a power 2 + 2/4/8, we can use a shift followed
	  // by a single LEA.
	  // First check if this a sum of two power of 2s because that's easy. Then
	  // count how many zeros are up to the first bit.
	  // TODO: We can do this even without LEA at a cost of two shifts and an add.
	  // (MulAmt & (MulAmt - 1)) clears the lowest set bit of MulAmt.
	  if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
	    unsigned ScaleShift = countTrailingZeros(MulAmt);
	    if (ScaleShift >= 1 && ScaleShift < 4) {
	      unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
	      SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	                                   DAG.getConstant(ShiftAmt, DL, MVT::i8));
	      SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	                                   DAG.getConstant(ScaleShift, DL, MVT::i8));
	      return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
	    }
	  }

	  return SDValue();
	}

	/// If the upper 17 bits of each element are zero then we can use PMADDWD,
	/// which is always at least as quick as PMULLD, except on KNL.
	///
	/// Applies to vXi32 multiplies on SSE2-capable subtargets where PMADDWD is
	/// not flagged as slow. \returns the replacement value, or an empty SDValue
	/// when the transform does not apply.
	static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  if (Subtarget.isPMADDWDSlow())
	    return SDValue();

	  EVT VT = N->getValueType(0);

	  // Only support vXi32 vectors.
	  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
	    return SDValue();

	  // Make sure the type is legal or will be widened to a legal type.
	  if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  // The same bits viewed as twice as many i16 elements.
	  MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());

	  // Without BWI, we would need to split v32i16.
	  if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
	    return SDValue();

	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);

	  // If we are zero extending two steps without SSE4.1, its better to reduce
	  // the vmul width instead.
	  if (!Subtarget.hasSSE41() &&
	      (N0.getOpcode() == ISD::ZERO_EXTEND &&
	       N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
	      (N1.getOpcode() == ISD::ZERO_EXTEND &&
	       N1.getOperand(0).getScalarValueSizeInBits() <= 8))
	    return SDValue();

	  // Both operands must have their top 17 bits known to be zero.
	  APInt Mask17 = APInt::getHighBitsSet(32, 17);
	  if (!DAG.MaskedValueIsZero(N1, Mask17) ||
	      !DAG.MaskedValueIsZero(N0, Mask17))
	    return SDValue();

	  // Use SplitOpsAndApply to handle AVX splitting.
	  auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	                           ArrayRef<SDValue> Ops) {
	    MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
	    return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
	  };
	  return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
	                          { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
	                          PMADDWDBuilder);
	}

	/// Try to lower a vXi64 multiply to a single PMULDQ or PMULUDQ.
	///
	/// PMULDQ is used (SSE4.1 only) when both operands have more than 32 sign
	/// bits; PMULUDQ is used when the upper 32 bits of both operands are known
	/// zero. Only vectors with a power-of-two element count of at least 2 are
	/// considered.
	///
	/// \returns the replacement value, or an empty SDValue when neither form
	/// applies.
	static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  EVT VT = N->getValueType(0);

	  // Only support vXi64 vectors.
	  if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
	      VT.getVectorNumElements() < 2 ||
	      !isPowerOf2_32(VT.getVectorNumElements()))
	    return SDValue();

	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);

	  // MULDQ returns the 64-bit result of the signed multiplication of the lower
	  // 32-bits. We can lower with this if the sign bits stretch that far.
	  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
	      DAG.ComputeNumSignBits(N1) > 32) {
	    auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	                            ArrayRef<SDValue> Ops) {
	      return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
	    };
	    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
	                            PMULDQBuilder, /*CheckBWI*/false);
	  }

	  // If the upper bits are zero we can use a single pmuludq.
	  APInt Mask = APInt::getHighBitsSet(64, 32);
	  if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
	    auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	                             ArrayRef<SDValue> Ops) {
	      return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
	    };
	    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
	                            PMULUDQBuilder, /*CheckBWI*/false);
	  }

	  return SDValue();
	}

	/// Optimize a single multiply with constant into two operations in order to
	/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
	///
	/// The multiply is first offered to combineMulToPMADDWD and
	/// combineMulToPMULDQ; a vector multiply seen before legalization is then
	/// handed to reduceVMULWidth. The constant decomposition that follows only
	/// runs for i32/i64 multiplies by a non-power-of-two constant, once
	/// legalization is over, when MulConstantOptimization is set and the function
	/// is not marked minsize.
	///
	/// \returns the replacement value, or an empty SDValue if nothing was done.
	static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	EVT VT = N->getValueType(0);

	// Try the PMADDWD / PMULDQ lowerings first.
	if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
	return V;

	if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
	return V;

	// Before legalization the only remaining vector option is narrowing the
	// multiply.
	if (DCI.isBeforeLegalize() && VT.isVector())
	return reduceVMULWidth(N, DAG, Subtarget);

	// Everything below is the multiply-by-constant decomposition.
	if (!MulConstantOptimization)
	return SDValue();
	// An imul is usually smaller than the alternative sequence.
	if (DAG.getMachineFunction().getFunction().hasMinSize())
	return SDValue();

	// Do not decompose before legalization or when invoked by the legalizer.
	if (DCI.isBeforeLegalize() \|\| DCI.isCalledByLegalizer())
	return SDValue();

	// Only i32 and i64 multiplies are decomposed.
	if (VT != MVT::i64 && VT != MVT::i32)
	return SDValue();

	// The right-hand operand must be a constant that is not a power of two.
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
	if (!C)
	return SDValue();
	if (isPowerOf2_64(C->getZExtValue()))
	return SDValue();

	// Work on the magnitude of the constant; negative amounts are handled by
	// negating (or reversing the final subtract of) the positive sequence.
	int64_t SignMulAmt = C->getSExtValue();
	assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
	uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;

	SDLoc DL(N);
	// |amount| of 3, 5 or 9 maps to a single MUL_IMM node.
	if (AbsMulAmt == 3 \|\| AbsMulAmt == 5 \|\| AbsMulAmt == 9) {
	SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
	DAG.getConstant(AbsMulAmt, DL, VT));
	if (SignMulAmt < 0)
	NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
	NewMul);

	return NewMul;
	}

	// Try to split |amount| into MulAmt1 * MulAmt2 with MulAmt1 being 9, 5 or 3
	// (tested in that order). Both stay 0 if none of them divides the amount.
	uint64_t MulAmt1 = 0;
	uint64_t MulAmt2 = 0;
	if ((AbsMulAmt % 9) == 0) {
	MulAmt1 = 9;
	MulAmt2 = AbsMulAmt / 9;
	} else if ((AbsMulAmt % 5) == 0) {
	MulAmt1 = 5;
	MulAmt2 = AbsMulAmt / 5;
	} else if ((AbsMulAmt % 3) == 0) {
	MulAmt1 = 3;
	MulAmt2 = AbsMulAmt / 3;
	}

	SDValue NewMul;
	// For negative multiply amounts, only allow MulAmt2 to be a power of 2.
	if (MulAmt2 &&
	(isPowerOf2_64(MulAmt2) \|\|
	(SignMulAmt >= 0 && (MulAmt2 == 3 \|\| MulAmt2 == 5 \|\| MulAmt2 == 9)))) {

	if (isPowerOf2_64(MulAmt2) &&
	!(SignMulAmt >= 0 && N->hasOneUse() &&
	N->use_begin()->getOpcode() == ISD::ADD))
	// If second multiplier is pow2, issue it first. We want the multiply by
	// 3, 5, or 9 to be folded into the addressing mode unless the lone use
	// is an add. Only do this for positive multiply amounts since the
	// negate would prevent it from being used as an address mode anyway.
	std::swap(MulAmt1, MulAmt2);

	// First factor: a shift for a power of two, MUL_IMM otherwise.
	if (isPowerOf2_64(MulAmt1))
	NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
	else
	NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
	DAG.getConstant(MulAmt1, DL, VT));

	// Second factor, applied to the result of the first.
	if (isPowerOf2_64(MulAmt2))
	NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
	DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
	else
	NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
	DAG.getConstant(MulAmt2, DL, VT));

	// Negate the result.
	if (SignMulAmt < 0)
	NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
	NewMul);
	} else if (!Subtarget.slowLEA())
	// No usable two-factor split: let combineMulSpecial try, unless the
	// subtarget reports slow LEA.
	NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);

	// Last resort: amounts within 1 or 2 of a power of two become a shift plus
	// adds/subs of the original operand.
	if (!NewMul) {
	assert(C->getZExtValue() != 0 &&
	C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
	"Both cases that could cause potential overflows should have "
	"already been handled.");
	if (isPowerOf2_64(AbsMulAmt - 1)) {
	// (mul x, 2^N + 1) => (add (shl x, N), x)
	NewMul = DAG.getNode(
	ISD::ADD, DL, VT, N->getOperand(0),
	DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
	MVT::i8)));
	// To negate, subtract the number from zero
	if (SignMulAmt < 0)
	NewMul = DAG.getNode(ISD::SUB, DL, VT,
	DAG.getConstant(0, DL, VT), NewMul);
	} else if (isPowerOf2_64(AbsMulAmt + 1)) {
	// (mul x, 2^N - 1) => (sub (shl x, N), x)
	NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	DAG.getConstant(Log2_64(AbsMulAmt + 1),
	DL, MVT::i8));
	// To negate, reverse the operands of the subtract.
	if (SignMulAmt < 0)
	NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
	else
	NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
	} else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
	// (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
	NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	DAG.getConstant(Log2_64(AbsMulAmt - 2),
	DL, MVT::i8));
	NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
	NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
	} else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
	// (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
	NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	DAG.getConstant(Log2_64(AbsMulAmt + 2),
	DL, MVT::i8));
	NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
	NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
	}
	}

	// Still empty here if no decomposition matched.
	return NewMul;
	}

	// Turn (srl/sra (mul (ext X), (ext Y)), 16), where X and Y are vXi16 and both
	// extends are of the same kind, into an extension of MULHU/MULHS X, Y.
	// Sign extends select MULHS and zero extends select MULHU; the result is
	// sign extended for SRA and zero extended for SRL.
	// TODO: This is X86 specific because we want to be able to handle wide types
	// before type legalization. But we can only do it if the vector will be
	// legalized via widening/splitting. Type legalization can't handle promotion
	// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
	// combiner.
	static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  const unsigned ShiftOpc = N->getOpcode();
	  assert((ShiftOpc == ISD::SRL || ShiftOpc == ISD::SRA) &&
	         "SRL or SRA node is required here!");

	  // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
	  // the multiply.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  // The shifted value has to be a multiply with no other users.
	  SDValue Mul = N->getOperand(0);
	  if (!(Mul.getOpcode() == ISD::MUL && Mul.hasOneUse()))
	    return SDValue();

	  // The shift result must be a vector with elements of at least 32 bits.
	  EVT VT = N->getValueType(0);
	  if (!VT.isVector())
	    return SDValue();
	  if (VT.getVectorElementType().getSizeInBits() < 32)
	    return SDValue();

	  // The shift amount must be a constant splat of 16.
	  APInt SplatAmt;
	  if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatAmt))
	    return SDValue();
	  if (SplatAmt != 16)
	    return SDValue();

	  // Both multiplicands must be the same kind of sign/zero extension.
	  SDValue MulLHS = Mul.getOperand(0);
	  SDValue MulRHS = Mul.getOperand(1);
	  const unsigned InExtOpc = MulLHS.getOpcode();
	  const bool IsSExt = InExtOpc == ISD::SIGN_EXTEND;
	  if (!IsSExt && InExtOpc != ISD::ZERO_EXTEND)
	    return SDValue();
	  if (MulRHS.getOpcode() != InExtOpc)
	    return SDValue();

	  // Look at the values being extended.
	  SDValue NarrowLHS = MulLHS.getOperand(0);
	  SDValue NarrowRHS = MulRHS.getOperand(0);

	  // They must share one vXi16 type.
	  EVT NarrowVT = NarrowLHS.getValueType();
	  if (NarrowVT.getVectorElementType() != MVT::i16)
	    return SDValue();
	  if (NarrowRHS.getValueType() != NarrowVT)
	    return SDValue();

	  SDLoc DL(N);
	  SDValue MulHigh = DAG.getNode(IsSExt ? ISD::MULHS : ISD::MULHU, DL, NarrowVT,
	                                NarrowLHS, NarrowRHS);

	  // Extend back to the shift's type according to the kind of shift.
	  const unsigned OutExtOpc =
	      ShiftOpc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	  return DAG.getNode(OutExtOpc, DL, VT, MulHigh);
	}

	/// Combines for ISD::SHL:
	///  * (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) for scalar
	///    integers, also looking through an extension of the setcc_c;
	///  * (shl V, splat 1) -> (add V, V) for vectors.
	/// \returns the replacement value, or an empty SDValue.
	static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
	  SDValue Src = N->getOperand(0);
	  SDValue Amt = N->getOperand(1);
	  auto *AmtC = dyn_cast<ConstantSDNode>(Amt);
	  EVT VT = Src.getValueType();

	  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
	  // since the result of setcc_c is all zero's or all ones.
	  const bool IsScalarInt = VT.isInteger() && !VT.isVector();
	  if (IsScalarInt && AmtC && Src.getOpcode() == ISD::AND &&
	      Src.getOperand(1).getOpcode() == ISD::Constant) {
	    SDValue AndLHS = Src.getOperand(0);
	    APInt ShiftedMask = Src.getConstantOperandAPInt(1);
	    ShiftedMask <<= AmtC->getAPIntValue();

	    auto IsSetCCCarry = [](SDValue V) {
	      return V.getOpcode() == X86ISD::SETCC_CARRY;
	    };

	    // Bit-widening nodes wrapping setcc_c are acceptable as long as the
	    // shifted mask keeps the semantics. That is not the case when the
	    // setcc_c was zero extended and C1 << C2 exceeds the bitwidth of the
	    // underlying setcc_c. Example:
	    //   zext(setcc_c)                 -> i32 0x0000FFFF
	    //   c1                            -> i32 0x0000FFFF
	    //   c2                            -> i32 0x00000001
	    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
	    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
	    bool SafeToFold = false;
	    if (IsSetCCCarry(AndLHS)) {
	      SafeToFold = true;
	    } else {
	      switch (AndLHS.getOpcode()) {
	      case ISD::SIGN_EXTEND:
	        SafeToFold = IsSetCCCarry(AndLHS.getOperand(0));
	        break;
	      case ISD::ZERO_EXTEND:
	      case ISD::ANY_EXTEND:
	        if (IsSetCCCarry(AndLHS.getOperand(0)))
	          SafeToFold =
	              ShiftedMask.isIntN(AndLHS.getOperand(0).getValueSizeInBits());
	        break;
	      default:
	        break;
	      }
	    }

	    if (SafeToFold && ShiftedMask != 0) {
	      SDLoc DL(N);
	      return DAG.getNode(ISD::AND, DL, VT, AndLHS,
	                         DAG.getConstant(ShiftedMask, DL, VT));
	    }
	  }

	  // Hardware support for vector shifts is sparse which makes us scalarize the
	  // vector operations in many cases. Also, on sandybridge ADD is faster than
	  // shl.
	  // (shl V, 1) -> add V,V
	  auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt);
	  if (!AmtBV)
	    return SDValue();
	  auto *SplatC = AmtBV->getConstantSplatNode();
	  if (!SplatC)
	    return SDValue();

	  assert(Src.getValueType().isVector() && "Invalid vector shift type");
	  // Every element is shifted by the same constant; a shift by one is better
	  // expressed as adding the value to itself.
	  if (!SplatC->isOne())
	    return SDValue();
	  return DAG.getNode(ISD::ADD, SDLoc(N), VT, Src, Src);
	}

	/// Combines for ISD::SRA.
	///
	/// Tries combineShiftToPMULH first. Otherwise, for a scalar
	/// (sra (shl a, ShlConst), SarConst) whose shl has one use and where ShlConst
	/// equals the type width minus 8, 16 or 32, the pair is replaced by a
	/// SIGN_EXTEND_INREG of `a`, followed by whatever shift is left over.
	/// \returns the replacement value, or an empty SDValue.
	static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
	                                           const X86Subtarget &Subtarget) {
	  SDValue Shl = N->getOperand(0);
	  SDValue SarAmt = N->getOperand(1);
	  EVT VT = Shl.getValueType();
	  unsigned Size = VT.getSizeInBits();

	  if (SDValue PMulH = combineShiftToPMULH(N, DAG, Subtarget))
	    return PMulH;

	  // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
	  // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
	  // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
	  // depending on sign of (SarConst - [56,48,32,24,16])
	  //
	  // On X86 a sext is a MOV with the same code size as the SHIFTs above (only
	  // a SHIFT by 1 is smaller), but a MOV has two advantages over a SHIFT:
	  // 1. it can write to a register different from its source
	  // 2. it accepts memory operands

	  // Scalar only; both shift amounts must be constants and the shl must have
	  // no other user.
	  if (VT.isVector())
	    return SDValue();
	  if (SarAmt.getOpcode() != ISD::Constant || Shl.getOpcode() != ISD::SHL)
	    return SDValue();
	  if (!Shl.hasOneUse() || Shl.getOperand(1).getOpcode() != ISD::Constant)
	    return SDValue();

	  SDValue Input = Shl.getOperand(0);
	  SDValue ShlAmt = Shl.getOperand(1);
	  APInt ShlConst = cast<ConstantSDNode>(ShlAmt)->getAPIntValue();
	  APInt SarConst = cast<ConstantSDNode>(SarAmt)->getAPIntValue();
	  EVT AmtVT = SarAmt.getValueType();

	  if (SarConst.isNegative())
	    return SDValue();

	  for (MVT NarrowVT : { MVT::i8, MVT::i16, MVT::i32 }) {
	    unsigned NarrowBits = NarrowVT.getSizeInBits();
	    // Act only on a narrow type with a matching sext/zext whose width is
	    // exactly what the shl leaves in place.
	    if (NarrowBits < Size && ShlConst == Size - NarrowBits) {
	      SDLoc DL(N);
	      SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Input,
	                                 DAG.getValueType(NarrowVT));
	      // Shift still owed after the sign extension: positive means shift
	      // right, negative means shift left.
	      APInt Residual = SarConst - (Size - NarrowBits);
	      if (Residual == 0)
	        return SExt;
	      if (Residual.isNegative())
	        return DAG.getNode(ISD::SHL, DL, VT, SExt,
	                           DAG.getConstant(-Residual, DL, AmtVT));
	      return DAG.getNode(ISD::SRA, DL, VT, SExt,
	                         DAG.getConstant(Residual, DL, AmtVT));
	    }
	  }
	  return SDValue();
	}

	/// Combines for ISD::SRL.
	///
	/// Tries combineShiftToPMULH first. Otherwise, and only after the DAG has
	/// been legalized, rewrites
	///   srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
	/// when doing so brings the mask constant down to 8 or 32 bits.
	/// \returns the replacement value, or an empty SDValue.
	static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
	                                        TargetLowering::DAGCombinerInfo &DCI,
	                                        const X86Subtarget &Subtarget) {
	  SDValue Masked = N->getOperand(0);
	  SDValue Amount = N->getOperand(1);
	  EVT VT = Masked.getValueType();

	  if (SDValue PMulH = combineShiftToPMULH(N, DAG, Subtarget))
	    return PMulH;

	  // This rewrite can interfere with other combines, so keep it for the last
	  // DAG combine.
	  if (!DCI.isAfterLegalizeDAG())
	    return SDValue();

	  // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
	  // TODO: This is a generic DAG combine that became an x86-only combine to
	  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
	  // and-not ('andn').
	  if (!(Masked.getOpcode() == ISD::AND && Masked.hasOneUse()))
	    return SDValue();

	  auto *ShiftC = dyn_cast<ConstantSDNode>(Amount);
	  auto *AndC = dyn_cast<ConstantSDNode>(Masked.getOperand(1));
	  if (!(ShiftC && AndC))
	    return SDValue();

	  // Shrinking the constant mask below 8 or 32 bits should reduce code size,
	  // and may enable secondary transforms from improved known-bits analysis or
	  // instruction selection.
	  APInt OldMask = AndC->getAPIntValue();

	  // Leave masks alone that can be matched by a zero extend.
	  if (OldMask.isMask()) {
	    unsigned LowOnes = OldMask.countTrailingOnes();
	    if (LowOnes >= 8 && isPowerOf2_32(LowOnes))
	      return SDValue();
	  }

	  APInt NewMask = OldMask.lshr(ShiftC->getAPIntValue());
	  unsigned OldBits = OldMask.getMinSignedBits();
	  unsigned NewBits = NewMask.getMinSignedBits();
	  bool NowFits8 = OldBits > 8 && NewBits <= 8;
	  bool NowFits32 = OldBits > 32 && NewBits <= 32;
	  if (!NowFits8 && !NowFits32)
	    return SDValue();

	  // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
	  SDLoc DL(N);
	  SDValue NewMaskC = DAG.getConstant(NewMask, DL, VT);
	  SDValue Shifted =
	      DAG.getNode(ISD::SRL, DL, VT, Masked.getOperand(0), Amount);
	  return DAG.getNode(ISD::AND, DL, VT, Shifted, NewMaskC);
	}

	/// Try to move a shuffle from the inputs of a PACKSS/PACKUS node to its
	/// output. Two patterns are handled:
	///  * a 128-bit PACK of the low and high halves of one 256-bit unary shuffle;
	///  * a 256-bit PACK of two shuffles of the same pair of operands.
	/// \returns the replacement value, or an empty SDValue if neither matches.
	static SDValue combineVectorPackWithShuffle(SDNode *N, SelectionDAG &DAG) {
	unsigned Opcode = N->getOpcode();
	assert((X86ISD::PACKSS == Opcode \|\| X86ISD::PACKUS == Opcode) &&
	"Unexpected pack opcode");

	EVT VT = N->getValueType(0);
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	unsigned NumDstElts = VT.getVectorNumElements();

	// Attempt to fold PACK(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
	// to SHUFFLE(PACK(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
	// truncation trees that help us avoid lane crossing shuffles.
	// TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
	if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	N0.getConstantOperandAPInt(1) == 0 &&
	N1.getConstantOperandAPInt(1) == (NumDstElts / 2) &&
	N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
	N0.getOperand(0).getValueType().is256BitVector()) {
	// TODO - support target/faux shuffles.
	SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
	if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
	// To keep the PACK LHS/RHS coherency, we must be able to scale the unary
	// shuffle to a vXi64 width - we can probably relax this in the future.
	SmallVector<int, 4> ShuffleMask;
	if (SVN->getOperand(1).isUndef() &&
	scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
	SDLoc DL(N);
	SDValue Lo, Hi;
	// Pack the two halves of the shuffle's input, then apply the scaled
	// 4-element mask to the packed result viewed as v4i32.
	std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
	Lo = DAG.getBitcast(N0.getValueType(), Lo);
	Hi = DAG.getBitcast(N1.getValueType(), Hi);
	SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
	Res = DAG.getBitcast(MVT::v4i32, Res);
	Res = DAG.getVectorShuffle(MVT::v4i32, DL, Res, Res, ShuffleMask);
	return DAG.getBitcast(VT, Res);
	}
	}
	}

	// Attempt to fold PACK(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(PACK(X,Y)).
	// TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles.
	if (VT.is256BitVector()) {
	if (auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(N0)) {
	if (auto *SVN1 = dyn_cast<ShuffleVectorSDNode>(N1)) {
	SmallVector<int, 2> ShuffleMask0, ShuffleMask1;
	// Both masks must be expressible as 2-element masks.
	if (scaleShuffleElements(SVN0->getMask(), 2, ShuffleMask0) &&
	scaleShuffleElements(SVN1->getMask(), 2, ShuffleMask1)) {
	SDValue Op00 = SVN0->getOperand(0);
	SDValue Op01 = SVN0->getOperand(1);
	SDValue Op10 = SVN1->getOperand(0);
	SDValue Op11 = SVN1->getOperand(1);
	// If the second shuffle uses the same operands in the opposite order,
	// commute it so that both shuffles agree.
	if ((Op00 == Op11) && (Op01 == Op10)) {
	std::swap(Op10, Op11);
	ShuffleVectorSDNode::commuteMask(ShuffleMask1);
	}
	if ((Op00 == Op10) && (Op01 == Op11)) {
	// Concatenate the two 2-element masks into the 4-element mask that is
	// applied to the packed result viewed as v4i64.
	SmallVector<int, 4> ShuffleMask;
	ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end());
	ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end());
	SDLoc DL(N);
	SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01);
	Res = DAG.getBitcast(MVT::v4i64, Res);
	Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, ShuffleMask);
	return DAG.getBitcast(VT, Res);
	}
	}
	}
	}
	}

	return SDValue();
	}

	/// Combines for X86ISD::PACKSS / X86ISD::PACKUS. In order, this tries:
	///  1. constant folding when both inputs are constants (or undef) used only
	///     by this node;
	///  2. combineVectorPackWithShuffle;
	///  3. with AVX512, merging a v16i8 PACK of a truncated v8i32 (second operand
	///     undef) into one wider truncate;
	///  4. combineX86ShufflesRecursively.
	/// \returns the replacement value, or an empty SDValue if nothing applied.
	static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	unsigned Opcode = N->getOpcode();
	assert((X86ISD::PACKSS == Opcode \|\| X86ISD::PACKUS == Opcode) &&
	"Unexpected pack opcode");

	EVT VT = N->getValueType(0);
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	unsigned NumDstElts = VT.getVectorNumElements();
	unsigned DstBitsPerElt = VT.getScalarSizeInBits();
	// Each source element is twice as wide as a destination element.
	unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
	assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
	N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
	"Unexpected PACKSS/PACKUS input type");

	bool IsSigned = (X86ISD::PACKSS == Opcode);

	// Constant Folding.
	APInt UndefElts0, UndefElts1;
	SmallVector<APInt, 32> EltBits0, EltBits1;
	if ((N0.isUndef() \|\| N->isOnlyUserOf(N0.getNode())) &&
	(N1.isUndef() \|\| N->isOnlyUserOf(N1.getNode())) &&
	getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
	getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
	unsigned NumLanes = VT.getSizeInBits() / 128;
	unsigned NumSrcElts = NumDstElts / 2;
	unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
	unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;

	APInt Undefs(NumDstElts, 0);
	SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
	// Within each 128-bit lane, the first half of the destination elements
	// comes from N0 and the second half from N1.
	for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
	for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
	unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
	auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
	auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);

	// An undef source element yields an undef destination element.
	if (UndefElts[SrcIdx]) {
	Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
	continue;
	}

	// Val aliases the extracted source element and is overwritten in place
	// with the saturated, truncated value.
	APInt &Val = EltBits[SrcIdx];
	if (IsSigned) {
	// PACKSS: Truncate signed value with signed saturation.
	// Source values less than dst minint are saturated to minint.
	// Source values greater than dst maxint are saturated to maxint.
	if (Val.isSignedIntN(DstBitsPerElt))
	Val = Val.trunc(DstBitsPerElt);
	else if (Val.isNegative())
	Val = APInt::getSignedMinValue(DstBitsPerElt);
	else
	Val = APInt::getSignedMaxValue(DstBitsPerElt);
	} else {
	// PACKUS: Truncate signed value with unsigned saturation.
	// Source values less than zero are saturated to zero.
	// Source values greater than dst maxuint are saturated to maxuint.
	if (Val.isIntN(DstBitsPerElt))
	Val = Val.trunc(DstBitsPerElt);
	else if (Val.isNegative())
	Val = APInt::getNullValue(DstBitsPerElt);
	else
	Val = APInt::getAllOnesValue(DstBitsPerElt);
	}
	Bits[Lane * NumDstEltsPerLane + Elt] = Val;
	}
	}

	return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
	}

	// Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
	if (SDValue V = combineVectorPackWithShuffle(N, DAG))
	return V;

	// Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
	// truncate to create a larger truncate.
	if (Subtarget.hasAVX512() &&
	N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
	N0.getOperand(0).getValueType() == MVT::v8i32) {
	// Only valid when the pack would not saturate: more than 8 sign bits for
	// the signed form, upper 8 bits known zero for the unsigned form.
	if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) \|\|
	(!IsSigned &&
	DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
	if (Subtarget.hasVLX())
	return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));

	// Widen input to v16i32 so we can truncate that.
	SDLoc dl(N);
	SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
	N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
	return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
	}
	}

	// Attempt to combine as shuffle.
	SDValue Op(N, 0);
	if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
	return Res;

	return SDValue();
	}

	/// Combines for X86ISD::VSHL / VSRA / VSRL:
	///  * a shift of an all-zeros vector folds to zero;
	///  * a constant shift amount turns the node into the matching
	///    shift-by-constant node;
	///  * otherwise SimplifyDemandedVectorElts is run on the node.
	/// \returns the replacement value, or an empty SDValue.
	static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     const X86Subtarget &Subtarget) {
	  const unsigned Opc = N->getOpcode();
	  assert((X86ISD::VSHL == Opc || X86ISD::VSRA == Opc ||
	          X86ISD::VSRL == Opc) &&
	         "Unexpected shift opcode");
	  EVT VT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);
	  SDValue Amt = N->getOperand(1);

	  // Shift zero -> zero.
	  if (ISD::isBuildVectorAllZeros(Src.getNode()))
	    return DAG.getConstant(0, SDLoc(N), VT);

	  // Detect constant shift amounts. The amount is read as 64-bit elements and
	  // only the first one is used.
	  APInt AmtUndefs;
	  SmallVector<APInt, 32> AmtBits;
	  const bool AmtIsConstant =
	      getTargetConstantBitsFromNode(Amt, 64, AmtUndefs, AmtBits, true, false);
	  if (AmtIsConstant) {
	    unsigned ImmOpc = getTargetVShiftUniformOpcode(Opc, false);
	    uint64_t ShiftAmt = AmtBits[0].getZExtValue();
	    return getTargetVShiftByConstNode(ImmOpc, SDLoc(N), VT.getSimpleVT(), Src,
	                                      ShiftAmt, DAG);
	  }

	  // Otherwise simplify with every result element demanded.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  APInt KnownUndef, KnownZero;
	  APInt AllElts = APInt::getAllOnesValue(VT.getVectorNumElements());
	  const bool Simplified = TLI.SimplifyDemandedVectorElts(
	      SDValue(N, 0), AllElts, KnownUndef, KnownZero, DCI);
	  return Simplified ? SDValue(N, 0) : SDValue();
	}

	/// Combines for the immediate vector shifts X86ISD::VSHLI / VSRAI / VSRLI:
	/// out-of-range amounts, shift by zero, shifts of all-zeros / all-ones
	/// vectors, merging two identical shifts, whole-byte logical shifts as
	/// shuffles, constant folding, and finally SimplifyDemandedBits.
	/// \returns the replacement value, or an empty SDValue.
	static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     const X86Subtarget &Subtarget) {
	  const unsigned Opcode = N->getOpcode();
	  assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
	          X86ISD::VSRLI == Opcode) &&
	         "Unexpected shift opcode");
	  const bool IsLogical = Opcode == X86ISD::VSHLI || Opcode == X86ISD::VSRLI;
	  EVT VT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);
	  const unsigned EltSizeInBits = VT.getScalarSizeInBits();
	  assert(VT == Src.getValueType() && (EltSizeInBits % 8) == 0 &&
	         "Unexpected value type");
	  assert(N->getOperand(1).getValueType() == MVT::i8 &&
	         "Unexpected shift amount type");

	  // Out of range logical bit shifts are guaranteed to be zero.
	  // Out of range arithmetic bit shifts splat the sign bit.
	  unsigned Amt = N->getConstantOperandVal(1);
	  if (Amt >= EltSizeInBits) {
	    if (IsLogical)
	      return DAG.getConstant(0, SDLoc(N), VT);
	    Amt = EltSizeInBits - 1;
	  }

	  // (shift X, 0) -> X
	  if (Amt == 0)
	    return Src;

	  // (shift 0, C) -> 0
	  // Src is all zeros or undef. The bits shifted into the result are
	  // guaranteed to be zeros, not undef.
	  if (ISD::isBuildVectorAllZeros(Src.getNode()))
	    return DAG.getConstant(0, SDLoc(N), VT);

	  // (VSRAI -1, C) -> -1
	  // Src is all ones or undef. The bits shifted into the result are
	  // guaranteed to be ones, not undef.
	  if (!IsLogical && ISD::isBuildVectorAllOnes(Src.getNode()))
	    return DAG.getConstant(-1, SDLoc(N), VT);

	  // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
	  if (Src.getOpcode() == Opcode) {
	    const unsigned InnerAmt = Src.getConstantOperandVal(1);
	    unsigned TotalAmt = Amt + InnerAmt;
	    if (TotalAmt >= EltSizeInBits) {
	      // Same out-of-range rules as above: zero for logical shifts, sign
	      // splat for arithmetic ones.
	      if (IsLogical)
	        return DAG.getConstant(0, SDLoc(N), VT);
	      TotalAmt = EltSizeInBits - 1;
	    }
	    SDValue NewAmt = DAG.getTargetConstant(TotalAmt, SDLoc(N), MVT::i8);
	    return DAG.getNode(Opcode, SDLoc(N), VT, Src.getOperand(0), NewAmt);
	  }

	  // We can decode 'whole byte' logical bit shifts as shuffles.
	  if (IsLogical && (Amt % 8) == 0) {
	    SDValue Op(N, 0);
	    if (SDValue Shuf = combineX86ShufflesRecursively(Op, DAG, Subtarget))
	      return Shuf;
	  }

	  // Constant Folding.
	  APInt UndefElts;
	  SmallVector<APInt, 32> Folded;
	  if (N->isOnlyUserOf(Src.getNode()) &&
	      getTargetConstantBitsFromNode(Src, EltSizeInBits, UndefElts, Folded)) {
	    assert(Folded.size() == VT.getVectorNumElements() &&
	           "Unexpected shift value type");
	    // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
	    // created an undef input due to no input bits being demanded, but user
	    // still expects 0 in other bits.
	    for (unsigned Idx = 0, NumElts = Folded.size(); Idx != NumElts; ++Idx) {
	      APInt &Elt = Folded[Idx];
	      if (UndefElts[Idx]) {
	        Elt = 0;
	        continue;
	      }
	      switch (Opcode) {
	      case X86ISD::VSHLI:
	        Elt <<= Amt;
	        break;
	      case X86ISD::VSRAI:
	        Elt.ashrInPlace(Amt);
	        break;
	      default:
	        Elt.lshrInPlace(Amt);
	        break;
	      }
	    }
	    // Reset undef elements since they were zeroed above.
	    UndefElts = 0;
	    return getConstVector(Folded, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
	  }

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const bool Simplified = TLI.SimplifyDemandedBits(
	      SDValue(N, 0), APInt::getAllOnesValue(EltSizeInBits), DCI);
	  return Simplified ? SDValue(N, 0) : SDValue();
	}

	/// Combines for X86ISD::PINSRB, X86ISD::PINSRW and ISD::INSERT_VECTOR_ELT.
	/// The PINSR forms are first run through SimplifyDemandedBits; after DAG
	/// legalization every form with a simple type is offered to the recursive
	/// shuffle combiner.
	/// \returns the replacement value, or an empty SDValue.
	static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
	                                   TargetLowering::DAGCombinerInfo &DCI,
	                                   const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  const unsigned Opc = N->getOpcode();
	  assert(((Opc == X86ISD::PINSRB && VT == MVT::v16i8) ||
	          (Opc == X86ISD::PINSRW && VT == MVT::v8i16) ||
	          Opc == ISD::INSERT_VECTOR_ELT) &&
	         "Unexpected vector insertion");

	  const bool IsPInsr = Opc == X86ISD::PINSRB || Opc == X86ISD::PINSRW;
	  if (IsPInsr) {
	    // Simplify with all bits of each element demanded.
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    APInt AllBits = APInt::getAllOnesValue(VT.getScalarSizeInBits());
	    if (TLI.SimplifyDemandedBits(SDValue(N, 0), AllBits, DCI))
	      return SDValue(N, 0);
	  }

	  // Attempt to combine insertion patterns to a shuffle.
	  if (!VT.isSimple() || !DCI.isAfterLegalizeDAG())
	    return SDValue();

	  SDValue Op(N, 0);
	  if (SDValue Shuf = combineX86ShufflesRecursively(Op, DAG, Subtarget))
	    return Shuf;
	  return SDValue();
	}

	/// Match (AND (setcc ...) (setcc ...)) / (OR (setcc ...) (setcc ...)) where
	/// both setccs read the same scalar FP compare, and rewrite it as CMPEQSS and
	/// friends (AND form) or CMPNEQSS and friends (OR form).
	///
	/// Nothing is done if any user of N is something other than CopyToReg or a
	/// sign/zero/any extend.
	/// \returns the replacement value, or an empty SDValue.
	static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
	                                   TargetLowering::DAGCombinerInfo &DCI,
	                                   const X86Subtarget &Subtarget) {
	  unsigned opcode;

	  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
	  // we're requiring SSE2 for both.
	  if (!Subtarget.hasSSE2() || !isAndOrOfSetCCs(SDValue(N, 0U), opcode))
	    return SDValue();

	  SDValue SetCC0 = N->getOperand(0);
	  SDValue SetCC1 = N->getOperand(1);
	  SDValue Cmp = SetCC0.getOperand(1);
	  SDValue OtherCmp = SetCC1.getOperand(1);
	  SDLoc DL(N);

	  // The SETCCs should both refer to the same CMP.
	  if (Cmp.getOpcode() != X86ISD::FCMP || Cmp != OtherCmp)
	    return SDValue();

	  SDValue CmpLHS = Cmp->getOperand(0);
	  SDValue CmpRHS = Cmp->getOperand(1);
	  EVT CmpVT = CmpLHS.getValueType();
	  if (CmpVT != MVT::f32 && CmpVT != MVT::f64)
	    return SDValue();

	  // Give up as soon as a user is found that wants flags; only CopyToReg and
	  // the integer extends are accepted.
	  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
	       ++UI) {
	    switch (UI->getOpcode()) {
	    case ISD::CopyToReg:
	    case ISD::SIGN_EXTEND:
	    case ISD::ZERO_EXTEND:
	    case ISD::ANY_EXTEND:
	      continue;
	    default:
	      return SDValue();
	    }
	  }

	  X86::CondCode CC0 =
	      static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
	  X86::CondCode CC1 =
	      static_cast<X86::CondCode>(SetCC1.getConstantOperandVal(0));

	  // Canonicalize so that an E/NE condition, if any, sits in CC0.
	  if (CC1 == X86::COND_E || CC1 == X86::COND_NE)
	    std::swap(CC0, CC1);

	  const bool IsEqForm = CC0 == X86::COND_E && CC1 == X86::COND_NP;
	  const bool IsNeForm = CC0 == X86::COND_NE && CC1 == X86::COND_P;
	  if (!IsEqForm && !IsNeForm)
	    return SDValue();

	  // FIXME: need symbolic constants for these magic numbers.
	  // See X86ATTInstPrinter.cpp:printSSECC().
	  const unsigned x86cc = IsEqForm ? 0 : 4;

	  if (Subtarget.hasAVX512()) {
	    SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CmpLHS, CmpRHS,
	                                 DAG.getTargetConstant(x86cc, DL, MVT::i8));
	    // Need to fill with zeros to ensure the bitcast will produce zeroes
	    // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
	    SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
	                              DAG.getConstant(0, DL, MVT::v16i1), FSetCC,
	                              DAG.getIntPtrConstant(0, DL));
	    return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
	                              N->getSimpleValueType(0));
	  }

	  SDValue OnesOrZeroesF =
	      DAG.getNode(X86ISD::FSETCC, DL, CmpLHS.getValueType(), CmpLHS, CmpRHS,
	                  DAG.getTargetConstant(x86cc, DL, MVT::i8));

	  const bool Is64BitFP = CmpLHS.getValueType() == MVT::f64;
	  MVT IntVT = Is64BitFP ? MVT::i64 : MVT::i32;

	  if (Is64BitFP && !Subtarget.is64Bit()) {
	    // On a 32-bit target the 64-bit float cannot be bitcast to a 64-bit
	    // integer, since that's not a legal type. OnesOrZeroesF is all ones or
	    // all zeroes, so not all bits are needed: extract the lowest 32 bits and
	    // work with those going forward.
	    SDValue Vector64 =
	        DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, OnesOrZeroesF);
	    SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
	    OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
	                                DAG.getIntPtrConstant(0, DL));
	    IntVT = MVT::i32;
	  }

	  // Reduce the all-ones / all-zeroes value to a single bit in an i8.
	  SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
	  SDValue LowBit = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
	                               DAG.getConstant(1, DL, IntVT));
	  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, LowBit);
	}

	/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
	/// Either AND operand may be the inverted one, and a VBROADCAST of an
	/// inverted vector is accepted as well. Only 128/256/512-bit vector types
	/// are handled.
	static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
	  assert(N->getOpcode() == ISD::AND);

	  MVT VT = N->getSimpleValueType(0);
	  if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
	    return SDValue();

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // Returns the value whose inversion is V, or an empty SDValue.
	  auto StripNot = [&VT, &DAG](SDValue V) -> SDValue {
	    // Basic X = NOT(Y) detection.
	    if (SDValue Inner = IsNOT(V, DAG))
	      return Inner;
	    // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
	    if (V.getOpcode() != X86ISD::VBROADCAST)
	      return SDValue();
	    SDValue Src = V.getOperand(0);
	    EVT SrcVT = Src.getValueType();
	    if (!SrcVT.isVector())
	      return SDValue();
	    SDValue Inner = IsNOT(Src, DAG);
	    if (!Inner)
	      return SDValue();
	    return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
	                       DAG.getBitcast(SrcVT, Inner));
	  };

	  // Prefer the first operand as the inverted one, then try the second.
	  SDValue Inverted = StripNot(LHS);
	  SDValue Other = RHS;
	  if (!Inverted) {
	    Inverted = StripNot(RHS);
	    Other = LHS;
	  }
	  if (!Inverted)
	    return SDValue();

	  SDValue X = DAG.getBitcast(VT, Inverted);
	  SDValue Y = DAG.getBitcast(VT, Other);
	  return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
	}

	// Widen a tree of AND, OR and XOR nodes to \p VT so that the casts around
	// the logical operations disappear. For example
	//   or (and (truncate x, truncate y)),
	//      (xor (truncate z, build_vector (constants)))
	// becomes, for x, y and z of type \p VT,
	//   or (and x, y), (xor z, zext(build_vector (constants)))
	// Each operand must be a truncate from \p VT or something that can itself be
	// promoted recursively; the second operand may also be a vector of
	// constants. \p Depth bounds the recursion. Returns an empty SDValue when
	// the tree cannot be widened.
	static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
	                                     unsigned Depth) {
	  // Limit recursion to avoid excessive compile times.
	  if (Depth >= SelectionDAG::MaxRecursionDepth)
	    return SDValue();

	  const unsigned Opc = N->getOpcode();
	  const bool IsLogicOp =
	      Opc == ISD::XOR || Opc == ISD::AND || Opc == ISD::OR;
	  if (!IsLogicOp)
	    return SDValue();

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  SDLoc DL(N);

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isOperationLegalOrPromote(Opc, VT))
	    return SDValue();

	  // True for a truncate whose input already has type VT.
	  auto IsTruncFromVT = [&VT](SDValue V) {
	    return V.getOpcode() == ISD::TRUNCATE &&
	           V.getOperand(0).getValueType() == VT;
	  };

	  // The left side is either promotable itself or a truncate from VT.
	  if (SDValue WideLHS =
	          PromoteMaskArithmetic(LHS.getNode(), VT, DAG, Depth + 1))
	    LHS = WideLHS;
	  else if (IsTruncFromVT(LHS))
	    LHS = LHS.getOperand(0);
	  else
	    return SDValue();

	  // The right side may additionally be a constant vector, which is zero
	  // extended to VT.
	  if (SDValue WideRHS =
	          PromoteMaskArithmetic(RHS.getNode(), VT, DAG, Depth + 1))
	    RHS = WideRHS;
	  else if (IsTruncFromVT(RHS))
	    RHS = RHS.getOperand(0);
	  else if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
	    RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, RHS);
	  else
	    return SDValue();

	  return DAG.getNode(Opc, DL, VT, LHS, RHS);
	}

	// On AVX/AVX2 the type v8i1 is legalized to v8i16, an XMM-sized register,
	// while the compares and selects usually operate on YMM-sized registers.
	// Mixing the two produces poor code, so this helper rewrites some of the
	// transition sequences. With AVX-512 it is still useful for removing casts
	// around logical operations on vXi1 mask types.
	//
	// \p N must be an ANY_EXTEND, ZERO_EXTEND or SIGN_EXTEND with a vector
	// result. The narrow logic tree feeding it is rebuilt in the wide type and
	// the extension semantics are re-applied in-register.
	static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  assert(VT.isVector() && "Expected vector type");

	  unsigned ExtOpc = N->getOpcode();
	  assert((ExtOpc == ISD::ANY_EXTEND ||
	          ExtOpc == ISD::ZERO_EXTEND ||
	          ExtOpc == ISD::SIGN_EXTEND) && "Invalid Node");

	  SDValue Narrow = N->getOperand(0);
	  EVT NarrowVT = Narrow.getValueType();

	  // Build the logic tree directly in the wide type.
	  SDValue Wide = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
	  if (!Wide)
	    return SDValue();

	  SDLoc DL(N);
	  if (ExtOpc == ISD::ANY_EXTEND)
	    return Wide;
	  if (ExtOpc == ISD::ZERO_EXTEND)
	    return DAG.getZeroExtendInReg(Wide, DL, NarrowVT);
	  if (ExtOpc == ISD::SIGN_EXTEND)
	    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Wide,
	                       DAG.getValueType(NarrowVT));
	  llvm_unreachable("Unexpected opcode");
	}

	/// Map an integer logic opcode (ISD::AND / ISD::OR / ISD::XOR) to the
	/// corresponding X86 floating point logic opcode. Any other opcode is a
	/// programming error.
	static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
	  switch (Opcode) {
	  case ISD::AND:
	    return X86ISD::FAND;
	  case ISD::OR:
	    return X86ISD::FOR;
	  case ISD::XOR:
	    return X86ISD::FXOR;
	  default:
	    llvm_unreachable("Unexpected input node for FP logic conversion");
	  }
	}

	/// When both operands of an integer logic op are bitcasts from the same
	/// scalar floating point type, perform the logic op as a floating point logic
	/// node instead and bitcast the result back. This avoids unnecessary moves
	/// from SSE to integer registers.
	///
	/// Only f32 (with SSE1) and f64 (with SSE2) sources are accepted; otherwise an
	/// empty SDValue is returned.
	static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  if (LHS.getOpcode() != ISD::BITCAST || RHS.getOpcode() != ISD::BITCAST)
	    return SDValue();

	  SDValue LHSSrc = LHS.getOperand(0);
	  SDValue RHSSrc = RHS.getOperand(0);
	  EVT FPVT = LHSSrc.getValueType();

	  // Both bitcast sources must share one type ...
	  if (FPVT != RHSSrc.getValueType())
	    return SDValue();

	  // ... and that type must be a scalar fp type the subtarget supports.
	  bool IsLegalF32 = Subtarget.hasSSE1() && FPVT == MVT::f32;
	  bool IsLegalF64 = Subtarget.hasSSE2() && FPVT == MVT::f64;
	  if (!IsLegalF32 && !IsLegalF64)
	    return SDValue();

	  SDLoc DL(N);
	  unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
	  SDValue FPLogic = DAG.getNode(FPOpcode, DL, FPVT, LHSSrc, RHSSrc);
	  return DAG.getBitcast(N->getValueType(0), FPLogic);
	}

	// Fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y)) so that only one value
	// has to travel from XMM to GPR. BITOP is one of AND, OR or XOR.
	static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
	  unsigned Opc = N->getOpcode();
	  assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
	         "Unexpected bit opcode");

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // Each operand has to be a MOVMSK that nobody else uses.
	  auto IsSingleUseMOVMSK = [](SDValue V) {
	    return V.getOpcode() == X86ISD::MOVMSK && V.hasOneUse();
	  };
	  if (!IsSingleUseMOVMSK(LHS) || !IsSingleUseMOVMSK(RHS))
	    return SDValue();

	  SDValue LHSVec = LHS.getOperand(0);
	  SDValue RHSVec = RHS.getOperand(0);
	  EVT LHSVecVT = LHSVec.getValueType();
	  EVT RHSVecVT = RHSVec.getValueType();

	  // The two source vectors must agree in total size and in element size;
	  // it's OK for one to be fp and the other integer.
	  bool SameTotalBits = LHSVecVT.getSizeInBits() == RHSVecVT.getSizeInBits();
	  bool SameEltBits =
	      LHSVecVT.getScalarSizeInBits() == RHSVecVT.getScalarSizeInBits();
	  if (!SameTotalBits || !SameEltBits)
	    return SDValue();

	  SDLoc DL(N);
	  // The logic op is done in the type of the first vector, using the fp
	  // flavour of the opcode when that type is floating point.
	  unsigned VecOpc = Opc;
	  if (LHSVecVT.isFloatingPoint())
	    VecOpc = convertIntLogicToFPLogicOpcode(Opc);
	  SDValue RHSCast = DAG.getBitcast(LHSVecVT, RHSVec);
	  SDValue Logic = DAG.getNode(VecOpc, DL, LHSVecVT, LHSVec, RHSCast);
	  return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Logic);
	}

	/// When a value whose elements are all-zeros or all-ones is ANDed with a
	/// splat low-bits mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT),
	/// replace the 'and' with a logical shift right. This avoids loading the
	/// vector constant mask.
	static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  SDValue Src = peekThroughBitcasts(N->getOperand(0));
	  SDValue MaskOp = peekThroughBitcasts(N->getOperand(1));
	  EVT SrcVT = Src.getValueType();

	  // Both sides must have the same simple integer type once bitcasts are
	  // stripped.
	  if (SrcVT != MaskOp.getValueType())
	    return SDValue();
	  if (!SrcVT.isSimple() || !SrcVT.isInteger())
	    return SDValue();

	  // The second operand has to be a constant splat of a low-bits mask.
	  APInt SplatVal;
	  if (!ISD::isConstantSplatVector(MaskOp.getNode(), SplatVal))
	    return SDValue();
	  if (!SplatVal.isMask())
	    return SDValue();

	  // Don't prevent creation of ANDN.
	  if (isBitwiseNot(Src))
	    return SDValue();

	  if (!SupportedVectorShiftWithImm(SrcVT.getSimpleVT(), Subtarget, ISD::SRL))
	    return SDValue();

	  // Every bit of each element must be a copy of its sign bit.
	  unsigned EltBitWidth = SrcVT.getScalarSizeInBits();
	  if (DAG.ComputeNumSignBits(Src) != EltBitWidth)
	    return SDValue();

	  SDLoc DL(N);
	  // Shifting right by (width - mask bits) leaves exactly the mask bits set.
	  unsigned NumMaskBits = SplatVal.countTrailingOnes();
	  SDValue ShAmt =
	      DAG.getTargetConstant(EltBitWidth - NumMaskBits, DL, MVT::i8);
	  SDValue Shifted = DAG.getNode(X86ISD::VSRLI, DL, SrcVT, Src, ShAmt);
	  return DAG.getBitcast(N->getValueType(0), Shifted);
	}

	// Extract the index node from the lowered DAG of a GEP IR instruction with
	// one indexing dimension. The load must be unindexed and its address must be
	// (add (shl Index, ...), ...); the Index operand of the shift is returned.
	// Any other shape yields an empty SDValue.
	static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
	  if (Ld->isIndexed())
	    return SDValue();

	  SDValue Addr = Ld->getBasePtr();
	  if (Addr.getOpcode() == ISD::ADD) {
	    SDValue Scaled = Addr.getOperand(0);
	    if (Scaled.getOpcode() == ISD::SHL)
	      return Scaled.getOperand(0);
	  }
	  return SDValue();
	}

	/// Return true if the subtarget can use BZHI on a value of type \p VT:
	/// BMI2 must be available and \p VT must be a 32-bit scalar integer, or a
	/// 64-bit scalar integer on a 64-bit subtarget.
	static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
	  if (!Subtarget.hasBMI2() || !VT.isScalarInteger())
	    return false;

	  switch (VT.getSizeInBits()) {
	  case 32:
	    return true;
	  case 64:
	    return Subtarget.is64Bit();
	  default:
	    return false;
	  }
	}

	// This function recognizes cases where the X86 bzhi instruction can replace
	// an 'and-load' sequence.
	// In case of loading an integer value from an array of constants which is
	// defined as follows:
	//
	// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
	//
	// then applying a bitwise and on the result with another input.
	// It's equivalent to performing bzhi (zero high bits) on the input, with the
	// same index of the load.
	//
	// \p Node is the AND node. Both of its operands are tried as the table load;
	// an operand that does not match simply disqualifies itself and the other
	// operand is still examined. Returns the replacement node, or an empty
	// SDValue if neither operand matches.
	static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  MVT VT = Node->getSimpleValueType(0);
	  SDLoc dl(Node);

	  // Check if subtarget has BZHI instruction for the node's type
	  if (!hasBZHI(Subtarget, VT))
	    return SDValue();

	  // Try matching the pattern for both operands.
	  for (unsigned i = 0; i < 2; i++) {
	    SDValue N = Node->getOperand(i);
	    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());

	    // Continue with the other operand if this one is not a load instruction.
	    if (!Ld)
	      continue;

	    // The load must carry the IR value it was created from.
	    const Value *MemOp = Ld->getMemOperand()->getValue();
	    if (!MemOp)
	      continue;

	    // The address has to be a GEP into a constant global with a definitive
	    // initializer.
	    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp);
	    if (!GEP)
	      continue;
	    GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
	    if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
	      continue;

	    // The initializer must be an integer data array whose element width
	    // equals the width of VT and that has no more elements than an element
	    // has bits.
	    Constant *Init = GV->getInitializer();
	    Type *Ty = Init->getType();
	    if (!isa<ConstantDataArray>(Init) ||
	        !Ty->getArrayElementType()->isIntegerTy() ||
	        Ty->getArrayElementType()->getScalarSizeInBits() !=
	            VT.getSizeInBits() ||
	        Ty->getArrayNumElements() >
	            Ty->getArrayElementType()->getScalarSizeInBits())
	      continue;

	    // Check if the array's constant elements are suitable to our case:
	    // element j must be the mask with the low j bits set.
	    uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
	    bool ConstantsMatch = true;
	    for (uint64_t j = 0; j < ArrayElementCount; j++) {
	      // The elements of an integer ConstantDataArray are ConstantInts, so
	      // use cast<> (asserting) rather than dereferencing an unchecked
	      // dyn_cast<> result.
	      ConstantInt *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
	      if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
	        ConstantsMatch = false;
	        break;
	      }
	    }
	    if (!ConstantsMatch)
	      continue;

	    // Get the Node which indexes into the array.
	    SDValue Index = getIndexFromUnindexedLoad(Ld);
	    if (!Index)
	      continue;

	    // Do the transformation (For 32-bit type):
	    // -> (and (load arr[idx]), inp)
	    // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
	    // that will be replaced with one bzhi instruction.
	    SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
	    SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);

	    Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);

	    SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
	    Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);

	    SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
	    SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);

	    return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
	  }
	  return SDValue();
	}

	// Match (and (ctpop X), 1), the IR form of __builtin_parity, and expand it
	// into a chain of XORs followed by a setnp.
	//
	// Only i32 and i64 are handled, and only when CTPOP is not already legal for
	// the type. The i64 case is reduced to the i32 idiom, which is expected to be
	// combined again by this function if it needs expanding too.
	static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
	                             const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);

	  // We only support 64-bit and 32-bit. 64-bit requires special handling
	  // unless the 64-bit popcnt instruction is legal.
	  const bool Is64Bit = VT == MVT::i64;
	  if (!Is64Bit && VT != MVT::i32)
	    return SDValue();

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
	    return SDValue();

	  // Operand 0 must be a CTPOP with no other users, operand 1 must be 1.
	  SDValue PopCnt = N->getOperand(0);
	  if (PopCnt.getOpcode() != ISD::CTPOP || !PopCnt.hasOneUse())
	    return SDValue();
	  if (!isOneConstant(N->getOperand(1)))
	    return SDValue();

	  SDLoc DL(N);
	  SDValue X = PopCnt.getOperand(0);

	  // For 64-bit it is always best to xor the two 32-bit halves together, even
	  // if popcnt is available.
	  if (Is64Bit) {
	    SDValue ShAmt32 = DAG.getConstant(32, DL, MVT::i8);
	    SDValue HiHalf =
	        DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
	                    DAG.getNode(ISD::SRL, DL, VT, X, ShAmt32));
	    SDValue LoHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
	    SDValue Folded = DAG.getNode(ISD::XOR, DL, MVT::i32, LoHalf, HiHalf);
	    // Emit a 32-bit parity idiom. This will bring us back here if we need
	    // to expand it too.
	    SDValue Parity32 =
	        DAG.getNode(ISD::AND, DL, MVT::i32,
	                    DAG.getNode(ISD::CTPOP, DL, MVT::i32, Folded),
	                    DAG.getConstant(1, DL, MVT::i32));
	    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity32);
	  }
	  assert(VT == MVT::i32 && "Unexpected VT!");

	  // Fold the upper 16 bits into the lower 16 bits with a 32-bit xor.
	  SDValue ShAmt16 = DAG.getConstant(16, DL, MVT::i8);
	  SDValue UpperHalf = DAG.getNode(ISD::SRL, DL, VT, X, ShAmt16);
	  SDValue Folded16 = DAG.getNode(ISD::XOR, DL, VT, X, UpperHalf);

	  // Finally xor the low 2 bytes together using an 8-bit flag-setting xor.
	  // This should allow an h-reg to be used to save a shift.
	  // FIXME: We only get an h-reg in 32-bit mode.
	  SDValue ShAmt8 = DAG.getConstant(8, DL, MVT::i8);
	  SDValue HiByte =
	      DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
	                  DAG.getNode(ISD::SRL, DL, VT, Folded16, ShAmt8));
	  SDValue LoByte = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Folded16);
	  SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
	  SDValue ByteXor = DAG.getNode(X86ISD::XOR, DL, VTs, LoByte, HiByte);
	  SDValue Flags = ByteXor.getValue(1);

	  // Copy the inverse of the parity flag into a register with setcc.
	  SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
	  // Zero extend to the original type.
	  return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
	}


	// Match (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
	// where C is a mask with as many bits as the setcc has elements and where
	// the setcc will freely zero the upper bits of the k-register. The remaining
	// concat operands are then replaced with zeroes and the AND is dropped. This
	// mainly helps with v2i1/v4i1 setcc being cast to scalar.
	static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
	                                             const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");

	  EVT VT = N->getValueType(0);

	  // The AND must be with a constant; its value is checked further down.
	  if (!isa<ConstantSDNode>(N->getOperand(1)))
	    return SDValue();

	  // This is implied by the ConstantSDNode.
	  assert(!VT.isVector() && "Expected scalar VT!");

	  // Operand 0 must be a single-use bitcast of a single-use value.
	  SDValue Cast = N->getOperand(0);
	  if (Cast.getOpcode() != ISD::BITCAST || !Cast.hasOneUse())
	    return SDValue();
	  SDValue Src = Cast.getOperand(0);
	  if (!Src.hasOneUse())
	    return SDValue();

	  // The bitcast source must be a legal vXi1 vector built by concat_vectors.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT SrcVT = Src.getValueType();
	  if (!SrcVT.isVector())
	    return SDValue();
	  if (SrcVT.getVectorElementType() != MVT::i1 || !TLI.isTypeLegal(SrcVT))
	    return SDValue();
	  if (Src.getOpcode() != ISD::CONCAT_VECTORS)
	    return SDValue();

	  // Only the first subvector of the concat matters; the others are expected
	  // to be ignored because of the AND.
	  SDValue SubVec = Src.getOperand(0);
	  EVT SubVecVT = SubVec.getValueType();

	  // The first subvector should be a setcc with a legal result type ...
	  if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT))
	    return SDValue();
	  // ... and the AND constant should be a mask with that many bits.
	  if (!N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
	    return SDValue();

	  // Restrictions on the type being compared.
	  EVT SetccVT = SubVec.getOperand(0).getValueType();
	  if (!TLI.isTypeLegal(SetccVT))
	    return SDValue();
	  if (!Subtarget.hasVLX() && !SetccVT.is512BitVector())
	    return SDValue();
	  if (!Subtarget.hasBWI() && SetccVT.getScalarSizeInBits() < 32)
	    return SDValue();

	  // All checks passed. Rebuild the concat_vectors with zeroes in every
	  // position but the first and cast it back to VT.
	  SDLoc dl(N);
	  SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
	                              DAG.getConstant(0, dl, SubVecVT));
	  Ops[0] = SubVec;
	  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, Ops);
	  return DAG.getBitcast(VT, Concat);
	}

	/// Try a sequence of X86-specific folds on the AND node \p N and return the
	/// first replacement that succeeds, or an empty SDValue if none applies.
	///
	/// The folds are attempted in a fixed order. Those listed before the
	/// DCI.isBeforeLegalizeOps() check run in every combine stage; the remaining
	/// ones only run once that check is false.
	static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	EVT VT = N->getValueType(0);

	// If this is SSE1 only convert to FAND to avoid scalarization.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
	return DAG.getBitcast(
	MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
	DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
	DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
	}

	// Use a 32-bit and+zext if upper bits known zero.
	// Only done when operand 1 is not a constant; it is enough for the upper 32
	// bits of either operand to be known zero.
	if (VT == MVT::i64 && Subtarget.is64Bit() &&
	!isa<ConstantSDNode>(N->getOperand(1))) {
	APInt HiMask = APInt::getHighBitsSet(64, 32);
	if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) \|\|
	DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
	SDLoc dl(N);
	SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
	SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
	DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
	}
	}

	// This must be done before legalization has expanded the ctpop.
	if (SDValue V = combineParity(N, DAG, Subtarget))
	return V;

	// Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
	// TODO: Support multiple SrcOps.
	if (VT == MVT::i1) {
	SmallVector<SDValue, 2> SrcOps;
	SmallVector<APInt, 2> SrcPartials;
	if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
	SrcOps.size() == 1) {
	SDLoc dl(N);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
	EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
	SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
	// Fall back to a plain bitcast when the source vector type is legal.
	if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
	Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
	if (Mask) {
	assert(SrcPartials[0].getBitWidth() == NumElts &&
	"Unexpected partial reduction mask");
	// All-of: the bits selected by the partial mask must all be set.
	SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
	Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
	return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
	}
	}
	}

	if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
	return V;

	if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
	return R;

	// The remaining folds are skipped while DCI reports that we are still before
	// operation legalization.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
	return R;

	if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
	return FPLogic;

	if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
	return R;

	if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
	return ShiftRight;

	if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
	return R;

	// Attempt to recursively combine a bitmask AND with shuffles.
	if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
	SDValue Op(N, 0);
	if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
	return Res;
	}

	// Attempt to combine a scalar bitmask AND with an extracted shuffle.
	if ((VT.getScalarSizeInBits() % 8) == 0 &&
	N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
	SDValue BitMask = N->getOperand(1);
	SDValue SrcVec = N->getOperand(0).getOperand(0);
	EVT SrcVecVT = SrcVec.getValueType();

	// Check that the constant bitmask masks whole bytes.
	APInt UndefElts;
	SmallVector<APInt, 64> EltBits;
	if (VT == SrcVecVT.getScalarType() &&
	N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
	getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
	llvm::all_of(EltBits, [](APInt M) {
	return M.isNullValue() \|\| M.isAllOnesValue();
	})) {
	unsigned NumElts = SrcVecVT.getVectorNumElements();
	// Number of bytes per source vector element.
	unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
	unsigned Idx = N->getOperand(0).getConstantOperandVal(1);

	// Create a root shuffle mask from the byte mask and the extracted index.
	// Only the bytes of the extracted element are defined: a zero mask byte
	// becomes SM_SentinelZero, an all-ones byte keeps its own position.
	SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
	for (unsigned i = 0; i != Scale; ++i) {
	if (UndefElts[i])
	continue;
	int VecIdx = Scale * Idx + i;
	ShuffleMask[VecIdx] =
	EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
	}

	if (SDValue Shuffle = combineX86ShufflesRecursively(
	{SrcVec}, 0, SrcVec, ShuffleMask, {}, /Depth/ 1,
	/HasVarMask/ false, /AllowVarMask/ true, DAG, Subtarget))
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
	N->getOperand(0).getOperand(1));
	}
	}

	return SDValue();
	}

	// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y)).
	//
	// C and ~C must be constants whose bytes are exact complements of each
	// other, with no undef bytes. When VPTERNLOG is usable for the type, a
	// single VPTERNLOG node is emitted instead.
	static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");

	  MVT VT = N->getSimpleValueType(0);
	  if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
	    return SDValue();

	  SDValue And0 = peekThroughBitcasts(N->getOperand(0));
	  SDValue And1 = peekThroughBitcasts(N->getOperand(1));
	  if (And0.getOpcode() != ISD::AND || And1.getOpcode() != ISD::AND)
	    return SDValue();

	  // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
	  // VPTERNLOG. Otherwise only do this if either mask has multiple uses
	  // already.
	  bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
	                      Subtarget.hasVLX();
	  bool MaskIsShared =
	      !And0.getOperand(1).hasOneUse() || !And1.getOperand(1).hasOneUse();
	  if (!Subtarget.hasXOP() && !UseVPTERNLOG && !MaskIsShared)
	    return SDValue();

	  // Attempt to extract constant byte masks.
	  APInt UndefElts0, UndefElts1;
	  SmallVector<APInt, 32> EltBits0, EltBits1;
	  if (!getTargetConstantBitsFromNode(And0.getOperand(1), 8, UndefElts0,
	                                     EltBits0, false, false))
	    return SDValue();
	  if (!getTargetConstantBitsFromNode(And1.getOperand(1), 8, UndefElts1,
	                                     EltBits1, false, false))
	    return SDValue();

	  // Every byte of the first mask must be the complement of the matching
	  // byte of the second one.
	  for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
	    // TODO - add UNDEF elts support.
	    if (UndefElts0[i] || UndefElts1[i] || EltBits0[i] != ~EltBits1[i])
	      return SDValue();
	  }

	  SDLoc DL(N);
	  SDValue MaskC = DAG.getBitcast(VT, And0.getOperand(1));

	  if (UseVPTERNLOG) {
	    // Emit a VPTERNLOG node directly.
	    SDValue B = DAG.getBitcast(VT, And0.getOperand(0));
	    SDValue C = DAG.getBitcast(VT, And1.getOperand(0));
	    SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
	    return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, MaskC, B, C, Imm);
	  }

	  // Keep the original first operand and replace the second AND by ANDNP.
	  SDValue AndNot = DAG.getNode(X86ISD::ANDNP, DL, VT, MaskC,
	                               DAG.getBitcast(VT, And1.getOperand(0)));
	  return DAG.getNode(ISD::OR, DL, VT, N->getOperand(0), AndNot);
	}

	// Try to match the OR(AND(~MASK,X),AND(MASK,Y)) logic pattern, which at this
	// point has the form OR(AND(M,Y),ANDNP(M,X)) with the operands of the OR and
	// of the AND in either order. On success returns true with \p X, \p Y and
	// \p Mask set.
	static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
	  if (N->getOpcode() != ISD::OR)
	    return false;

	  SDValue AndSide = N->getOperand(0);
	  SDValue AndnSide = N->getOperand(1);

	  // Canonicalize AND to the first position.
	  if (AndnSide.getOpcode() == ISD::AND)
	    std::swap(AndSide, AndnSide);

	  // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
	  if (AndSide.getOpcode() != ISD::AND ||
	      AndnSide.getOpcode() != X86ISD::ANDNP)
	    return false;

	  Mask = AndnSide.getOperand(0);
	  X = AndnSide.getOperand(1);

	  // The same mask must appear as one of the AND operands; the other AND
	  // operand is Y.
	  // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
	  // ANDNP combine allows other combines to happen that prevent matching.
	  for (unsigned Idx = 0; Idx != 2; ++Idx) {
	    if (AndSide.getOperand(Idx) == Mask) {
	      Y = AndSide.getOperand(1 - Idx);
	      return true;
	    }
	  }
	  return false;
	}

	// Try to fold:
	//   (or (and (m, y), (pandn m, x)))
	// into:
	//   (vselect m, x, y)
	// As a special case, try to fold:
	//   (or (and (m, (sub 0, x)), (pandn m, x)))
	// into:
	//   (sub (xor X, M), M)
	static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
	                                            const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");

	  EVT VT = N->getValueType(0);
	  bool Is128WithSSE2 = VT.is128BitVector() && Subtarget.hasSSE2();
	  bool Is256WithInt256 = VT.is256BitVector() && Subtarget.hasInt256();
	  if (!Is128WithSSE2 && !Is256WithInt256)
	    return SDValue();

	  SDValue X, Y, Mask;
	  if (!matchLogicBlend(N, X, Y, Mask))
	    return SDValue();

	  // Look through any bitcasts on the three matched values.
	  Mask = peekThroughBitcasts(Mask);
	  X = peekThroughBitcasts(X);
	  Y = peekThroughBitcasts(Y);

	  // The mask must be an integer vector whose elements are entirely sign
	  // bits.
	  // TODO: Attempt to handle floating point cases as well?
	  EVT MaskVT = Mask.getValueType();
	  unsigned EltBits = MaskVT.getScalarSizeInBits();
	  if (!MaskVT.isInteger())
	    return SDValue();
	  if (DAG.ComputeNumSignBits(Mask) != EltBits)
	    return SDValue();

	  SDLoc DL(N);

	  // Attempt to combine to conditional negate: (sub (xor X, M), M)
	  if (SDValue Negate = combineLogicBlendIntoConditionalNegate(
	          VT, Mask, X, Y, DL, DAG, Subtarget))
	    return Negate;

	  // PBLENDVB is only available on SSE 4.1.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple
	  // uops.
	  if (Subtarget.hasVLX())
	    return SDValue();

	  // Perform the blend on bytes and cast the result back.
	  MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
	  SDValue BlendX = DAG.getBitcast(BlendVT, X);
	  SDValue BlendY = DAG.getBitcast(BlendVT, Y);
	  SDValue BlendMask = DAG.getBitcast(BlendVT, Mask);
	  SDValue Blend = DAG.getSelect(DL, BlendVT, BlendMask, BlendY, BlendX);
	  return DAG.getBitcast(VT, Blend);
	}

	// Helper function for combineOrCmpEqZeroToCtlzSrl.
	// Transforms:
	//   seteq(cmp x, 0)
	// into:
	//   srl(ctlz x), log2(bitsize(x))
	// The input pattern is checked by the caller. \p Op is the setcc node and
	// \p ExtTy the type the result is zero-extended or truncated to.
	static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
	                                          SelectionDAG &DAG) {
	  SDValue Cmp = Op.getOperand(1);
	  SDValue Src = Cmp.getOperand(0);
	  EVT SrcVT = Src.getValueType();
	  SDLoc dl(Op);

	  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, SrcVT, Src);
	  // The result of the shift is true or false, and on X86, the 32-bit
	  // encoding of shr and lzcnt is more desirable.
	  SDValue Clz32 = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
	  SDValue ShAmt =
	      DAG.getConstant(Log2_32(SrcVT.getSizeInBits()), dl, MVT::i8);
	  SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Clz32, ShAmt);
	  return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
	}

	// Try to transform:
	// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
	// into:
	// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
	// Will also attempt to match more generic cases, eg:
	// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
	// Only applies if the target supports the FastLZCNT feature.
	//
	// \p N is the outer node of the pattern (the zext in the forms above); its
	// operand 0 must be a single-use OR. Returns the replacement value, or an
	// empty SDValue when the pattern does not match or the combine is disabled.
	static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	// Do nothing before legalization or when the target lowering does not report
	// ctlz as fast.
	if (DCI.isBeforeLegalize() \|\| !Subtarget.getTargetLowering()->isCtlzFast())
	return SDValue();

	// An OR is a candidate only if it has a single use.
	auto isORCandidate = [](SDValue N) {
	return (N->getOpcode() == ISD::OR && N->hasOneUse());
	};

	// Check the zero extend is extending to 32-bit or more. The code generated by
	// srl(ctlz) for 16-bit or less variants of the pattern would require extra
	// instructions to clear the upper bits.
	if (!N->hasOneUse() \|\| !N->getSimpleValueType(0).bitsGE(MVT::i32) \|\|
	!isORCandidate(N->getOperand(0)))
	return SDValue();

	// Check the node matches: setcc(eq, cmp 0)
	// The setcc must have a single use and the CMP result type must be at least
	// as wide as i32.
	auto isSetCCCandidate = [](SDValue N) {
	return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
	X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
	N->getOperand(1).getOpcode() == X86ISD::CMP &&
	isNullConstant(N->getOperand(1).getOperand(1)) &&
	N->getOperand(1).getValueType().bitsGE(MVT::i32);
	};

	SDNode *OR = N->getOperand(0).getNode();
	SDValue LHS = OR->getOperand(0);
	SDValue RHS = OR->getOperand(1);

	// Save nodes matching or(or, setcc(eq, cmp 0)).
	// Walk down the chain of ORs: at each level one operand must be another
	// candidate OR and the other a candidate setcc.
	SmallVector<SDNode *, 2> ORNodes;
	while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) \|\|
	(isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
	ORNodes.push_back(OR);
	OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
	LHS = OR->getOperand(0);
	RHS = OR->getOperand(1);
	}

	// The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
	if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) \|\|
	!isORCandidate(SDValue(OR, 0)))
	return SDValue();

	// We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
	// to
	// or(srl(ctlz),srl(ctlz)).
	// The dag combiner can then fold it into:
	// srl(or(ctlz, ctlz)).
	EVT VT = OR->getValueType(0);
	SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
	SDValue Ret, NewRHS;
	if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
	Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

	if (!Ret)
	return SDValue();

	// Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
	// The saved ORs are processed innermost first, each one OR-ing its lowered
	// setcc operand onto the result built so far.
	while (ORNodes.size() > 0) {
	OR = ORNodes.pop_back_val();
	LHS = OR->getOperand(0);
	RHS = OR->getOperand(1);
	// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
	if (RHS->getOpcode() == ISD::OR)
	std::swap(LHS, RHS);
	NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
	if (!NewRHS)
	return SDValue();
	Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
	}

	// Re-apply the outer zero extension to the type of N.
	if (Ret)
	Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

	return Ret;
	}

	/// Try a sequence of X86-specific folds on the OR node \p N and return the
	/// first replacement that succeeds, or an empty SDValue if none applies.
	///
	/// The folds are attempted in a fixed order. Those listed before the
	/// DCI.isBeforeLegalizeOps() check run in every combine stage; the remaining
	/// ones only run once that check is false.
	static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N->getValueType(0);

	// If this is SSE1 only convert to FOR to avoid scalarization.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
	return DAG.getBitcast(MVT::v4i32,
	DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
	DAG.getBitcast(MVT::v4f32, N0),
	DAG.getBitcast(MVT::v4f32, N1)));
	}

	// Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
	// TODO: Support multiple SrcOps.
	if (VT == MVT::i1) {
	SmallVector<SDValue, 2> SrcOps;
	SmallVector<APInt, 2> SrcPartials;
	if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
	SrcOps.size() == 1) {
	SDLoc dl(N);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
	EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
	SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
	// Fall back to a plain bitcast when the source vector type is legal.
	if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
	Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
	if (Mask) {
	assert(SrcPartials[0].getBitWidth() == NumElts &&
	"Unexpected partial reduction mask");
	// Any-of: at least one of the bits selected by the partial mask is set.
	SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
	SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
	Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
	return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
	}
	}
	}

	if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
	return R;

	// The remaining folds are skipped while DCI reports that we are still before
	// operation legalization.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
	return R;

	if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
	return FPLogic;

	if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
	return R;

	if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
	return R;

	// Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
	// Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
	// iff the upper elements of the non-shifted arg are zero.
	// KUNPCK require 16+ bool vector elements.
	if (N0.getOpcode() == X86ISD::KSHIFTL \|\| N1.getOpcode() == X86ISD::KSHIFTL) {
	unsigned NumElts = VT.getVectorNumElements();
	unsigned HalfElts = NumElts / 2;
	// Demanded-elements mask selecting the upper half of the vector.
	APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
	if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
	N1.getConstantOperandAPInt(1) == HalfElts &&
	DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
	SDLoc dl(N);
	return DAG.getNode(
	ISD::CONCAT_VECTORS, dl, VT,
	extractSubVector(N0, 0, DAG, dl, HalfElts),
	extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
	}
	if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
	N0.getConstantOperandAPInt(1) == HalfElts &&
	DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
	SDLoc dl(N);
	return DAG.getNode(
	ISD::CONCAT_VECTORS, dl, VT,
	extractSubVector(N1, 0, DAG, dl, HalfElts),
	extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
	}
	}

	// Attempt to recursively combine an OR of shuffles.
	if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
	SDValue Op(N, 0);
	if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
	return Res;
	}

	return SDValue();
	}

	/// Turn a test against the sign bit of the form:
	///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
	/// into:
	///   SETGT(X, -1)
	///
	/// Only i8 and i1 results are handled, and X must be i16, i32 or i64. The
	/// truncate and the shift must each have a single use.
	static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
	  // This is only worth doing if the output type is i8 or i1.
	  EVT ResultType = N->getValueType(0);
	  if (ResultType != MVT::i8 && ResultType != MVT::i1)
	    return SDValue();

	  // We should be performing an xor against a truncated shift.
	  SDValue Trunc = N->getOperand(0);
	  if (Trunc.getOpcode() != ISD::TRUNCATE || !Trunc.hasOneUse())
	    return SDValue();

	  // Make sure we are performing an xor against one.
	  if (!isOneConstant(N->getOperand(1)))
	    return SDValue();

	  // SetCC on x86 zero extends so only act on this if it's a logical shift.
	  SDValue Shift = Trunc.getOperand(0);
	  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
	    return SDValue();

	  // Make sure we are truncating from one of i16, i32 or i64.
	  EVT ShiftTy = Shift.getValueType();
	  bool IsSupportedWidth =
	      ShiftTy == MVT::i16 || ShiftTy == MVT::i32 || ShiftTy == MVT::i64;
	  if (!IsSupportedWidth)
	    return SDValue();

	  // Make sure the shift amount is a constant that extracts the sign bit.
	  if (!isa<ConstantSDNode>(Shift.getOperand(1)))
	    return SDValue();
	  if (Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
	    return SDValue();

	  // Create a greater-than comparison against -1.
	  // N.B. Using SETGE against 0 works but we want a canonical looking
	  // comparison, using SETGT matches up with what TranslateX86CC.
	  SDLoc DL(N);
	  SDValue Src = Shift.getOperand(0);
	  EVT SrcTy = Src.getValueType();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT SetCCResultType = TLI.getSetCCResultType(
	      DAG.getDataLayout(), *DAG.getContext(), ResultType);
	  SDValue MinusOne = DAG.getConstant(-1, DL, SrcTy);
	  SDValue Cond = DAG.getSetCC(DL, SetCCResultType, Src, MinusOne, ISD::SETGT);
	  if (SetCCResultType == ResultType)
	    return Cond;
	  return DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
	}

	/// Turn vector tests of the signbit in the form of:
	///   xor (sra X, elt_size(X)-1), -1
	/// into:
	///   pcmpgt X, -1
	///
	/// This should be called before type legalization because the pattern may not
	/// persist after that.
	///
	/// \param N          The XOR node to inspect.
	/// \param DAG        DAG used to build the replacement compare.
	/// \param Subtarget  Queried for SSE2 (128-bit types) / AVX2 (256-bit types).
	/// \returns The SETGT node, or an empty SDValue if the pattern does not match
	///          or the vector type is not supported.
	static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  if (!VT.isSimple())
	    return SDValue();

	  // 128-bit integer vectors need SSE2, 256-bit integer vectors need AVX2;
	  // every other type is rejected.
	  switch (VT.getSimpleVT().SimpleTy) {
	  default:
	    return SDValue();
	  case MVT::v16i8:
	  case MVT::v8i16:
	  case MVT::v4i32:
	  case MVT::v2i64:
	    if (!Subtarget.hasSSE2())
	      return SDValue();
	    break;
	  case MVT::v32i8:
	  case MVT::v16i16:
	  case MVT::v8i32:
	  case MVT::v4i64:
	    if (!Subtarget.hasAVX2())
	      return SDValue();
	    break;
	  }

	  // There must be a shift right algebraic before the xor, and the xor must be
	  // a 'not' operation.
	  SDValue Shift = N->getOperand(0);
	  SDValue Ones = N->getOperand(1);
	  if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
	      !ISD::isBuildVectorAllOnes(Ones.getNode()))
	    return SDValue();

	  // The shift should be smearing the sign bit across each vector element.
	  auto *ShiftAmt =
	      isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
	  if (!ShiftAmt ||
	      ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
	    return SDValue();

	  // Create a greater-than comparison against -1. We don't use the more obvious
	  // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
	  return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
	}

	/// Recognize a truncation to \p VT that saturates as unsigned. \p In is the
	/// value feeding the truncate. Three shapes are accepted:
	///
	/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type)
	///    -> returns x.
	///
	/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type)
	///    where C1 >= 0 and C2 is the unsigned max of the destination type
	///    -> returns the existing smax(x, C1).
	///
	/// 3. (truncate (smax (smin (x, C2), C1)) to dest_type)
	///    where C1 >= 0, C2 is the unsigned max of the destination type and
	///    C1 <= C2
	///    -> returns a newly built smax(x, C1).
	///
	/// Shapes 2 and 3 are both equivalent to
	///   (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
	/// which is why smax(x, C1) is the value handed back for them.
	///
	/// Returns an empty SDValue when none of the shapes matches.
	static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
	                                 const SDLoc &DL) {
	  EVT InVT = In.getValueType();

	  // We are truncating from InVT down to VT, so InVT must be the wider one.
	  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
	         "Unexpected types for truncate operation");

	  const unsigned DstBits = VT.getScalarSizeInBits();

	  // If V is "Opcode(src, splat-constant)", store the constant in Limit and
	  // return src; otherwise return an empty SDValue.
	  auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
	    if (V.getOpcode() != Opcode)
	      return SDValue();
	    if (!ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
	      return SDValue();
	    return V.getOperand(0);
	  };

	  APInt C1, C2;

	  // Shape 1: C2 has to be UINT32_MAX / UINT16_MAX / UINT8_MAX, depending on
	  // the element size of the destination type.
	  if (SDValue UMinSrc = MatchMinMax(In, ISD::UMIN, C2)) {
	    if (C2.isMask(DstBits))
	      return UMinSrc;
	  }

	  // Shape 2: smin(smax(x, C1), C2).
	  if (SDValue SMinSrc = MatchMinMax(In, ISD::SMIN, C2)) {
	    if (MatchMinMax(SMinSrc, ISD::SMAX, C1)) {
	      if (C1.isNonNegative() && C2.isMask(DstBits))
	        return SMinSrc;
	    }
	  }

	  // Shape 3: smax(smin(x, C2), C1). Rebuild smax directly on x, reusing the
	  // outer node's constant operand.
	  if (SDValue SMaxSrc = MatchMinMax(In, ISD::SMAX, C1)) {
	    if (SDValue X = MatchMinMax(SMaxSrc, ISD::SMIN, C2)) {
	      if (C1.isNonNegative() && C2.isMask(DstBits) && C2.uge(C1))
	        return DAG.getNode(ISD::SMAX, DL, InVT, X, In.getOperand(1));
	    }
	  }

	  return SDValue();
	}

	/// Recognize a truncation to \p VT that saturates as signed:
	///   (truncate (smin (smax (x, signed_min_of_dest_type)),
	///                   signed_max_of_dest_type) to dest_type)
	/// or, with the clamps in the other order:
	///   (truncate (smax (smin (x, signed_max_of_dest_type)),
	///                   signed_min_of_dest_type) to dest_type).
	/// When \p MatchPackUS is set, the clamp range looked for is
	/// [0, unsigned_max_of_dest_type] instead.
	/// Returns the source value x to be truncated, or an empty SDValue if the
	/// pattern was not matched.
	static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
	  unsigned NumDstBits = VT.getScalarSizeInBits();
	  unsigned NumSrcBits = In.getScalarValueSizeInBits();
	  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");

	  // If V is "Opcode(src, splat(Limit))" return src, else an empty SDValue.
	  auto MatchMinMax = [](SDValue V, unsigned Opcode,
	                        const APInt &Limit) -> SDValue {
	    if (V.getOpcode() != Opcode)
	      return SDValue();
	    APInt Splat;
	    if (!ISD::isConstantSplatVector(V.getOperand(1).getNode(), Splat))
	      return SDValue();
	    if (!(Splat == Limit))
	      return SDValue();
	    return V.getOperand(0);
	  };

	  // Clamp bounds, expressed at the source element width.
	  const APInt UpperBound =
	      MatchPackUS ? APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits)
	                  : APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
	  const APInt LowerBound =
	      MatchPackUS ? APInt(NumSrcBits, 0)
	                  : APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);

	  // smin(smax(x, lower), upper)
	  if (SDValue Inner = MatchMinMax(In, ISD::SMIN, UpperBound)) {
	    if (SDValue X = MatchMinMax(Inner, ISD::SMAX, LowerBound))
	      return X;
	  }

	  // smax(smin(x, upper), lower)
	  if (SDValue Inner = MatchMinMax(In, ISD::SMAX, LowerBound)) {
	    if (SDValue X = MatchMinMax(Inner, ISD::SMIN, UpperBound))
	      return X;
	  }

	  return SDValue();
	}

	/// Try to lower a saturating vector truncate of \p In to \p VT using either
	/// PACKSS/PACKUS sequences or the AVX512 VTRUNCS/VTRUNCUS nodes. Returns an
	/// empty SDValue if no saturation pattern is found or no suitable lowering is
	/// available.
	static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
	                                      SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  if (!VT.isVector() || !Subtarget.hasSSE2())
	    return SDValue();

	  EVT SVT = VT.getVectorElementType();
	  EVT InVT = In.getValueType();
	  EVT InSVT = InVT.getVectorElementType();

	  // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
	  // split across two registers, we can use a packusdw+perm to clamp to 0-65535
	  // and concatenate at the same time. Then we can use a final vpmovuswb to
	  // clip to 0-255.
	  if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
	      InVT == MVT::v16i32 && VT == MVT::v16i8) {
	    if (SDValue USatVal = detectSSatPattern(In, VT, /*MatchPackUS=*/true)) {
	      // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
	      SDValue Packed = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16,
	                                              USatVal, DL, DAG, Subtarget);
	      assert(Packed && "Failed to pack!");
	      return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Packed);
	    }
	  }

	  // vXi32 truncate instructions are available with AVX512F.
	  // vXi16 truncate instructions are only available with AVX512BW.
	  // For 256-bit or smaller vectors, we require VLX.
	  // FIXME: We could widen truncates to 512 to remove the VLX restriction.
	  // If the result type is 256 bits or larger and we have disabled 512-bit
	  // registers, we should go ahead and use the pack instructions if possible.
	  const bool HasTruncInsn = (Subtarget.hasAVX512() && InSVT == MVT::i32) ||
	                            (Subtarget.hasBWI() && InSVT == MVT::i16);
	  const bool InputWideEnough =
	      (InVT.getSizeInBits() > 128) &&
	      (Subtarget.hasVLX() || InVT.getSizeInBits() > 256);
	  const bool ResultFitsRegs =
	      Subtarget.useAVX512Regs() || !(VT.getSizeInBits() >= 256);
	  const bool PreferAVX512 = HasTruncInsn && InputWideEnough && ResultFitsRegs;

	  const bool PackableResult = SVT == MVT::i8 || SVT == MVT::i16;
	  const bool PackableSource = InSVT == MVT::i16 || InSVT == MVT::i32;
	  if (!PreferAVX512 && PackableResult && PackableSource &&
	      VT.getSizeInBits() >= 64 && isPowerOf2_32(VT.getVectorNumElements())) {
	    if (SDValue USatVal = detectSSatPattern(In, VT, /*MatchPackUS=*/true)) {
	      // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
	      // Only do this when the result is at least 64 bits or we'd be leaving
	      // dangling PACKSSDW nodes.
	      if (SVT == MVT::i8 && InSVT == MVT::i32) {
	        EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
	                                     VT.getVectorNumElements());
	        SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal,
	                                             DL, DAG, Subtarget);
	        assert(Mid && "Failed to pack!");
	        SDValue Packed = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL,
	                                                DAG, Subtarget);
	        assert(Packed && "Failed to pack!");
	        return Packed;
	      }
	      if (SVT == MVT::i8 || Subtarget.hasSSE41())
	        return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
	                                      Subtarget);
	    }
	    if (SDValue SSatVal = detectSSatPattern(In, VT))
	      return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
	                                    Subtarget);
	  }

	  // Everything below uses the AVX512 saturating truncate nodes.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const bool CanUseVTrunc =
	      TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
	      Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI());
	  if (!CanUseVTrunc)
	    return SDValue();

	  // Signed saturation is tried first, then unsigned.
	  unsigned TruncOpc = 0;
	  SDValue SatVal;
	  if (SDValue SSatVal = detectSSatPattern(In, VT)) {
	    SatVal = SSatVal;
	    TruncOpc = X86ISD::VTRUNCS;
	  } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
	    SatVal = USatVal;
	    TruncOpc = X86ISD::VTRUNCUS;
	  }
	  if (!SatVal)
	    return SDValue();

	  unsigned ResElts = VT.getVectorNumElements();
	  // If the input type is less than 512 bits and we don't have VLX, we need
	  // to widen to 512 bits by concatenating with undef.
	  if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
	    unsigned NumConcats = 512 / InVT.getSizeInBits();
	    ResElts *= NumConcats;
	    SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
	    ConcatOps[0] = SatVal;
	    EVT WideInVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
	                                    NumConcats * InVT.getVectorNumElements());
	    SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideInVT, ConcatOps);
	  }
	  // Widen the result if it's narrower than 128 bits.
	  if (ResElts * SVT.getSizeInBits() < 128)
	    ResElts = 128 / SVT.getSizeInBits();
	  EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
	  SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
	  // Take the low part, which holds the originally requested result.
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
	                     DAG.getIntPtrConstant(0, DL));
	}

	/// Recognize the rounding-average idiom on vectors of unsigned i8/i16,
	///   c = (a + b + 1) / 2
	/// evaluated in a wider element type and then truncated to \p VT, and replace
	/// it with the X86ISD::AVG node. \p In is the wide value being truncated.
	/// Returns an empty SDValue when the idiom is not found.
	static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
	                                const X86Subtarget &Subtarget,
	                                const SDLoc &DL) {
	  if (!VT.isVector())
	    return SDValue();
	  EVT InVT = In.getValueType();
	  unsigned NumElems = VT.getVectorNumElements();

	  // The result must have a power-of-two number (at least 2) of i8/i16 lanes.
	  EVT ScalarVT = VT.getVectorElementType();
	  if (ScalarVT != MVT::i8 && ScalarVT != MVT::i16)
	    return SDValue();
	  if (NumElems < 2 || !isPowerOf2_32(NumElems))
	    return SDValue();

	  // InScalarVT is the intermediate type in the AVG pattern and it has to be
	  // strictly wider than the result element type (i8/i16).
	  EVT InScalarVT = InVT.getVectorElementType();
	  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
	    return SDValue();

	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  // Detect the following pattern:
	  //
	  //   %1 = zext <N x i8> %a to <N x i32>
	  //   %2 = zext <N x i8> %b to <N x i32>
	  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
	  //   %4 = add nuw nsw <N x i32> %3, %2
	  //   %5 = lshr <N x i32> %4, <i32 1 x N>
	  //   %6 = trunc <N x i32> %5 to <N x i8>
	  //
	  // In AVX512, the last instruction can also be a trunc store.
	  if (In.getOpcode() != ISD::SRL)
	    return SDValue();

	  // True if V is a constant vector whose elements all lie in [Min, Max].
	  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
	    return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
	      const APInt &Val = C->getAPIntValue();
	      return Val.uge(Min) && Val.ule(Max);
	    });
	  };

	  // Every lane must be shifted right by exactly one, and the shifted value
	  // must be an add.
	  SDValue Sum = In.getOperand(0);
	  SDValue ShiftAmt = In.getOperand(1);
	  if (!IsConstVectorInRange(ShiftAmt, 1, 1) || Sum.getOpcode() != ISD::ADD)
	    return SDValue();

	  // Collect the operands of "a + b + 1"; their order doesn't matter.
	  SDValue Operands[3];
	  Operands[0] = Sum.getOperand(0);
	  Operands[1] = Sum.getOperand(1);

	  auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	                       ArrayRef<SDValue> Ops) {
	    return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
	  };

	  // Handle "zext(a) + C" where C is a constant vector whose elements are in
	  // [1, 256] for i8 results or [1, 65536] for i16 results: C already contains
	  // the "+ 1".
	  const unsigned MaxBiasedConst = ScalarVT == MVT::i8 ? 256 : 65536;
	  if (IsConstVectorInRange(Operands[1], 1, MaxBiasedConst) &&
	      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
	      Operands[0].getOperand(0).getValueType() == VT) {
	    // The pattern is detected. Subtract one from the constant vector, then
	    // demote it and emit the X86ISD::AVG instruction.
	    SDValue VecOnes = DAG.getConstant(1, DL, InVT);
	    SDValue Unbiased = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
	    SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, DL, VT, Unbiased);
	    return SplitOpsAndApply(DAG, Subtarget, DL, VT,
	                            {Operands[0].getOperand(0), Narrowed},
	                            AVGBuilder);
	  }

	  // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
	  // The 'or' form is accepted only when it is add-like, i.e. its operands
	  // share no set bits so it can be replaced by an add.
	  auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
	    if (V.getOpcode() == ISD::ADD) {
	      Op0 = V.getOperand(0);
	      Op1 = V.getOperand(1);
	      return true;
	    }
	    if (V.getOpcode() != ISD::ZERO_EXTEND)
	      return false;
	    V = V.getOperand(0);
	    if (V.getValueType() != VT || V.getOpcode() != ISD::OR ||
	        !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
	      return false;
	    Op0 = V.getOperand(0);
	    Op1 = V.getOperand(1);
	    return true;
	  };

	  // One of the two outer operands must itself be add-like; move the other
	  // one into slot 0 and spread the inner operands over slots 1 and 2.
	  SDValue Op0, Op1;
	  if (FindAddLike(Operands[0], Op0, Op1))
	    std::swap(Operands[0], Operands[1]);
	  else if (!FindAddLike(Operands[1], Op0, Op1))
	    return SDValue();
	  Operands[2] = Op0;
	  Operands[1] = Op1;

	  // Now we have three operands of two additions. One of them has to be a
	  // constant vector of ones; locate the first such operand.
	  int OnesIdx = -1;
	  for (int i = 0; i != 3; ++i) {
	    if (IsConstVectorInRange(Operands[i], 1, 1)) {
	      OnesIdx = i;
	      break;
	    }
	  }
	  if (OnesIdx < 0)
	    return SDValue();
	  std::swap(Operands[OnesIdx], Operands[2]);

	  // The remaining two operands must already have type VT or be zero
	  // extensions from VT (i.e. results of type promotion).
	  for (int j = 0; j != 2; ++j) {
	    if (Operands[j].getValueType() == VT)
	      continue;
	    if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
	        Operands[j].getOperand(0).getValueType() != VT)
	      return SDValue();
	    Operands[j] = Operands[j].getOperand(0);
	  }

	  // The pattern is detected, emit X86ISD::AVG instruction(s).
	  return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]},
	                          AVGBuilder);
	}

	/// Target-specific DAG combines for ISD::LOAD nodes.
	///
	/// Three independent rewrites are attempted, in order:
	///  1. Split a non-extending 256-bit vector load into two 128-bit loads when
	///     the 32-byte access is reported as slow, or when it is a sufficiently
	///     aligned non-temporal load on a target without Int256.
	///  2. Before legalization and without AVX512, load a vXi1 vector as an
	///     integer of the same bit count and bitcast it back.
	///  3. Cast ptr32/ptr64 base pointers to the default address space.
	///
	/// \returns the replacement value, or an empty SDValue if nothing applied.
	static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  LoadSDNode *Ld = cast<LoadSDNode>(N);
	  EVT RegVT = Ld->getValueType(0);
	  EVT MemVT = Ld->getMemoryVT();
	  SDLoc dl(Ld);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
	  // into two 16-byte operations. Also split non-temporal aligned loads on
	  // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
	  ISD::LoadExtType Ext = Ld->getExtensionType();
	  bool Fast;
	  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
	      Ext == ISD::NON_EXTLOAD &&
	      ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
	        Ld->getAlignment() >= 16) ||
	       (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
	                               *Ld->getMemOperand(), &Fast) &&
	        !Fast))) {
	    unsigned NumElems = RegVT.getVectorNumElements();
	    if (NumElems < 2)
	      return SDValue();

	    // The upper half lives 16 bytes past the base pointer.
	    unsigned HalfOffset = 16;
	    SDValue Ptr1 = Ld->getBasePtr();
	    SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfOffset, dl);
	    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
	                                  NumElems / 2);
	    SDValue Load1 =
	        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
	                    Ld->getOriginalAlign(),
	                    Ld->getMemOperand()->getFlags());
	    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
	                                Ld->getPointerInfo().getWithOffset(HalfOffset),
	                                Ld->getOriginalAlign(),
	                                Ld->getMemOperand()->getFlags());
	    // Merge the chains of both halves so users depend on both loads.
	    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	                             Load1.getValue(1), Load2.getValue(1));

	    SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
	    return DCI.CombineTo(N, NewVec, TF, true);
	  }

	  // Bool vector load - attempt to cast to an integer, as we have good
	  // (vXiY *ext(vXi1 bitcast(iX))) handling.
	  if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
	      RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
	    unsigned NumElts = RegVT.getVectorNumElements();
	    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
	    if (TLI.isTypeLegal(IntVT)) {
	      SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
	                                    Ld->getPointerInfo(),
	                                    Ld->getOriginalAlign(),
	                                    Ld->getMemOperand()->getFlags());
	      SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
	      return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
	    }
	  }

	  // Cast ptr32 and ptr64 pointers to the default address space before a load.
	  unsigned AddrSpace = Ld->getAddressSpace();
	  if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
	      AddrSpace == X86AS::PTR32_UPTR) {
	    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	    if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
	      SDValue Cast =
	          DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
	      // Rebuild the load with its original extension kind and memory type.
	      // A plain getLoad(RegVT, ...) would turn an extending load into a
	      // full-width non-extending one and read the wrong number of bytes.
	      return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
	                            Ld->getPointerInfo(), MemVT,
	                            Ld->getOriginalAlign(),
	                            Ld->getMemOperand()->getFlags());
	    }
	  }

	  return SDValue();
	}

	/// If \p V is a build vector of boolean (i1) constants in which exactly one
	/// constant is true, return the operand index of that element. Undef
	/// operands are ignored. In every other case return -1.
	static int getOneTrueElt(SDValue V) {
	  // This needs to be a build vector of booleans.
	  // TODO: Checking for the i1 type matches the IR definition for the mask,
	  // but the mask check could be loosened to i8 or other types. That might
	  // also require checking more than 'allOnesValue'; eg, the x86 HW
	  // instructions only require that the MSB is set for each mask element.
	  // The ISD::MSTORE comments/definition do not specify how the mask operand
	  // is formatted.
	  auto *BV = dyn_cast<BuildVectorSDNode>(V);
	  if (!BV)
	    return -1;
	  EVT MaskVT = BV->getValueType(0);
	  if (MaskVT.getVectorElementType() != MVT::i1)
	    return -1;

	  int FoundAt = -1;
	  const unsigned NumElts = MaskVT.getVectorNumElements();
	  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
	    const SDValue &Elt = BV->getOperand(Idx);
	    if (Elt.isUndef())
	      continue;

	    // Any non-constant, non-undef element disqualifies the mask.
	    auto *EltC = dyn_cast<ConstantSDNode>(Elt);
	    if (!EltC)
	      return -1;
	    if (!EltC->getAPIntValue().isAllOnesValue())
	      continue;

	    // A second true element means there is not exactly one.
	    if (FoundAt >= 0)
	      return -1;
	    FoundAt = Idx;
	  }
	  return FoundAt;
	}

	/// Given a masked memory load/store operation, return true if its mask has
	/// exactly one element set. In that case also produce, through the output
	/// parameters:
	///  - \p Addr:      the memory address of the scalar element to load/store,
	///  - \p Index:     the vector index at which to insert/extract that element,
	///  - \p Alignment: the alignment to use for the scalar memory access.
	/// The output parameters are left untouched when false is returned.
	static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
	                                         SelectionDAG &DAG, SDValue &Addr,
	                                         SDValue &Index, unsigned &Alignment) {
	  const int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
	  if (TrueMaskElt < 0)
	    return false;

	  // Start from the base pointer and step forward to the selected element;
	  // element 0 already sits at the base address.
	  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
	  Addr = MaskedOp->getBasePtr();
	  if (TrueMaskElt != 0) {
	    unsigned ByteOffset = TrueMaskElt * EltVT.getStoreSize();
	    Addr = DAG.getMemBasePlusOffset(Addr, ByteOffset, SDLoc(MaskedOp));
	  }

	  Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
	  // Combine the vector access alignment with the element's store size.
	  Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
	  return true;
	}

	/// If exactly one element of the mask is set for a non-extending masked load,
	/// it is a scalar load and vector insert.
	/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
	/// mask have already been optimized in IR, so we don't bother with those here.
	///
	/// \param ML   The (unindexed) masked load to simplify.
	/// \param DAG  DAG used to create the scalar load and the insert.
	/// \param DCI  Combiner state; the masked load's value and chain results are
	///             replaced through it.
	/// \returns the CombineTo result, or an empty SDValue if the mask does not
	///          have exactly one true element.
	static SDValue
	reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI) {
	  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
	  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
	  // However, some target hooks may need to be added to know when the transform
	  // is profitable. Endianness would also have to be considered.

	  SDValue Addr, VecIndex;
	  unsigned Alignment;
	  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
	    return SDValue();

	  // Addr points at the selected element, not at the start of the vector, so
	  // the pointer info attached to the scalar load has to be advanced by the
	  // same number of bytes. Reusing the vector's pointer info unchanged would
	  // describe the wrong memory location to alias analysis.
	  const int TrueMaskElt =
	      static_cast<int>(cast<ConstantSDNode>(VecIndex)->getZExtValue());
	  EVT MemEltVT = ML->getMemoryVT().getVectorElementType();
	  unsigned Offset = TrueMaskElt * MemEltVT.getStoreSize();

	  // Load the one scalar element that is specified by the mask using the
	  // appropriate offset from the base pointer.
	  SDLoc DL(ML);
	  EVT VT = ML->getValueType(0);
	  EVT EltVT = VT.getVectorElementType();
	  SDValue Load =
	      DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
	                  ML->getPointerInfo().getWithOffset(Offset), Alignment,
	                  ML->getMemOperand()->getFlags());

	  // Insert the loaded element into the appropriate place in the vector.
	  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
	                               ML->getPassThru(), Load, VecIndex);
	  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
	}

	/// Simplify an unindexed masked load whose mask is a build vector of
	/// constants. Either replace it with a full vector load plus select (when
	/// both the first and the last element are loaded), or split it into a
	/// masked load with an undef pass-through plus a select on the original
	/// pass-through. Returns an empty SDValue if neither rewrite applies.
	static SDValue
	combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
	                              TargetLowering::DAGCombinerInfo &DCI) {
	  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
	  SDValue Mask = ML->getMask();
	  if (!ISD::isBuildVectorOfConstantSDNodes(Mask.getNode()))
	    return SDValue();

	  SDLoc DL(ML);
	  EVT VT = ML->getValueType(0);
	  SDValue PassThru = ML->getPassThru();

	  // If we are loading the first and last elements of a vector, it is safe and
	  // always faster to load the whole vector. Replace the masked load with a
	  // vector load and select.
	  const unsigned LastIdx = VT.getVectorNumElements() - 1;
	  auto *MaskBV = cast<BuildVectorSDNode>(Mask);
	  if (!isNullConstant(MaskBV->getOperand(0)) &&
	      !isNullConstant(MaskBV->getOperand(LastIdx))) {
	    SDValue FullLoad = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
	                                   ML->getMemOperand());
	    SDValue Select = DAG.getSelect(DL, VT, Mask, FullLoad, PassThru);
	    return DCI.CombineTo(ML, Select, FullLoad.getValue(1), true);
	  }

	  // Convert a masked load with a constant mask into a masked load and a select.
	  // This allows the select operation to use a faster kind of select instruction
	  // (for example, vblendvps -> vblendps).

	  // Don't try this if the pass-through operand is already undefined. That would
	  // cause an infinite loop because that's what we're about to create. An
	  // all-zeros pass-through is left alone as well.
	  if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
	    return SDValue();

	  // The new masked load has an undef pass-through operand. The select uses the
	  // original pass-through operand.
	  SDValue UndefPassThruLoad = DAG.getMaskedLoad(
	      VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), Mask,
	      DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
	      ML->getAddressingMode(), ML->getExtensionType());
	  SDValue Select = DAG.getSelect(DL, VT, Mask, UndefPassThruLoad, PassThru);

	  return DCI.CombineTo(ML, Select, UndefPassThruLoad.getValue(1), true);
	}

	/// Target-specific DAG combines for masked loads: reduce single-element
	/// masks to a scalar load, simplify constant masks (without AVX512), and
	/// simplify the computation of a legalized, non-i1 mask of which only the
	/// per-lane sign bit is demanded.
	static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const X86Subtarget &Subtarget) {
	  auto *Mld = cast<MaskedLoadSDNode>(N);

	  // TODO: Expanding load with constant mask may be optimized as well.
	  if (Mld->isExpandingLoad())
	    return SDValue();

	  const bool IsNonExtending = Mld->getExtensionType() == ISD::NON_EXTLOAD;
	  if (IsNonExtending) {
	    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
	      return ScalarLoad;

	    // TODO: Do some AVX512 subsets benefit from this transform?
	    if (!Subtarget.hasAVX512()) {
	      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
	        return Blend;
	    }
	  }

	  // If the mask value has been legalized to a non-boolean vector, try to
	  // simplify ops leading up to it. We only demand the MSB of each lane.
	  SDValue Mask = Mld->getMask();
	  if (Mask.getScalarValueSizeInBits() == 1)
	    return SDValue();

	  EVT VT = Mld->getValueType(0);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
	  if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
	    // Revisit this node unless it was deleted by the simplification.
	    if (N->getOpcode() != ISD::DELETED_NODE)
	      DCI.AddToWorklist(N);
	    return SDValue(N, 0);
	  }

	  SDValue NewMask =
	      TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG);
	  if (!NewMask)
	    return SDValue();

	  // Same masked load, driven by the simpler mask.
	  return DAG.getMaskedLoad(
	      VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
	      NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
	      Mld->getAddressingMode(), Mld->getExtensionType());
	}

	/// If exactly one element of the mask is set for a non-truncating masked store,
	/// it is a vector extract and scalar store.
	/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
	/// mask have already been optimized in IR, so we don't bother with those here.
	///
	/// \param MS   The masked store to simplify.
	/// \param DAG  DAG used to create the extract and the scalar store.
	/// \returns the scalar store node, or an empty SDValue if the mask does not
	///          have exactly one true element.
	static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
	                                              SelectionDAG &DAG) {
	  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
	  // However, some target hooks may need to be added to know when the transform
	  // is profitable. Endianness would also have to be considered.

	  SDValue Addr, VecIndex;
	  unsigned Alignment;
	  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
	    return SDValue();

	  // Addr points at the selected element, not at the start of the vector, so
	  // the pointer info attached to the scalar store has to be advanced by the
	  // same number of bytes. Reusing the vector's pointer info unchanged would
	  // describe the wrong memory location to alias analysis.
	  const int TrueMaskElt =
	      static_cast<int>(cast<ConstantSDNode>(VecIndex)->getZExtValue());
	  EVT MemEltVT = MS->getMemoryVT().getVectorElementType();
	  unsigned Offset = TrueMaskElt * MemEltVT.getStoreSize();

	  // Extract the one scalar element that is actually being stored.
	  SDLoc DL(MS);
	  EVT VT = MS->getValue().getValueType();
	  EVT EltVT = VT.getVectorElementType();
	  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
	                                MS->getValue(), VecIndex);

	  // Store that element at the appropriate offset from the base pointer.
	  return DAG.getStore(MS->getChain(), DL, Extract, Addr,
	                      MS->getPointerInfo().getWithOffset(Offset), Alignment,
	                      MS->getMemOperand()->getFlags());
	}

	/// Target-specific DAG combines for masked stores: reduce single-element
	/// masks to a scalar store, simplify a legalized non-i1 mask of which only
	/// the per-lane sign bit is demanded, and fold a single-use truncate of the
	/// stored value into a truncating masked store when that is legal.
	static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
	                                  TargetLowering::DAGCombinerInfo &DCI,
	                                  const X86Subtarget &Subtarget) {
	  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);

	  // Compressing and already-truncating stores are not handled here.
	  if (Mst->isCompressingStore() || Mst->isTruncatingStore())
	    return SDValue();

	  EVT VT = Mst->getValue().getValueType();
	  SDLoc dl(Mst);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
	    return ScalarStore;

	  // If the mask value has been legalized to a non-boolean vector, try to
	  // simplify ops leading up to it. We only demand the MSB of each lane.
	  SDValue Mask = Mst->getMask();
	  if (Mask.getScalarValueSizeInBits() != 1) {
	    APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
	    if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
	      // Revisit this node unless it was deleted by the simplification.
	      if (N->getOpcode() != ISD::DELETED_NODE)
	        DCI.AddToWorklist(N);
	      return SDValue(N, 0);
	    }
	    SDValue NewMask =
	        TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG);
	    if (NewMask)
	      return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
	                                Mst->getBasePtr(), Mst->getOffset(), NewMask,
	                                Mst->getMemoryVT(), Mst->getMemOperand(),
	                                Mst->getAddressingMode());
	  }

	  // Fold "masked store of (truncate X)" into a truncating masked store of X
	  // when the truncate has no other users and the target supports it.
	  SDValue Value = Mst->getValue();
	  const bool IsSingleUseTrunc =
	      Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse();
	  if (IsSingleUseTrunc &&
	      TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
	                            Mst->getMemoryVT())) {
	    return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
	                              Mst->getBasePtr(), Mst->getOffset(), Mask,
	                              Mst->getMemoryVT(), Mst->getMemOperand(),
	                              Mst->getAddressingMode(), /*IsTruncating=*/true);
	  }

	  return SDValue();
	}

	static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	StoreSDNode *St = cast<StoreSDNode>(N);
	EVT StVT = St->getMemoryVT();
	SDLoc dl(St);
	SDValue StoredVal = St->getValue();
	EVT VT = StoredVal.getValueType();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	// Convert a store of vXi1 into a store of iX and a bitcast.
	if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
	VT.getVectorElementType() == MVT::i1) {

	EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
	StoredVal = DAG.getBitcast(NewVT, StoredVal);

	return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
	St->getPointerInfo(), St->getOriginalAlign(),
	St->getMemOperand()->getFlags());
	}

	// If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
	// This will avoid a copy to k-register.
	if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
	StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	StoredVal.getOperand(0).getValueType() == MVT::i8) {
	return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
	St->getBasePtr(), St->getPointerInfo(),
	St->getOriginalAlign(),
	St->getMemOperand()->getFlags());
	}

	// Widen v2i1/v4i1 stores to v8i1.
	if ((VT == MVT::v2i1 \|\| VT == MVT::v4i1) && VT == StVT &&
	Subtarget.hasAVX512()) {
	unsigned NumConcats = 8 / VT.getVectorNumElements();
	SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
	Ops[0] = StoredVal;
	StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
	return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
	St->getPointerInfo(), St->getOriginalAlign(),
	St->getMemOperand()->getFlags());
	}

	// Turn vXi1 stores of constants into a scalar store.
	if ((VT == MVT::v8i1 \|\| VT == MVT::v16i1 \|\| VT == MVT::v32i1 \|\|
	VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
	ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
	// If its a v64i1 store without 64-bit support, we need two stores.
	if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
	SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
	StoredVal->ops().slice(0, 32));
	Lo = combinevXi1ConstantToInteger(Lo, DAG);
	SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
	StoredVal->ops().slice(32, 32));
	Hi = combinevXi1ConstantToInteger(Hi, DAG);

	SDValue Ptr0 = St->getBasePtr();
	SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);

	SDValue Ch0 =
	DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
	St->getOriginalAlign(),
	St->getMemOperand()->getFlags());
	SDValue Ch1 =
	DAG.getStore(St->getChain(), dl, Hi, Ptr1,
	St->getPointerInfo().getWithOffset(4),
	St->getOriginalAlign(),
	St->getMemOperand()->getFlags());
	return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
	}

	StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
	return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
	St->getPointerInfo(), St->getOriginalAlign(),
	St->getMemOperand()->getFlags());
	}

	// If we are saving a 32-byte vector and 32-byte stores are slow, such as on
	// Sandy Bridge, perform two 16-byte stores.
	bool Fast;
	if (VT.is256BitVector() && StVT == VT &&
	TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
	*St->getMemOperand(), &Fast) &&
	!Fast) {
	unsigned NumElems = VT.getVectorNumElements();
	if (NumElems < 2)
	return SDValue();

	return splitVectorStore(St, DAG);
	}

	// Split under-aligned vector non-temporal stores.
	if (St->isNonTemporal() && StVT == VT &&
	St->getAlignment() < VT.getStoreSize()) {
	// ZMM/YMM nt-stores - either it can be stored as a series of shorter
	// vectors or the legalizer can scalarize it to use MOVNTI.
	if (VT.is256BitVector() \|\| VT.is512BitVector()) {
	unsigned NumElems = VT.getVectorNumElements();
	if (NumElems < 2)
	return SDValue();
	return splitVectorStore(St, DAG);
	}

	// XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
	// to use MOVNTI.
	if (VT.is128BitVector() && Subtarget.hasSSE2()) {
	MVT NTVT = Subtarget.hasSSE4A()
	? MVT::v2f64
	: (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
	return scalarizeVectorStore(St, NTVT, DAG);
	}
	}

	// Try to optimize v16i16->v16i8 truncating stores when BWI is not
	// supported, but avx512f is by extending to v16i32 and truncating.
	if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
	St->getValue().getOpcode() == ISD::TRUNCATE &&
	St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
	TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
	St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
	SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
	return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
	MVT::v16i8, St->getMemOperand());
	}

	// Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
	if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
	(StoredVal.getOpcode() == X86ISD::VTRUNCUS \|\|
	StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
	TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
	bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
	return EmitTruncSStore(IsSigned, St->getChain(),
	dl, StoredVal.getOperand(0), St->getBasePtr(),
	VT, St->getMemOperand(), DAG);
	}

	// Optimize trunc store (of multiple scalars) to shuffle and store.
	// First, pack all of the elements in one place. Next, store to memory
	// in fewer chunks.
	if (St->isTruncatingStore() && VT.isVector()) {
	// Check if we can detect an AVG pattern from the truncation. If yes,
	// replace the trunc store by a normal store with the result of X86ISD::AVG
	// instruction.
	if (DCI.isBeforeLegalize() \|\| TLI.isTypeLegal(St->getMemoryVT()))
	if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
	Subtarget, dl))
	return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
	St->getPointerInfo(), St->getOriginalAlign(),
	St->getMemOperand()->getFlags());

	if (TLI.isTruncStoreLegal(VT, StVT)) {
	if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
	return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
	dl, Val, St->getBasePtr(),
	St->getMemoryVT(), St->getMemOperand(), DAG);
	if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
	DAG, dl))
	return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
	dl, Val, St->getBasePtr(),
	St->getMemoryVT(), St->getMemOperand(), DAG);
	}

	return SDValue();
	}

	// Cast ptr32 and ptr64 pointers to the default address space before a store.
	unsigned AddrSpace = St->getAddressSpace();
	if (AddrSpace == X86AS::PTR64 \|\| AddrSpace == X86AS::PTR32_SPTR \|\|
	AddrSpace == X86AS::PTR32_UPTR) {
	MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	if (PtrVT != St->getBasePtr().getSimpleValueType()) {
	SDValue Cast =
	DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
	return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
	St->getPointerInfo(), St->getOriginalAlign(),
	St->getMemOperand()->getFlags(), St->getAAInfo());
	}
	}

	// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
	// the FP state in cases where an emms may be missing.
	// A preferable solution to the general problem is to figure out the right
	// places to insert EMMS. This qualifies as a quick hack.

	// Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
	if (VT.getSizeInBits() != 64)
	return SDValue();

	const Function &F = DAG.getMachineFunction().getFunction();
	bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
	bool F64IsLegal =
	!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
	if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
	isa<LoadSDNode>(St->getValue()) &&
	cast<LoadSDNode>(St->getValue())->isSimple() &&
	St->getChain().hasOneUse() && St->isSimple()) {
	LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());

	if (!ISD::isNormalLoad(Ld))
	return SDValue();

	// Avoid the transformation if there are multiple uses of the loaded value.
	if (!Ld->hasNUsesOfValue(1, 0))
	return SDValue();

	SDLoc LdDL(Ld);
	SDLoc StDL(N);
	// Lower to a single movq load/store pair.
	SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
	Ld->getBasePtr(), Ld->getMemOperand());

	// Make sure new load is placed in same chain order.
	DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
	return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
	St->getMemOperand());
	}

	// This is similar to the above case, but here we handle a scalar 64-bit
	// integer store that is extracted from a vector on a 32-bit target.
	// If we have SSE2, then we can treat it like a floating-point double
	// to get past legalization. The execution dependencies fixup pass will
	// choose the optimal machine instruction for the store if this really is
	// an integer or v2f32 rather than an f64.
	if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
	St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
	SDValue OldExtract = St->getOperand(1);
	SDValue ExtOp0 = OldExtract.getOperand(0);
	unsigned VecSize = ExtOp0.getValueSizeInBits();
	EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
	SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
	SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
	BitCast, OldExtract.getOperand(1));
	return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
	St->getPointerInfo(), St->getOriginalAlign(),
	St->getMemOperand()->getFlags());
	}

	return SDValue();
	}

	/// DAG combine for a memory-intrinsic store node whose operand 1 is the vector
	/// being stored. Only the low elements that fit in the node's memory type are
	/// demanded from that vector, so try to simplify it on that basis.
	/// Returns SDValue(N, 0) if the stored value was simplified, otherwise an
	/// empty SDValue.
	static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     const X86Subtarget &Subtarget) {
	  auto *MemNode = cast<MemIntrinsicSDNode>(N);
	  SDValue Val = N->getOperand(1);
	  MVT ValVT = Val.getSimpleValueType();
	  EVT MemVT = MemNode->getMemoryVT();

	  // The number of stored elements is the memory size divided by the element
	  // size; those are the lowest elements of the vector.
	  unsigned NumStoredElts =
	      MemVT.getSizeInBits() / ValVT.getScalarSizeInBits();
	  APInt Demanded =
	      APInt::getLowBitsSet(ValVT.getVectorNumElements(), NumStoredElts);

	  APInt UndefElts, ZeroElts;
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.SimplifyDemandedVectorElts(Val, Demanded, UndefElts, ZeroElts,
	                                      DCI))
	    return SDValue();

	  // Queue N for another visit unless it has been deleted in the meantime.
	  if (N->getOpcode() != ISD::DELETED_NODE)
	    DCI.AddToWorklist(N);
	  return SDValue(N, 0);
	}

	/// Return 'true' if this vector operation is "horizontal"
	/// and return the operands for the horizontal operation in LHS and RHS. A
	/// horizontal operation performs the binary operation on successive elements
	/// of its first operand, then on successive elements of its second operand,
	/// returning the resulting values in a vector. For example, if
	/// A = < float a0, float a1, float a2, float a3 >
	/// and
	/// B = < float b0, float b1, float b2, float b3 >
	/// then the result of doing a horizontal operation on A and B is
	/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
	/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
	/// A horizontal-op B, for some already available A and B, and if so then LHS is
	/// set to A, RHS to B, and the routine returns 'true'.
	///
	/// \param LHS,RHS In/out. On entry, the two operands of the binop being
	/// examined. On success they are overwritten with the horizontal-op sources,
	/// bitcast to the operand type. They may already have been overwritten when
	/// the final profitability check makes the routine return 'false'.
	/// \param IsCommutative If true, the odd/even pair feeding each element may
	/// also appear with the LHS and RHS roles swapped.
	/// \param PostShuffleMask Out. Left empty when the horizontal-op result needs
	/// no reordering; otherwise a shuffle mask to apply to that result. Only
	/// meaningful when the routine returns 'true'.
	static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
	const X86Subtarget &Subtarget, bool IsCommutative,
	SmallVectorImpl<int> &PostShuffleMask) {
	// If either operand is undef, bail out. The binop should be simplified.
	if (LHS.isUndef() \|\| RHS.isUndef())
	return false;

	// Look for the following pattern:
	// A = < float a0, float a1, float a2, float a3 >
	// B = < float b0, float b1, float b2, float b3 >
	// and
	// LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
	// RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
	// then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
	// which is A horizontal-op B.

	MVT VT = LHS.getSimpleValueType();
	assert((VT.is128BitVector() \|\| VT.is256BitVector()) &&
	"Unsupported vector type for horizontal add/sub");
	unsigned NumElts = VT.getVectorNumElements();

	// Decompose Op into shuffle inputs N0/N1 and a mask appended to ShuffleMask.
	// ShuffleMask is left untouched (empty for the callers below) when Op is not
	// a recognised shuffle. Undef VECTOR_SHUFFLE operands leave N0/N1 unset.
	// TODO - can we make a general helper method that does all of this for us?
	auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
	SmallVectorImpl<int> &ShuffleMask) {
	if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
	if (!Op.getOperand(0).isUndef())
	N0 = Op.getOperand(0);
	if (!Op.getOperand(1).isUndef())
	N1 = Op.getOperand(1);
	ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
	ShuffleMask.append(Mask.begin(), Mask.end());
	return;
	}
	// Look through an extraction at index 0 from a 256-bit vector so that a
	// target shuffle on the wider vector can still be matched below.
	bool UseSubVector = false;
	if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	Op.getOperand(0).getValueType().is256BitVector() &&
	llvm::isNullConstant(Op.getOperand(1))) {
	Op = Op.getOperand(0);
	UseSubVector = true;
	}
	bool IsUnary;
	SmallVector<SDValue, 2> SrcOps;
	SmallVector<int, 16> SrcShuffleMask;
	SDValue BC = peekThroughBitcasts(Op);
	if (isTargetShuffle(BC.getOpcode()) &&
	getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
	SrcOps, SrcShuffleMask, IsUnary)) {
	// Target shuffle with a mask of NumElts entries and at most two sources:
	// use its sources and mask directly.
	if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
	SrcOps.size() <= 2) {
	N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
	N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
	ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
	}
	// Single-source target shuffle on the wider vector (mask has 2 * NumElts
	// entries): split the source at elements 0 and NumElts to form the two
	// inputs and keep only the first NumElts mask entries.
	if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
	SrcOps.size() == 1) {
	N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
	N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
	ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
	ShuffleMask.append(Mask.begin(), Mask.end());
	}
	}
	};

	// View LHS in the form
	// LHS = VECTOR_SHUFFLE A, B, LMask
	// If LHS is not a shuffle, then pretend it is the identity shuffle:
	// LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
	// NOTE: A default initialized SDValue represents an UNDEF of type VT.
	SDValue A, B;
	SmallVector<int, 16> LMask;
	GetShuffle(LHS, A, B, LMask);

	// Likewise, view RHS in the form
	// RHS = VECTOR_SHUFFLE C, D, RMask
	SDValue C, D;
	SmallVector<int, 16> RMask;
	GetShuffle(RHS, C, D, RMask);

	// At least one of the operands should be a vector shuffle.
	unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
	if (NumShuffles == 0)
	return false;

	// A non-shuffle operand is modelled as an identity shuffle of itself.
	if (LMask.empty()) {
	A = LHS;
	for (unsigned i = 0; i != NumElts; ++i)
	LMask.push_back(i);
	}

	if (RMask.empty()) {
	C = RHS;
	for (unsigned i = 0; i != NumElts; ++i)
	RMask.push_back(i);
	}

	// Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split).
	if (!Subtarget.hasAVX2() && VT.isFloatingPoint() &&
	(isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) \|\|
	isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)))
	return false;

	// If A and B occur in reverse order in RHS, then canonicalize by commuting
	// RHS operands and shuffle mask.
	if (A != C) {
	std::swap(C, D);
	ShuffleVectorSDNode::commuteMask(RMask);
	}
	// Check that the shuffles are both shuffling the same vectors.
	if (!(A == C && B == D))
	return false;

	// Start from an all-undef post-shuffle mask; entries are filled in below for
	// every element whose sources are defined.
	PostShuffleMask.clear();
	PostShuffleMask.append(NumElts, SM_SentinelUndef);

	// LHS and RHS are now:
	// LHS = shuffle A, B, LMask
	// RHS = shuffle A, B, RMask
	// Check that the masks correspond to performing a horizontal operation.
	// AVX defines horizontal add/sub to operate independently on 128-bit lanes,
	// so we just repeat the inner loop if this is a 256-bit op.
	unsigned Num128BitChunks = VT.getSizeInBits() / 128;
	unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
	unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
	assert((NumEltsPer128BitChunk % 2 == 0) &&
	"Vector type should have an even number of elements in each lane");
	for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
	for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
	// Ignore undefined components. This also skips elements whose mask index
	// refers to A or B when that vector is missing (undef).
	int LIdx = LMask[i + j], RIdx = RMask[i + j];
	if (LIdx < 0 \|\| RIdx < 0 \|\|
	(!A.getNode() && (LIdx < (int)NumElts \|\| RIdx < (int)NumElts)) \|\|
	(!B.getNode() && (LIdx >= (int)NumElts \|\| RIdx >= (int)NumElts)))
	continue;

	// Check that successive odd/even elements are being operated on. If not,
	// this is not a horizontal operation. The swapped arrangement (odd index
	// on the LHS) is only accepted for commutative operations.
	if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
	!((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
	return false;

	// Compute the post-shuffle mask index based on where the element
	// is stored in the HOP result, and where it needs to be moved to.
	int Base = LIdx & ~1u;
	int Index = ((Base % NumEltsPer128BitChunk) / 2) +
	((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));

	// The low half of the 128-bit result must choose from A.
	// The high half of the 128-bit result must choose from B,
	// unless B is undef. In that case, we are always choosing from A.
	if ((B && Base >= (int)NumElts) \|\| (!B && i >= NumEltsPer64BitChunk))
	Index += NumEltsPer64BitChunk;
	PostShuffleMask[i + j] = Index;
	}
	}

	// NOTE: the caller's LHS/RHS are overwritten here, before the profitability
	// check below, so they can be modified even when 'false' is returned.
	LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
	RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.

	// An identity (or undef) post-shuffle is reported to the caller as an empty
	// mask.
	bool IsIdentityPostShuffle =
	isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
	if (IsIdentityPostShuffle)
	PostShuffleMask.clear();

	// Assume a SingleSource HOP if we only shuffle one input and don't need to
	// shuffle the result.
	if (!shouldUseHorizontalOp(LHS == RHS &&
	(NumShuffles < 2 \|\| !IsIdentityPostShuffle),
	DAG, Subtarget))
	return false;

	LHS = DAG.getBitcast(VT, LHS);
	RHS = DAG.getBitcast(VT, RHS);
	return true;
	}

	/// Target-specific DAG combine for ISD::FADD and ISD::FSUB nodes.
	/// When the subtarget and vector type allow it, an add/sub of two shuffles is
	/// turned into X86ISD::FHADD / X86ISD::FHSUB, followed by a shuffle of the
	/// result if isHorizontalBinOp asks for one. Returns an empty SDValue when
	/// nothing was changed.
	static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  const bool IsFadd = N->getOpcode() == ISD::FADD;
	  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

	  EVT VT = N->getValueType(0);

	  // Horizontal FP ops exist for 128-bit types with SSE3 and for 256-bit types
	  // with AVX.
	  const bool Has128BitHOp =
	      Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64);
	  const bool Has256BitHOp =
	      Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64);
	  if (!Has128BitHOp && !Has256BitHOp)
	    return SDValue();

	  // isHorizontalBinOp rewrites these two local copies in place with the
	  // sources of the horizontal operation.
	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);
	  SmallVector<int, 8> PostShuffleMask;
	  if (!isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsFadd, PostShuffleMask))
	    return SDValue();

	  auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
	  SDValue HOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, Op0, Op1);

	  // An empty mask means the result is already in the right element order.
	  if (PostShuffleMask.empty())
	    return HOp;
	  return DAG.getVectorShuffle(VT, SDLoc(HOp), HOp, DAG.getUNDEF(VT),
	                              PostShuffleMask);
	}

	/// Try to move a vector truncation in front of its single-use arithmetic
	/// source when that is expected to simplify codegen:
	///   TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
	/// Returns the new binop, or an empty SDValue if nothing was done.
	/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
	/// anything that is guaranteed to be transformed by DAGCombiner.
	static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
	                                          const X86Subtarget &Subtarget,
	                                          const SDLoc &DL) {
	  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);

	  // The source must not have other users, and only vector truncation is
	  // supported for now.
	  // TODO: i64 scalar math would benefit as well.
	  if (!Src.hasOneUse() || !VT.isVector())
	    return SDValue();

	  // Only these binary operations are candidates.
	  const unsigned BinOpc = Src.getOpcode();
	  switch (BinOpc) {
	  case ISD::MUL:
	  case ISD::AND:
	  case ISD::XOR:
	  case ISD::OR:
	  case ISD::ADD:
	  case ISD::SUB:
	    break;
	  default:
	    return SDValue();
	  }

	  // Every transform below needs the operation to be legal on the narrow type.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isOperationLegal(BinOpc, VT))
	    return SDValue();

	  // Builds BINOP( TRUNC( A ), TRUNC( B ) ) on the narrow type.
	  auto PreTruncate = [&](SDValue A, SDValue B) {
	    SDValue TruncA = DAG.getNode(ISD::TRUNCATE, DL, VT, A);
	    SDValue TruncB = DAG.getNode(ISD::TRUNCATE, DL, VT, B);
	    return DAG.getNode(BinOpc, DL, VT, TruncA, TruncB);
	  };

	  SDValue Op0 = Src.getOperand(0);
	  SDValue Op1 = Src.getOperand(1);

	  // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ), so
	  // always truncate an i64 multiply that is not legal on the wide type.
	  EVT SrcVT = Src.getValueType();
	  if (BinOpc == ISD::MUL && SrcVT.getScalarType() == MVT::i64 &&
	      !TLI.isOperationLegal(BinOpc, SrcVT))
	    return PreTruncate(Op0, Op1);

	  // An operand truncates for free when it is an extension from a type no wider
	  // than the truncated element type, or a build vector of constants.
	  // NOTE: We don't peek through bitcasts here because there is currently no
	  // support for constant folding truncate+bitcast+vector_of_constants. We
	  // would end up with a truncate on both operands which would get turned back
	  // into (truncate (binop)), causing an infinite loop.
	  auto TruncatesForFree = [VT](SDValue Op) {
	    const unsigned Opc = Op.getOpcode();
	    const bool IsExtend = Opc == ISD::ANY_EXTEND ||
	                          Opc == ISD::SIGN_EXTEND || Opc == ISD::ZERO_EXTEND;
	    if (IsExtend && Op.getOperand(0).getScalarValueSizeInBits() <=
	                        VT.getScalarSizeInBits())
	      return true;
	    return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
	  };

	  // Otherwise it is only worth pre-truncating if we pay for at most one
	  // truncation, i.e. the input is repeated or one input truncates for free.
	  if (Op0 == Op1 || TruncatesForFree(Op0) || TruncatesForFree(Op1))
	    return PreTruncate(Op0, Op1);

	  return SDValue();
	}

	/// Truncate the operand of N by masking with ISD::AND and packing with
	/// X86ISD::PACKUS.
	/// e.g. trunc <8 x i32> X to <8 x i16> -->
	/// MaskX = X & 0xffff (clear high bits to prevent saturation)
	/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
	static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
	                                                 const X86Subtarget &Subtarget,
	                                                 SelectionDAG &DAG) {
	  SDValue Src = N->getOperand(0);
	  EVT SrcVT = Src.getValueType();
	  EVT DstVT = N->getValueType(0);

	  // Keep only the low bits that survive the truncation.
	  APInt LowBits = APInt::getLowBitsSet(SrcVT.getScalarSizeInBits(),
	                                       DstVT.getScalarSizeInBits());
	  SDValue LowBitsVec = DAG.getConstant(LowBits, DL, SrcVT);
	  SDValue Masked = DAG.getNode(ISD::AND, DL, SrcVT, Src, LowBitsVec);
	  return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, Masked, DL, DAG,
	                                Subtarget);
	}

	/// Truncate the operand of N using X86ISD::PACKSS (e.g. a group of v4i32 into
	/// v8i16). The input is first sign-extended in-register from the result type.
	static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
	                                                 const X86Subtarget &Subtarget,
	                                                 SelectionDAG &DAG) {
	  SDValue Src = N->getOperand(0);
	  EVT SrcVT = Src.getValueType();
	  EVT DstVT = N->getValueType(0);

	  SDValue DstTypeOp = DAG.getValueType(DstVT);
	  SDValue SExtInReg =
	      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, Src, DstTypeOp);
	  return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, SExtInReg, DL, DAG,
	                                Subtarget);
	}

	/// Turn a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
	/// X86ISD::PACKUS / X86ISD::PACKSS operations. This is done here because after
	/// type legalization the truncation is turned into a BUILD_VECTOR of elements
	/// that are individually extracted and truncated, which is hard to optimize.
	/// Returns an empty SDValue when the truncation is not handled.
	static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget) {
	  EVT DstVT = N->getValueType(0);
	  if (!DstVT.isVector())
	    return SDValue();

	  EVT SrcVT = N->getOperand(0).getValueType();
	  if (!SrcVT.isSimple())
	    return SDValue();

	  const unsigned NumDstElts = DstVT.getVectorNumElements();

	  // Requires SSE2, and is not done on AVX2 or later.
	  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
	  // SSE2, and we need to take care of it specially.
	  // AVX512 provides vpmovdb.
	  if (!(Subtarget.hasSSE2() && !Subtarget.hasAVX2()))
	    return SDValue();

	  EVT DstEltVT = DstVT.getVectorElementType();
	  EVT SrcEltVT = SrcVT.getVectorElementType();
	  const bool SrcIsI32 = SrcEltVT == MVT::i32;
	  const bool SrcIsI64 = SrcEltVT == MVT::i64;
	  const bool DstIsI8 = DstEltVT == MVT::i8;
	  const bool DstIsI16 = DstEltVT == MVT::i16;

	  // Only i32/i64 -> i8/i16 with a power-of-2 element count of at least 8.
	  if (!SrcIsI32 && !SrcIsI64)
	    return SDValue();
	  if (!DstIsI8 && !DstIsI16)
	    return SDValue();
	  if (!isPowerOf2_32(NumDstElts) || NumDstElts < 8)
	    return SDValue();

	  // SSSE3's pshufb results in fewer instructions in the cases below.
	  if (Subtarget.hasSSSE3() && NumDstElts == 8) {
	    if ((DstIsI8 && !SrcIsI64) || (SrcIsI32 && DstIsI16))
	      return SDValue();
	  }

	  SDLoc DL(N);
	  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
	  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
	  // truncate 2 x v4i32 to v8i16.
	  if (Subtarget.hasSSE41() || DstIsI8)
	    return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);

	  return SrcIsI32 ? combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG)
	                  : SDValue();
	}

	/// Handle vector truncation of values whose upper bits are known to be
	/// 'extended sign-bits' or 'extended zero-bits': a truncation from
	/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 is turned into X86ISD::PACKSS or
	/// X86ISD::PACKUS operations. Returns an empty SDValue if not applicable.
	static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
	                                               SelectionDAG &DAG,
	                                               const X86Subtarget &Subtarget) {
	  // Requires SSE2.
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  EVT ResVT = N->getValueType(0);
	  if (!ResVT.isVector() || !ResVT.isSimple())
	    return SDValue();

	  SDValue Src = N->getOperand(0);
	  EVT SrcEVT = Src.getValueType();
	  if (!SrcEVT.isSimple())
	    return SDValue();

	  MVT VT = ResVT.getSimpleVT();
	  MVT EltVT = VT.getScalarType();
	  MVT InVT = SrcEVT.getSimpleVT();
	  MVT InEltVT = InVT.getScalarType();

	  // Check we have a truncation suited for PACKSS/PACKUS.
	  if (!isPowerOf2_32(VT.getVectorNumElements()))
	    return SDValue();
	  const bool DstEltOk =
	      EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32;
	  if (!DstEltOk)
	    return SDValue();
	  const bool SrcEltOk =
	      InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64;
	  if (!SrcEltOk)
	    return SDValue();

	  // Truncation to sub-128bit vXi32 can be better handled with shuffles.
	  if (EltVT == MVT::i32 && VT.getSizeInBits() < 128)
	    return SDValue();

	  // AVX512 has fast truncate, but if the input is already going to be split,
	  // there's no harm in trying pack.
	  if (Subtarget.hasAVX512()) {
	    const bool InputWillBeSplit = !Subtarget.useAVX512Regs() &&
	                                  VT.is256BitVector() &&
	                                  InVT.is512BitVector();
	    if (!InputWillBeSplit)
	      return SDValue();
	  }

	  unsigned NumPackedSignBits = std::min<unsigned>(EltVT.getSizeInBits(), 16);
	  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;

	  // Use PACKUS if the input has zero-bits that extend all the way to the
	  // packed/truncated value. e.g. masks, zext_in_reg, etc.
	  KnownBits Known = DAG.computeKnownBits(Src);
	  unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
	  if (NumLeadingZeroBits >= (InEltVT.getSizeInBits() - NumPackedZeroBits))
	    return truncateVectorWithPACK(X86ISD::PACKUS, VT, Src, DL, DAG, Subtarget);

	  // Use PACKSS if the input has sign-bits that extend all the way to the
	  // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
	  unsigned NumSignBits = DAG.ComputeNumSignBits(Src);

	  // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
	  // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
	  // on and combines/simplifications can't then use it.
	  if (EltVT == MVT::i32 && NumSignBits != InEltVT.getSizeInBits())
	    return SDValue();

	  if (NumSignBits <= (InEltVT.getSizeInBits() - NumPackedSignBits))
	    return SDValue();

	  return truncateVectorWithPACK(X86ISD::PACKSS, VT, Src, DL, DAG, Subtarget);
	}

	// Try to form a MULHU or MULHS node from the pattern
	// (trunc (srl (mul ext, ext), 16))
	// where Src is the operand of the truncate and VT is the truncated type.
	// Returns an empty SDValue if the pattern does not match.
	// TODO: This is X86 specific because we want to be able to handle wide types
	// before type legalization. But we can only do it if the vector will be
	// legalized via widening/splitting. Type legalization can't handle promotion
	// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
	// combiner.
	static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
	                            SelectionDAG &DAG, const X86Subtarget &Subtarget) {
	  // The source has to be a logical right shift of a multiply.
	  if (Src.getOpcode() != ISD::SRL)
	    return SDValue();
	  SDValue Mul = Src.getOperand(0);
	  if (Mul.getOpcode() != ISD::MUL)
	    return SDValue();

	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  // Only handle vXi16 types that are at least 128-bits unless they will be
	  // widened.
	  if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
	    return SDValue();

	  // The wide type must have elements of at least 32 bits.
	  if (Src.getValueType().getVectorElementType().getSizeInBits() < 32)
	    return SDValue();

	  // The shift amount must be a constant splat of 16.
	  APInt SplatShift;
	  if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), SplatShift))
	    return SDValue();
	  if (SplatShift != 16)
	    return SDValue();

	  // Both multiply operands must be the same kind of extension.
	  SDValue MulLHS = Mul.getOperand(0);
	  SDValue MulRHS = Mul.getOperand(1);
	  const unsigned ExtOpc = MulLHS.getOpcode();
	  const bool IsSigned = ExtOpc == ISD::SIGN_EXTEND;
	  if (!IsSigned && ExtOpc != ISD::ZERO_EXTEND)
	    return SDValue();
	  if (MulRHS.getOpcode() != ExtOpc)
	    return SDValue();

	  // Look through the extends; their inputs must already have the result type.
	  SDValue NarrowLHS = MulLHS.getOperand(0);
	  SDValue NarrowRHS = MulRHS.getOperand(0);
	  if (NarrowLHS.getValueType() != VT || NarrowRHS.getValueType() != VT)
	    return SDValue();

	  unsigned MulHiOpc = IsSigned ? ISD::MULHS : ISD::MULHU;
	  return DAG.getNode(MulHiOpc, DL, VT, NarrowLHS, NarrowRHS);
	}

	// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
	// from one vector with signed bytes from another vector, adds together
	// adjacent pairs of 16-bit products, and saturates the result before
	// truncating to 16-bits.
	//
	// Which looks something like this:
	// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
	// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
	//
	// In is the value being truncated and VT is the truncated type. Only vXi16
	// types with a power-of-2 element count of at least 8 are considered, and
	// SSSE3 is required. Returns the X86ISD::VPMADDUBSW based replacement, or an
	// empty SDValue if the pattern does not match.
	static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	const SDLoc &DL) {
	if (!VT.isVector() \|\| !Subtarget.hasSSSE3())
	return SDValue();

	unsigned NumElems = VT.getVectorNumElements();
	EVT ScalarVT = VT.getVectorElementType();
	if (ScalarVT != MVT::i16 \|\| NumElems < 8 \|\| !isPowerOf2_32(NumElems))
	return SDValue();

	// The truncated value must be a signed saturation of an ADD.
	SDValue SSatVal = detectSSatPattern(In, VT);
	if (!SSatVal \|\| SSatVal.getOpcode() != ISD::ADD)
	return SDValue();

	// Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
	// of multiplies from even/odd elements.
	SDValue N0 = SSatVal.getOperand(0);
	SDValue N1 = SSatVal.getOperand(1);

	if (N0.getOpcode() != ISD::MUL \|\| N1.getOpcode() != ISD::MUL)
	return SDValue();

	SDValue N00 = N0.getOperand(0);
	SDValue N01 = N0.getOperand(1);
	SDValue N10 = N1.getOperand(0);
	SDValue N11 = N1.getOperand(1);

	// TODO: Handle constant vectors and use knownbits/computenumsignbits?
	// Canonicalize zero_extend to LHS.
	if (N01.getOpcode() == ISD::ZERO_EXTEND)
	std::swap(N00, N01);
	if (N11.getOpcode() == ISD::ZERO_EXTEND)
	std::swap(N10, N11);

	// Ensure we have a zero_extend and a sign_extend.
	if (N00.getOpcode() != ISD::ZERO_EXTEND \|\|
	N01.getOpcode() != ISD::SIGN_EXTEND \|\|
	N10.getOpcode() != ISD::ZERO_EXTEND \|\|
	N11.getOpcode() != ISD::SIGN_EXTEND)
	return SDValue();

	// Peek through the extends.
	N00 = N00.getOperand(0);
	N01 = N01.getOperand(0);
	N10 = N10.getOperand(0);
	N11 = N11.getOperand(0);

	// Ensure the extend is from vXi8.
	if (N00.getValueType().getVectorElementType() != MVT::i8 \|\|
	N01.getValueType().getVectorElementType() != MVT::i8 \|\|
	N10.getValueType().getVectorElementType() != MVT::i8 \|\|
	N11.getValueType().getVectorElementType() != MVT::i8)
	return SDValue();

	// All inputs should be build_vectors.
	if (N00.getOpcode() != ISD::BUILD_VECTOR \|\|
	N01.getOpcode() != ISD::BUILD_VECTOR \|\|
	N10.getOpcode() != ISD::BUILD_VECTOR \|\|
	N11.getOpcode() != ISD::BUILD_VECTOR)
	return SDValue();

	// N00/N10 are zero extended. N01/N11 are sign extended.

	// For each element, we need to ensure we have an odd element from one vector
	// multiplied by the odd element of another vector and the even element from
	// one of the same vectors being multiplied by the even element from the
	// other vector. So we need to make sure for each element i, this operator
	// is being performed:
	// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
	// ZExtIn/SExtIn record the single source vector that every zero-extended /
	// sign-extended element must be extracted from.
	SDValue ZExtIn, SExtIn;
	for (unsigned i = 0; i != NumElems; ++i) {
	SDValue N00Elt = N00.getOperand(i);
	SDValue N01Elt = N01.getOperand(i);
	SDValue N10Elt = N10.getOperand(i);
	SDValue N11Elt = N11.getOperand(i);
	// TODO: Be more tolerant to undefs.
	if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	return SDValue();
	// Every extract must use a constant index.
	auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
	auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
	auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
	auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
	if (!ConstN00Elt \|\| !ConstN01Elt \|\| !ConstN10Elt \|\| !ConstN11Elt)
	return SDValue();
	unsigned IdxN00 = ConstN00Elt->getZExtValue();
	unsigned IdxN01 = ConstN01Elt->getZExtValue();
	unsigned IdxN10 = ConstN10Elt->getZExtValue();
	unsigned IdxN11 = ConstN11Elt->getZExtValue();
	// Add is commutative so indices can be reordered.
	if (IdxN00 > IdxN10) {
	std::swap(IdxN00, IdxN10);
	std::swap(IdxN01, IdxN11);
	}
	// N0 indices must be the even element. N1 indices must be the next odd
	// element.
	if (IdxN00 != 2 * i \|\| IdxN10 != 2 * i + 1 \|\|
	IdxN01 != 2 * i \|\| IdxN11 != 2 * i + 1)
	return SDValue();
	SDValue N00In = N00Elt.getOperand(0);
	SDValue N01In = N01Elt.getOperand(0);
	SDValue N10In = N10Elt.getOperand(0);
	SDValue N11In = N11Elt.getOperand(0);
	// First time we find an input capture it.
	if (!ZExtIn) {
	ZExtIn = N00In;
	SExtIn = N01In;
	}
	// All elements must come from the same two source vectors.
	if (ZExtIn != N00In \|\| SExtIn != N01In \|\|
	ZExtIn != N10In \|\| SExtIn != N11In)
	return SDValue();
	}

	auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	ArrayRef<SDValue> Ops) {
	// Build one VPMADDUBSW from two i8 vectors of the same type; the result is
	// an i16 vector with half as many elements as each input.
	EVT InVT = Ops[0].getValueType();
	assert(InVT.getScalarType() == MVT::i8 &&
	"Unexpected scalar element type");
	assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
	EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
	InVT.getVectorNumElements() / 2);
	return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
	};
	// Hand the two source vectors and the builder to SplitOpsAndApply to create
	// the final value of type VT.
	return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
	PMADDBuilder);
	}

	/// DAG combine entry point for a truncate node N. The individual strategies
	/// are tried in a fixed order and the first one that produces a value wins.
	static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);

	  // 1. Pre-truncate the inputs of an arithmetic op instead.
	  if (SDValue Res = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
	    return Res;

	  // 2. AVG pattern.
	  if (SDValue Res = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
	    return Res;

	  // 3. PMADDUBSW pattern.
	  if (SDValue Res = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
	    return Res;

	  // 4. Truncation combined with signed/unsigned saturation.
	  if (SDValue Res = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
	    return Res;

	  // 5. PMULHUW/PMULHW for vXi16.
	  if (SDValue Res = combinePMULH(Src, VT, DL, DAG, Subtarget))
	    return Res;

	  // 6. An i32 truncate of a bitcast whose source is a direct x86mmx value
	  //    becomes MMX_MOVD2W.
	  if (VT == MVT::i32 && Src.getOpcode() == ISD::BITCAST) {
	    SDValue MMXVal = Src.getOperand(0);
	    if (MMXVal.getValueType() == MVT::x86mmx)
	      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, MMXVal);
	  }

	  // 7. Truncate extended sign/zero bits with PACKSS/PACKUS.
	  if (SDValue Res = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
	    return Res;

	  // 8. Finally, the generic PACKUS/PACKSS based vector truncation.
	  return combineVectorTruncation(N, DAG, Subtarget);
	}

	/// DAG combine for a VTRUNC-style node N: fold a signed or unsigned saturation
	/// pattern on the input into X86ISD::VTRUNCS / X86ISD::VTRUNCUS, otherwise try
	/// to simplify the node with all bits of each result element demanded.
	static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI) {
	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);

	  // Signed saturation is tried before unsigned saturation.
	  SDValue SignedSat = detectSSatPattern(Src, VT);
	  if (SignedSat)
	    return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SignedSat);

	  SDValue UnsignedSat = detectUSatPattern(Src, VT, DAG, DL);
	  if (UnsignedSat)
	    return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, UnsignedSat);

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  APInt AllBits(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
	  const bool Simplified = TLI.SimplifyDemandedBits(SDValue(N, 0), AllBits, DCI);
	  return Simplified ? SDValue(N, 0) : SDValue();
	}

	/// If node \p N flips the sign of an FP value, return the value that is being
	/// negated; otherwise return an empty SDValue.
	///
	/// The negation may appear as FNEG(x), as XOR/FXOR(x, C) or as FSUB(C, x),
	/// where every defined element of the constant C is a sign-bit mask.
	/// AVX512F does not have FXOR, so FNEG is lowered as
	/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))); for that
	/// reason bitcasts are looked through, as long as the element size is kept.
	/// A shuffle or element insert whose other input is undef is also accepted
	/// when its source is itself a negation; the same shuffle/insert of the
	/// un-negated source is returned in that case.
	static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
	  if (N->getOpcode() == ISD::FNEG)
	    return N->getOperand(0);

	  // Bound the recursion so that it cannot blow up.
	  if (Depth > SelectionDAG::MaxRecursionDepth)
	    return SDValue();

	  const unsigned EltBits = N->getValueType(0).getScalarSizeInBits();
	  SDValue Src = peekThroughBitcasts(SDValue(N, 0));
	  EVT SrcVT = Src->getValueType(0);

	  // The bitcasts we skipped must not have changed the element size.
	  if (SrcVT.getScalarSizeInBits() != EltBits)
	    return SDValue();

	  const unsigned SrcOpc = Src.getOpcode();
	  switch (SrcOpc) {
	  case ISD::VECTOR_SHUFFLE: {
	    // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
	    // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
	    if (!Src.getOperand(1).isUndef())
	      return SDValue();
	    SDValue NegVec = isFNEG(DAG, Src.getOperand(0).getNode(), Depth + 1);
	    // FIXME: Can we do better than requiring matching types?
	    if (NegVec && NegVec.getValueType() == SrcVT)
	      return DAG.getVectorShuffle(SrcVT, SDLoc(Src), NegVec,
	                                  DAG.getUNDEF(SrcVT),
	                                  cast<ShuffleVectorSDNode>(Src)->getMask());
	    break;
	  }
	  case ISD::INSERT_VECTOR_ELT: {
	    // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
	    // -V, INDEX).
	    SDValue BaseVec = Src.getOperand(0);
	    SDValue Elt = Src.getOperand(1);
	    if (!BaseVec.isUndef())
	      return SDValue();
	    SDValue NegElt = isFNEG(DAG, Elt.getNode(), Depth + 1);
	    // FIXME: same type restriction as for the shuffle case.
	    if (NegElt && NegElt.getValueType() == SrcVT.getVectorElementType())
	      return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Src), SrcVT, BaseVec,
	                         NegElt, Src.getOperand(2));
	    break;
	  }
	  case ISD::FSUB:
	  case ISD::XOR:
	  case X86ISD::FXOR: {
	    // XOR and FXOR carry the sign-mask constant in operand 1; FSUB carries
	    // it in operand 0, so the roles of the two operands are exchanged.
	    const bool IsFSub = SrcOpc == ISD::FSUB;
	    SDValue Negated = Src.getOperand(IsFSub ? 1 : 0);
	    SDValue MaskOp = Src.getOperand(IsFSub ? 0 : 1);

	    // Extract the constant bits and require each defined element to be a
	    // sign-bit mask. Undef elements are ignored.
	    APInt UndefElts;
	    SmallVector<APInt, 16> MaskBits;
	    if (!getTargetConstantBitsFromNode(MaskOp, EltBits, UndefElts, MaskBits,
	                                       /* AllowWholeUndefs */ true,
	                                       /* AllowPartialUndefs */ false))
	      break;

	    for (unsigned Idx = 0, NumElts = MaskBits.size(); Idx != NumElts; ++Idx)
	      if (!UndefElts[Idx] && !MaskBits[Idx].isSignMask())
	        return SDValue();

	    return peekThroughBitcasts(Negated);
	  }
	  }

	  return SDValue();
	}

	// Map an FMA-family opcode to the opcode obtained after applying the requested
	// negations. The three tables below are applied one after another, each
	// working on the opcode produced by the previous one:
	//   NegMul - toggles between the FM* and FNM* forms (e.g. FMA <-> FNMADD).
	//   NegAcc - toggles between the *ADD and *SUB forms (e.g. FMA <-> FMSUB),
	//            and between FMADDSUB and FMSUBADD.
	//   NegRes - toggles both at once (e.g. FMA <-> FNMSUB, FMSUB <-> FNMADD).
	// NOTE(review): judging by the parameter names these correspond to negating
	// the product, the accumulator and the whole result respectively - confirm.
	// Any opcode that is missing from an enabled table hits llvm_unreachable.
	static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
	bool NegRes) {
	if (NegMul) {
	// FMADDSUB/FMSUBADD variants are not handled by this table.
	switch (Opcode) {
	default: llvm_unreachable("Unexpected opcode");
	case ISD::FMA: Opcode = X86ISD::FNMADD; break;
	case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
	case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
	case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
	case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
	case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
	case X86ISD::FNMADD: Opcode = ISD::FMA; break;
	case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
	case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
	case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
	case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
	case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
	}
	}

	if (NegAcc) {
	// This is the only table that accepts the FMADDSUB/FMSUBADD variants.
	switch (Opcode) {
	default: llvm_unreachable("Unexpected opcode");
	case ISD::FMA: Opcode = X86ISD::FMSUB; break;
	case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
	case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
	case X86ISD::FMSUB: Opcode = ISD::FMA; break;
	case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
	case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
	case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
	case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
	case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
	case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
	case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
	case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
	case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
	case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
	case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
	case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
	}
	}

	if (NegRes) {
	// No STRICT_* entries here: a strict opcode reaching this table is treated
	// as unreachable.
	switch (Opcode) {
	// For accuracy reason, we never combine fneg and fma under strict FP.
	default: llvm_unreachable("Unexpected opcode");
	case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
	case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
	case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
	case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
	case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
	case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
	case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
	case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
	}
	}

	// Opcode is returned unchanged when none of the flags is set.
	return Opcode;
	}

	/// Do target-specific dag combines on floating point negations.
	///
	/// \p N is any node that isFNEG() recognizes as a sign flip. On success the
	/// replacement value is returned, bitcast back to the type of \p N; an empty
	/// SDValue is returned when \p N is not a negation, the negated value has an
	/// illegal type, or no cheaper form was found.
	static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  EVT OrigVT = N->getValueType(0);
	  SDValue Arg = isFNEG(DAG, N);
	  if (!Arg)
	    return SDValue();

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT VT = Arg.getValueType();
	  EVT SVT = VT.getScalarType();
	  SDLoc DL(N);

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!TLI.isTypeLegal(VT))
	    return SDValue();

	  // If we're negating a FMUL node on a target with FMA, then we can avoid the
	  // use of a constant by performing (-0 - A*B) instead.
	  // FIXME: Check rounding control flags as well once it becomes available.
	  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
	      Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
	    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
	    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
	                                  Arg.getOperand(1), Zero);
	    return DAG.getBitcast(OrigVT, NewNode);
	  }

	  // Otherwise ask the target lowering for a negated form of the operand.
	  bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
	  bool LegalOperations = !DCI.isBeforeLegalizeOps();
	  if (SDValue NegArg =
	          TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
	    return DAG.getBitcast(OrigVT, NegArg);

	  return SDValue();
	}

	// X86 override of getNegatedExpression: build the negation of \p Op and
	// report its cost through \p Cost. Handles sign-flip patterns recognized by
	// isFNEG(), the FMA family (by switching opcode) and FRCP (by negating its
	// operand); everything else is forwarded to the generic TargetLowering
	// implementation.
	SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
	                                                bool LegalOperations,
	                                                bool ForCodeSize,
	                                                NegatibleCost &Cost,
	                                                unsigned Depth) const {
	  // fneg patterns are removable even if they have multiple uses.
	  if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
	    Cost = NegatibleCost::Cheaper;
	    return DAG.getBitcast(Op.getValueType(), Arg);
	  }

	  EVT VT = Op.getValueType();
	  EVT SVT = VT.getScalarType();
	  unsigned Opc = Op.getOpcode();
	  switch (Opc) {
	  case ISD::FMA:
	  case X86ISD::FMSUB:
	  case X86ISD::FNMADD:
	  case X86ISD::FNMSUB:
	  case X86ISD::FMADD_RND:
	  case X86ISD::FMSUB_RND:
	  case X86ISD::FNMADD_RND:
	  case X86ISD::FNMSUB_RND: {
	    // Only single-use f32/f64 FMAs of a legal type on an FMA-capable
	    // subtarget are rewritten here.
	    if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
	        !(SVT == MVT::f32 || SVT == MVT::f64) ||
	        !isOperationLegal(ISD::FMA, VT))
	      break;

	    // This is always negatible for free but we might be able to remove some
	    // extra operand negations as well.
	    SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
	    for (int i = 0; i != 3; ++i)
	      NewOps[i] = getCheaperNegatedExpression(
	          Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);

	    bool NegA = !!NewOps[0];
	    bool NegB = !!NewOps[1];
	    bool NegC = !!NewOps[2];
	    // Negating both multiplicands cancels out, hence NegA != NegB.
	    unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);

	    Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
	                                  : NegatibleCost::Neutral;

	    // Fill in the non-negated ops with the original values.
	    for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
	      if (!NewOps[i])
	        NewOps[i] = Op.getOperand(i);
	    return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
	  }
	  case X86ISD::FRCP:
	    // Push the negation onto the FRCP operand when that operand can be
	    // negated.
	    if (SDValue NegOp0 =
	            getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
	                                 ForCodeSize, Cost, Depth + 1))
	      return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
	    break;
	  }

	  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
	                                              ForCodeSize, Cost, Depth);
	}

	// Rewrite a vector X86 FP logic node (FOR / FXOR / FAND / FANDN) as the
	// matching integer logic op applied to bitcast operands, with the result
	// bitcast back to the FP type. Returns an empty SDValue for scalar types or
	// when SSE2 is not available.
	static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
	                                 const X86Subtarget &Subtarget) {
	  MVT VT = N->getSimpleValueType(0);
	  // If we have integer vector types available, use the integer opcodes.
	  if (!VT.isVector() || !Subtarget.hasSSE2())
	    return SDValue();

	  SDLoc dl(N);

	  // Integer vector type with the same element width and total size as VT.
	  unsigned IntBits = VT.getScalarSizeInBits();
	  MVT IntSVT = MVT::getIntegerVT(IntBits);
	  MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);

	  SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
	  SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
	  unsigned IntOpcode;
	  switch (N->getOpcode()) {
	  default: llvm_unreachable("Unexpected FP logic op");
	  case X86ISD::FOR:   IntOpcode = ISD::OR; break;
	  case X86ISD::FXOR:  IntOpcode = ISD::XOR; break;
	  case X86ISD::FAND:  IntOpcode = ISD::AND; break;
	  case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
	  }
	  SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
	  return DAG.getBitcast(VT, IntOp);
	}


	/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
	///
	/// Returns an empty SDValue unless \p N is an ISD::XOR whose second operand
	/// is the constant one and whose first operand is an X86ISD::SETCC.
	static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
	  if (N->getOpcode() != ISD::XOR)
	    return SDValue();

	  SDValue LHS = N->getOperand(0);
	  if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
	    return SDValue();

	  // Operand 0 of the SETCC holds the condition code; operand 1 is reused
	  // unchanged for the new SETCC.
	  X86::CondCode NewCC = X86::GetOppositeBranchCondition(
	      X86::CondCode(LHS->getConstantOperandVal(0)));
	  SDLoc DL(N);
	  return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
	}

	// Target combine for XOR nodes. The folds are attempted in a fixed order; the
	// ones after the isBeforeLegalizeOps() check only run once operations have
	// been legalized.
	static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI,
	                          const X86Subtarget &Subtarget) {
	  // If this is SSE1 only convert to FXOR to avoid scalarization.
	  const bool SSE1Only = Subtarget.hasSSE1() && !Subtarget.hasSSE2();
	  if (SSE1Only && N->getValueType(0) == MVT::v4i32) {
	    SDValue FpLHS = DAG.getBitcast(MVT::v4f32, N->getOperand(0));
	    SDValue FpRHS = DAG.getBitcast(MVT::v4f32, N->getOperand(1));
	    SDValue FpXor =
	        DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32, FpLHS, FpRHS);
	    return DAG.getBitcast(MVT::v4i32, FpXor);
	  }

	  if (SDValue ShiftCmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
	    return ShiftCmp;

	  if (SDValue MovMsk = combineBitOpWithMOVMSK(N, DAG))
	    return MovMsk;

	  // The remaining folds wait until operations are legal.
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  if (SDValue InvertedSetCC = foldXor1SetCC(N, DAG))
	    return InvertedSetCC;

	  if (SDValue TruncCmp = foldXorTruncShiftIntoCmp(N, DAG))
	    return TruncCmp;

	  if (SDValue AsFPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
	    return AsFPLogic;

	  // Last resort: the xor may be an FP negation in disguise.
	  return combineFneg(N, DAG, DCI, Subtarget);
	}

	// Target combine for BEXTR nodes: currently only tries to simplify the node
	// with every result bit demanded.
	static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
	                            TargetLowering::DAGCombinerInfo &DCI,
	                            const X86Subtarget &Subtarget) {
	  // TODO - Constant Folding.

	  // Simplify the inputs, demanding all bits of the result.
	  const unsigned ResultBits = N->getValueType(0).getSizeInBits();
	  APInt AllBits(APInt::getAllOnesValue(ResultBits));
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.SimplifyDemandedBits(SDValue(N, 0), AllBits, DCI))
	    return SDValue();

	  return SDValue(N, 0);
	}

	// Return true if \p V is a null FP constant or an all-zeros build vector.
	static bool isNullFPScalarOrVectorConst(SDValue V) {
	  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
	}

	/// Given a scalar FP zero or a vector FP zero (possibly with undefined
	/// elements), produce a zero constant that can stand in for it when folding.
	/// For vectors a fresh zero vector is built, so the result carries no
	/// undefined elements even if \p V does; that makes it safe as a replacement
	/// operand for operations (eg, bitwise-and) where an undef should not
	/// propagate. Returns an empty SDValue when \p V is not such a zero.
	static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  if (!isNullFPScalarOrVectorConst(V))
	    return SDValue();

	  // A scalar zero can be reused as it is.
	  if (!V.getValueType().isVector())
	    return V;

	  return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
	}

	// Fold an FAND with an inverted operand (FXOR with an all-ones constant) into
	// an FANDN. Only applies to f32 with SSE1, f64 with SSE2, and v4f32 on
	// SSE1-only subtargets; returns an empty SDValue otherwise.
	static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
	  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
	        (VT == MVT::f64 && Subtarget.hasSSE2()) ||
	        (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
	    return SDValue();

	  // True for an all-ones build vector, or a scalar FP constant whose value
	  // is all ones.
	  auto isAllOnesConstantFP = [](SDValue V) {
	    if (V.getSimpleValueType().isVector())
	      return ISD::isBuildVectorAllOnes(V.getNode());
	    auto *C = dyn_cast<ConstantFPSDNode>(V);
	    return C && C->getConstantFPValue()->isAllOnesValue();
	  };

	  // fand (fxor X, -1), Y --> fandn X, Y
	  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
	    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);

	  // fand X, (fxor Y, -1) --> fandn Y, X
	  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
	    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);

	  return SDValue();
	}

	/// Target-specific dag combines for X86ISD::FAND nodes: fold away a zero
	/// operand, form FANDN from an inverted operand, and otherwise fall back to
	/// the integer-logic lowering.
	static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  // FAND(0.0, x) -> 0.0 and FAND(x, 0.0) -> 0.0, checking operand 0 first.
	  for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx)
	    if (SDValue Zero =
	            getNullFPConstForNullVal(N->getOperand(OpIdx), DAG, Subtarget))
	      return Zero;

	  if (SDValue AndNot = combineFAndFNotToFAndn(N, DAG, Subtarget))
	    return AndNot;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Target-specific dag combines for X86ISD::FANDN nodes: fold away a zero
	/// operand, otherwise fall back to the integer-logic lowering.
	static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // FANDN(0.0, x) -> x
	  if (isNullFPScalarOrVectorConst(LHS))
	    return RHS;

	  // FANDN(x, 0.0) -> 0.0
	  if (SDValue Zero = getNullFPConstForNullVal(RHS, DAG, Subtarget))
	    return Zero;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes:
	/// drop a zero operand, try the FP-negation combine, and otherwise fall back
	/// to the integer-logic lowering.
	static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI,
	                          const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

	  // F[X]OR(0.0, x) -> x
	  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
	    return N->getOperand(1);

	  // F[X]OR(x, 0.0) -> x
	  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
	    return N->getOperand(0);

	  if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
	    return NewVal;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
	///
	/// When both the NoNaNsFPMath and NoSignedZerosFPMath target options are set,
	/// the node is replaced by its commutative counterpart (FMINC / FMAXC);
	/// otherwise an empty SDValue is returned.
	static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
	  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

	  // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
	  if (!DAG.getTarget().Options.NoNaNsFPMath ||
	      !DAG.getTarget().Options.NoSignedZerosFPMath)
	    return SDValue();

	  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
	  // into FMINC and FMAXC, which are Commutative operations.
	  unsigned NewOp = 0;
	  switch (N->getOpcode()) {
	  default: llvm_unreachable("unknown opcode");
	  case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
	  case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
	  }

	  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
	                     N->getOperand(0), N->getOperand(1));
	}

	// Lower an FMINNUM / FMAXNUM node \p N to X86ISD::FMIN / X86ISD::FMAX.
	// If NaNs can be ignored, or one operand is known never to be NaN, this is
	// a single min/max node. Otherwise the min/max is combined with a select
	// on "Op0 is NaN". Returns an empty SDValue for soft-float, unsupported
	// types, and scalar operations in functions that are optimized for minimum
	// size.
	static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  if (Subtarget.useSoftFloat())
	    return SDValue();

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  EVT VT = N->getValueType(0);
	  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
	        (Subtarget.hasSSE2() && VT == MVT::f64) ||
	        (VT.isVector() && TLI.isTypeLegal(VT))))
	    return SDValue();

	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);
	  SDLoc DL(N);
	  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;

	  // If we don't have to respect NaN inputs, this is a direct translation to x86
	  // min/max instructions.
	  if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
	    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());

	  // If one of the operands is known non-NaN use the native min/max instructions
	  // with the non-NaN input as second operand.
	  if (DAG.isKnownNeverNaN(Op1))
	    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
	  if (DAG.isKnownNeverNaN(Op0))
	    return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());

	  // If we have to respect NaN inputs, this takes at least 3 instructions.
	  // Favor a library call when operating on a scalar and minimizing code size.
	  if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
	    return SDValue();

	  EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
	                                         VT);

	  // There are 4 possibilities involving NaN inputs, and these are the required
	  // outputs:
	  //                   Op1
	  //               Num     NaN
	  //            ----------------
	  //       Num  |  Max  |  Op0 |
	  // Op0        ----------------
	  //       NaN  |  Op1  |  NaN |
	  //            ----------------
	  //
	  // The SSE FP max/min instructions were not designed for this case, but rather
	  // to implement:
	  //   Min = Op1 < Op0 ? Op1 : Op0
	  //   Max = Op1 > Op0 ? Op1 : Op0
	  //
	  // So they always return Op0 if either input is a NaN. However, we can still
	  // use those instructions for fmaxnum by selecting away a NaN input.

	  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
	  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
	  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

	  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
	  // are NaN, the NaN value of Op1 is the result.
	  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
	}

	// Target combine for X86 int-to-fp conversion nodes: simplify the node via
	// demanded vector elements, and narrow a full-width vector load feeding the
	// conversion to a vzload when only part of the loaded vector is converted.
	static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
	                                   TargetLowering::DAGCombinerInfo &DCI) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT VT = N->getValueType(0);
	  const unsigned NumResultElts = VT.getVectorNumElements();

	  // All result elements are demanded.
	  APInt KnownUndef, KnownZero;
	  APInt AllElts = APInt::getAllOnesValue(NumResultElts);
	  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), AllElts, KnownUndef,
	                                     KnownZero, DCI))
	    return SDValue(N, 0);

	  // Convert a full vector load into vzload when not all bits are needed.
	  SDValue In = N->getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  const bool FromWiderLoad = NumResultElts < InVT.getVectorNumElements() &&
	                             ISD::isNormalLoad(In.getNode()) &&
	                             In.hasOneUse();
	  if (!FromWiderLoad)
	    return SDValue();

	  assert(InVT.is128BitVector() && "Expected 128-bit input vector");
	  LoadSDNode *Load = cast<LoadSDNode>(In);
	  // Only this many low bits of the loaded vector feed the conversion.
	  const unsigned UsedBits = InVT.getScalarSizeInBits() * NumResultElts;
	  MVT MemVT = MVT::getIntegerVT(UsedBits);
	  MVT LoadVT = MVT::getVectorVT(MemVT, 128 / UsedBits);
	  SDValue VZLoad = narrowLoadToVZLoad(Load, MemVT, LoadVT, DAG);
	  if (!VZLoad)
	    return SDValue();

	  SDLoc dl(N);
	  SDValue NarrowIn = DAG.getBitcast(InVT, VZLoad);
	  SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, NarrowIn);
	  DCI.CombineTo(N, Convert);
	  // Redirect users of the old load's chain to the new load, then drop it.
	  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), VZLoad.getValue(1));
	  DCI.recursivelyDeleteUnusedNodes(Load);
	  return SDValue(N, 0);
	}

	// Target combine for the CVTP2I / CVTTP2I family (strict variants included):
	// narrow a full-width vector load feeding the conversion to a vzload when
	// only part of the loaded vector is converted.
	static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
	                                     TargetLowering::DAGCombinerInfo &DCI) {
	  const bool IsStrict = N->isTargetStrictFPOpcode();
	  EVT VT = N->getValueType(0);
	  const unsigned NumResultElts = VT.getVectorNumElements();

	  // Convert a full vector load into vzload when not all bits are needed.
	  // Strict nodes carry the chain in operand 0, so the source is operand 1.
	  SDValue In = N->getOperand(IsStrict ? 1 : 0);
	  MVT InVT = In.getSimpleValueType();
	  const bool FromWiderLoad = NumResultElts < InVT.getVectorNumElements() &&
	                             ISD::isNormalLoad(In.getNode()) &&
	                             In.hasOneUse();
	  if (!FromWiderLoad)
	    return SDValue();

	  assert(InVT.is128BitVector() && "Expected 128-bit input vector");
	  LoadSDNode *Load = cast<LoadSDNode>(In);
	  // Only this many low bits of the loaded vector feed the conversion.
	  const unsigned UsedBits = InVT.getScalarSizeInBits() * NumResultElts;
	  MVT MemVT = MVT::getFloatingPointVT(UsedBits);
	  MVT LoadVT = MVT::getVectorVT(MemVT, 128 / UsedBits);
	  SDValue VZLoad = narrowLoadToVZLoad(Load, MemVT, LoadVT, DAG);
	  if (!VZLoad)
	    return SDValue();

	  SDLoc dl(N);
	  SDValue NarrowIn = DAG.getBitcast(InVT, VZLoad);
	  if (IsStrict) {
	    // Keep the incoming chain and replace both the value and chain results.
	    SDValue Convert = DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
	                                  {N->getOperand(0), NarrowIn});
	    DCI.CombineTo(N, Convert, Convert.getValue(1));
	  } else {
	    SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, NarrowIn);
	    DCI.CombineTo(N, Convert);
	  }

	  // Redirect users of the old load's chain to the new load, then drop it.
	  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), VZLoad.getValue(1));
	  DCI.recursivelyDeleteUnusedNodes(Load);
	  return SDValue(N, 0);
	}

	/// Target-specific dag combines for X86ISD::ANDNP nodes: fold away zero
	/// operands, turn ANDNP of an inverted value back into AND, and try the
	/// recursive shuffle combiner on byte-multiple vector element types.
	static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
	                            TargetLowering::DAGCombinerInfo &DCI,
	                            const X86Subtarget &Subtarget) {
	  MVT VT = N->getSimpleValueType(0);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // ANDNP(0, x) -> x
	  if (ISD::isBuildVectorAllZeros(LHS.getNode()))
	    return RHS;

	  // ANDNP(x, 0) -> 0
	  if (ISD::isBuildVectorAllZeros(RHS.getNode()))
	    return DAG.getConstant(0, SDLoc(N), VT);

	  // Turn ANDNP back to AND if input is inverted.
	  if (SDValue Not = IsNOT(LHS, DAG)) {
	    SDValue Uninverted = DAG.getBitcast(VT, Not);
	    return DAG.getNode(ISD::AND, SDLoc(N), VT, Uninverted, RHS);
	  }

	  // Attempt to recursively combine a bitmask ANDNP with shuffles.
	  const bool ByteMultipleElts =
	      VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0;
	  if (ByteMultipleElts) {
	    SDValue Op(N, 0);
	    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
	      return Res;
	  }

	  return SDValue();
	}

	// Target combine for BT nodes: simplify the bit-index operand knowing that
	// only its low log2(width) bits are demanded.
	static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
	                         TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue BitIdx = N->getOperand(1);

	  // BT ignores high bits in the bit index operand.
	  const unsigned IdxWidth = BitIdx.getValueSizeInBits();
	  APInt LowIdxBits = APInt::getLowBitsSet(IdxWidth, Log2_32(IdxWidth));
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.SimplifyDemandedBits(BitIdx, LowIdxBits, DCI))
	    return SDValue();

	  // Revisit N unless the simplification deleted it.
	  if (N->getOpcode() != ISD::DELETED_NODE)
	    DCI.AddToWorklist(N);
	  return SDValue(N, 0);
	}

	// Target combine for CVTPH2PS / STRICT_CVTPH2PS. For the v8i16 -> v4f32 form
	// only the low four source elements are demanded, so the source is
	// simplified accordingly and a full vector load is narrowed to a 64-bit
	// vzload.
	static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
	                               TargetLowering::DAGCombinerInfo &DCI) {
	  const bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
	  // Strict nodes carry the chain in operand 0, so the source is operand 1.
	  const unsigned SrcIdx = IsStrict ? 1 : 0;
	  SDValue Src = N->getOperand(SrcIdx);

	  if (N->getValueType(0) != MVT::v4f32 || Src.getValueType() != MVT::v8i16)
	    return SDValue();

	  // Only the low 4 of the 8 source elements are used.
	  APInt KnownUndef, KnownZero;
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  APInt LowHalf = APInt::getLowBitsSet(8, 4);
	  if (TLI.SimplifyDemandedVectorElts(Src, LowHalf, KnownUndef, KnownZero,
	                                     DCI)) {
	    // Revisit N unless the simplification deleted it.
	    if (N->getOpcode() != ISD::DELETED_NODE)
	      DCI.AddToWorklist(N);
	    return SDValue(N, 0);
	  }

	  // Convert a full vector load into vzload when not all bits are needed.
	  if (!ISD::isNormalLoad(Src.getNode()) || !Src.hasOneUse())
	    return SDValue();

	  LoadSDNode *Load = cast<LoadSDNode>(N->getOperand(SrcIdx));
	  SDValue VZLoad = narrowLoadToVZLoad(Load, MVT::i64, MVT::v2i64, DAG);
	  if (!VZLoad)
	    return SDValue();

	  SDLoc dl(N);
	  SDValue NarrowSrc = DAG.getBitcast(MVT::v8i16, VZLoad);
	  if (IsStrict) {
	    // Keep the incoming chain and replace both the value and chain results.
	    SDValue Convert =
	        DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
	                    {N->getOperand(0), NarrowSrc});
	    DCI.CombineTo(N, Convert, Convert.getValue(1));
	  } else {
	    SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32, NarrowSrc);
	    DCI.CombineTo(N, Convert);
	  }

	  // Redirect users of the old load's chain to the new load, then drop it.
	  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), VZLoad.getValue(1));
	  DCI.recursivelyDeleteUnusedNodes(Load);
	  return SDValue(N, 0);
	}

	// Try to combine sext_in_reg of a cmov of constants by extending the constants.
	//
	// Only handles sign extension from i8 or i16 of a single-use X86ISD::CMOV
	// whose two value operands are constants, optionally looking through one
	// single-use any_extend / truncate in between. Returns an empty SDValue
	// otherwise.
	static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
	  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);

	  EVT DstVT = N->getValueType(0);

	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();

	  if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
	    return SDValue();

	  // Look through single use any_extends / truncs.
	  SDValue IntermediateBitwidthOp;
	  if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
	      N0.hasOneUse()) {
	    IntermediateBitwidthOp = N0;
	    N0 = N0.getOperand(0);
	  }

	  // See if we have a single use cmov.
	  if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
	    return SDValue();

	  SDValue CMovOp0 = N0.getOperand(0);
	  SDValue CMovOp1 = N0.getOperand(1);

	  // Make sure both operands are constants.
	  if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
	      !isa<ConstantSDNode>(CMovOp1.getNode()))
	    return SDValue();

	  SDLoc DL(N);

	  // If we looked through an any_extend/trunc above, apply the same operation
	  // to the constants so that they have the destination type.
	  if (IntermediateBitwidthOp) {
	    unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
	    CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
	    CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
	  }

	  CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
	  CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);

	  EVT CMovVT = DstVT;
	  // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
	  if (DstVT == MVT::i16) {
	    CMovVT = MVT::i32;
	    CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
	    CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
	  }

	  // Rebuild the CMOV on the extended constants, reusing the remaining two
	  // operands of the original CMOV.
	  SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
	                             N0.getOperand(2), N0.getOperand(3));

	  if (CMovVT != DstVT)
	    CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);

	  return CMov;
	}

	// Target combine for SIGN_EXTEND_INREG: first try the cmov-of-constants fold,
	// then, for v4i64 results fed by an any/sign extend, move the in-register
	// extension to the narrower v4i32 type.
	static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);

	  if (SDValue V = combineSextInRegCmov(N, DAG))
	    return V;

	  EVT VT = N->getValueType(0);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
	  SDLoc dl(N);

	  // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
	  // both SSE and AVX2 since there is no sign-extended shift right
	  // operation on a vector with 64-bit elements.
	  //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
	  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
	  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
	                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
	    SDValue N00 = N0.getOperand(0);

	    // EXTLOAD has a better solution on AVX2,
	    // it may be replaced with X86ISD::VSEXT node.
	    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
	      if (!ISD::isNormalLoad(N00.getNode()))
	        return SDValue();

	    // Attempt to promote any comparison mask ops before moving the
	    // SIGN_EXTEND_INREG in the way.
	    if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
	      return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);

	    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
	      SDValue Tmp =
	          DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
	      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
	    }
	  }
	  return SDValue();
	}

	/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
	/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
	/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
	/// opportunities to combine math ops, use an LEA, or use a complex addressing
	/// mode. This can eliminate extend, add, and shift instructions.
	///
	/// Only fires for extensions to i64 of an 'add' with a constant second
	/// operand, and only when some user of the extension is itself an ADD or
	/// SHL. Returns an empty SDValue otherwise.
	static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
	      Ext->getOpcode() != ISD::ZERO_EXTEND)
	    return SDValue();

	  // TODO: This should be valid for other integer types.
	  EVT VT = Ext->getValueType(0);
	  if (VT != MVT::i64)
	    return SDValue();

	  SDValue Add = Ext->getOperand(0);
	  if (Add.getOpcode() != ISD::ADD)
	    return SDValue();

	  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
	  bool NSW = Add->getFlags().hasNoSignedWrap();
	  bool NUW = Add->getFlags().hasNoUnsignedWrap();

	  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
	  // into the 'zext'
	  if ((Sext && !NSW) || (!Sext && !NUW))
	    return SDValue();

	  // Having a constant operand to the 'add' ensures that we are not increasing
	  // the instruction count because the constant is extended for free below.
	  // A constant operand can also become the displacement field of an LEA.
	  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
	  if (!AddOp1)
	    return SDValue();

	  // Don't make the 'add' bigger if there's no hope of combining it with some
	  // other 'add' or 'shl' instruction.
	  // TODO: It may be profitable to generate simpler LEA instructions in place
	  // of single 'add' instructions, but the cost model for selecting an LEA
	  // currently has a high threshold.
	  bool HasLEAPotential = false;
	  for (auto *User : Ext->uses()) {
	    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
	      HasLEAPotential = true;
	      break;
	    }
	  }
	  if (!HasLEAPotential)
	    return SDValue();

	  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
	  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
	  SDValue AddOp0 = Add.getOperand(0);
	  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
	  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

	  // The wider add is guaranteed to not wrap because both operands are
	  // sign-extended.
	  SDNodeFlags Flags;
	  Flags.setNoSignedWrap(NSW);
	  Flags.setNoUnsignedWrap(NUW);
	  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
	}

	// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
	// operands and the result of CMOV is not used anywhere else - promote CMOV
	// itself instead of promoting its result. This could be beneficial, because:
	//   1) X86TargetLowering::EmitLoweredSelect later can do merging of two
	//      (or more) pseudo-CMOVs only when they go one-after-another and
	//      getting rid of result extension code after CMOV will help that.
	//   2) Promotion of constant CMOV arguments is free, hence the
	//      {ANY,SIGN,ZERO}_EXTEND will just be deleted.
	//   3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
	//      promotion is also good in terms of code-size.
	//      (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
	//      promotion).
	static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
	  SDValue CMovN = Extend->getOperand(0);
	  if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
	    return SDValue();

	  EVT TargetVT = Extend->getValueType(0);
	  unsigned ExtendOpcode = Extend->getOpcode();
	  SDLoc DL(Extend);

	  EVT VT = CMovN.getValueType();
	  SDValue CMovOp0 = CMovN.getOperand(0);
	  SDValue CMovOp1 = CMovN.getOperand(1);

	  // Both value operands of the CMOV must be constants.
	  if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
	      !isa<ConstantSDNode>(CMovOp1.getNode()))
	    return SDValue();

	  // Only extend to i32 or i64.
	  if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
	    return SDValue();

	  // Only extend from i16 unless its a sign_extend from i32. Zext/aext from i32
	  // are free.
	  if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
	    return SDValue();

	  // If this a zero extend to i64, we should only extend to i32 and use a free
	  // zero extend to finish.
	  EVT ExtendVT = TargetVT;
	  if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
	    ExtendVT = MVT::i32;

	  CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
	  CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);

	  // Rebuild the CMOV at the wider type, reusing the remaining two operands
	  // of the original CMOV.
	  SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
	                            CMovN.getOperand(2), CMovN.getOperand(3));

	  // Finish extending if needed.
	  if (ExtendVT != TargetVT)
	    Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);

	  return Res;
	}

	// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
	// This is more or less the reverse of combineBitcastvxi1.
	//
	// N must be a SIGN_EXTEND, ZERO_EXTEND or ANY_EXTEND node; any other opcode
	// is rejected. The combine only runs before op legalization, on targets that
	// have SSE2 but not AVX512. Returns an empty SDValue when it does not apply.
	static SDValue
	combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
	                               TargetLowering::DAGCombinerInfo &DCI,
	                               const X86Subtarget &Subtarget) {
	  unsigned Opcode = N->getOpcode();
	  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
	      Opcode != ISD::ANY_EXTEND)
	    return SDValue();
	  if (!DCI.isBeforeLegalizeOps())
	    return SDValue();
	  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
	    return SDValue();

	  SDValue N0 = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT SVT = VT.getScalarType();
	  EVT InSVT = N0.getValueType().getScalarType();
	  unsigned EltSizeInBits = SVT.getSizeInBits();

	  // Input type must be extending a bool vector (bit-casted from a scalar
	  // integer) to legal integer types.
	  if (!VT.isVector())
	    return SDValue();
	  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
	    return SDValue();
	  if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
	    return SDValue();

	  SDValue N00 = N0.getOperand(0);
	  EVT SclVT = N0.getOperand(0).getValueType();
	  if (!SclVT.isScalarInteger())
	    return SDValue();

	  SDLoc DL(N);
	  SDValue Vec;
	  SmallVector<int, 32> ShuffleMask;
	  unsigned NumElts = VT.getVectorNumElements();
	  assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");

	  // Broadcast the scalar integer to the vector elements.
	  if (NumElts > EltSizeInBits) {
	    // If the scalar integer is greater than the vector element size, then we
	    // must split it down into sub-sections for broadcasting. For example:
	    // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
	    // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
	    assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
	    unsigned Scale = NumElts / EltSizeInBits;
	    EVT BroadcastVT =
	        EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
	    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
	    Vec = DAG.getBitcast(VT, Vec);

	    // Each sub-section index i is repeated EltSizeInBits times in the mask.
	    for (unsigned i = 0; i != Scale; ++i)
	      ShuffleMask.append(EltSizeInBits, i);
	    Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
	  } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
	             (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
	    // If we have register broadcast instructions, use the scalar size as the
	    // element type for the shuffle. Then cast to the wider element type. The
	    // widened bits won't be used, and this might allow the use of a broadcast
	    // load.
	    assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
	    unsigned Scale = EltSizeInBits / NumElts;
	    // The broadcast vector has NumElts * Scale elements of the scalar type,
	    // matching the element count of the splat mask built below.
	    EVT BroadcastVT =
	        EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
	    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
	    ShuffleMask.append(NumElts * Scale, 0);
	    Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
	    Vec = DAG.getBitcast(VT, Vec);
	  } else {
	    // For smaller scalar integers, we can simply any-extend it to the vector
	    // element size (we don't care about the upper bits) and broadcast it to all
	    // elements.
	    SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
	    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
	    ShuffleMask.append(NumElts, 0);
	    Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
	  }

	  // Now, mask the relevant bit in each element.
	  SmallVector<SDValue, 32> Bits;
	  for (unsigned i = 0; i != NumElts; ++i) {
	    int BitIdx = (i % EltSizeInBits);
	    APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
	    Bits.push_back(DAG.getConstant(Bit, DL, SVT));
	  }
	  SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
	  Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);

	  // Compare against the bitmask and extend the result.
	  EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
	  Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
	  Vec = DAG.getSExtOrTrunc(Vec, DL, VT);

	  // For SEXT, this is now done, otherwise shift the result down for
	  // zero-extension.
	  if (Opcode == ISD::SIGN_EXTEND)
	    return Vec;
	  return DAG.getNode(ISD::SRL, DL, VT, Vec,
	                     DAG.getConstant(EltSizeInBits - 1, DL, VT));
	}

	// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
	// result type.
	//
	// Returns the widened setcc (zero-extended in-register for ZERO_EXTEND), or an
	// empty SDValue when the combine does not apply.
	static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDValue N0 = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  SDLoc dl(N);

	  // Only do this combine with AVX512 for vector extends.
	  if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
	    return SDValue();

	  // Only combine legal element types.
	  EVT SVT = VT.getVectorElementType();
	  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
	      SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
	    return SDValue();

	  // We can only do this if the vector size is 256 bits or less (wider vectors
	  // are only rejected when 512-bit registers are in use).
	  unsigned Size = VT.getSizeInBits();
	  if (Size > 256 && Subtarget.useAVX512Regs())
	    return SDValue();

	  // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
	  // those are the only integer compares we have.
	  ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
	  if (ISD::isUnsignedIntSetCC(CC))
	    return SDValue();

	  // Only do this combine if the extension will be fully consumed by the setcc.
	  EVT N00VT = N0.getOperand(0).getValueType();
	  EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
	  if (Size != MatchingVecType.getSizeInBits())
	    return SDValue();

	  SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);

	  // For zero-extension, clear everything above the original setcc width.
	  if (N->getOpcode() == ISD::ZERO_EXTEND)
	    Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());

	  return Res;
	}

	/// Target combine for a sign-extend node. Tries, in order: widening a
	/// SETCC_CARRY operand, extending a CMOV of constants, folding into a vector
	/// setcc, the "sext (xor Bool, -1)" rewrite, bool-vector in-register extension,
	/// mask arithmetic promotion and extension-before-add promotion. Returns an
	/// empty SDValue when none of them apply.
	static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDValue Src = N->getOperand(0);
	  EVT DstVT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  SDLoc Loc(N);

	  // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
	  if (Src.getOpcode() == X86ISD::SETCC_CARRY && !DCI.isBeforeLegalizeOps()) {
	    SDValue WideCarry = DAG.getNode(X86ISD::SETCC_CARRY, Loc, DstVT,
	                                    Src->getOperand(0), Src->getOperand(1));
	    // The use count is read before N is replaced.
	    const bool HasOtherUsers = !Src.hasOneUse();
	    DCI.CombineTo(N, WideCarry);
	    // Any remaining users of the narrow node get a truncate of the wide one.
	    if (HasOtherUsers) {
	      SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, SDLoc(Src),
	                                     Src.getValueType(), WideCarry);
	      DCI.CombineTo(Src.getNode(), Narrowed);
	    }
	    return SDValue(N, 0);
	  }

	  if (SDValue ExtCMov = combineToExtendCMOV(N, DAG))
	    return ExtCMov;

	  // Everything below is only attempted before op legalization.
	  if (!DCI.isBeforeLegalizeOps())
	    return SDValue();

	  if (SDValue WideSetcc = combineExtSetcc(N, DAG, Subtarget))
	    return WideSetcc;

	  // Invert and sign-extend a boolean is the same as zero-extend and subtract
	  // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
	  // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
	  // sext (xor Bool, -1) --> sub (zext Bool), 1
	  const bool IsInvertedBool =
	      SrcVT == MVT::i1 && Src.getOpcode() == ISD::XOR &&
	      isAllOnesConstant(Src.getOperand(1)) && Src.hasOneUse();
	  if (IsInvertedBool) {
	    SDValue WideBool =
	        DAG.getNode(ISD::ZERO_EXTEND, Loc, DstVT, Src.getOperand(0));
	    SDValue One = DAG.getConstant(1, Loc, DstVT);
	    return DAG.getNode(ISD::SUB, Loc, DstVT, WideBool, One);
	  }

	  if (SDValue InReg = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
	    return InReg;

	  if (DstVT.isVector()) {
	    if (SDValue Promoted = PromoteMaskArithmetic(N, DAG, Subtarget))
	      return Promoted;
	  }

	  if (SDValue ExtAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
	    return ExtAdd;

	  return SDValue();
	}

	/// Fold negations of the FMA operands into the FMA opcode itself.
	///
	/// For strict FP nodes operand 0 is the chain and A/B/C are operands 1-3;
	/// otherwise A/B/C are operands 0-2 and an optional fourth operand is passed
	/// through unchanged. Returns an empty SDValue if the type is not legal, the
	/// scalar type is not f32/f64, no FMA is available, or no operand could be
	/// cheaply negated.
	static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI,
	                          const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);
	  bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();

	  // Let legalize expand this if it isn't a legal type yet.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isTypeLegal(VT))
	    return SDValue();

	  EVT ScalarVT = VT.getScalarType();
	  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
	    return SDValue();

	  SDValue A = N->getOperand(IsStrict ? 1 : 0);
	  SDValue B = N->getOperand(IsStrict ? 2 : 1);
	  SDValue C = N->getOperand(IsStrict ? 3 : 2);

	  // Replaces V with a cheaper negated form if one exists and returns true;
	  // leaves V untouched and returns false otherwise.
	  auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
	    bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
	    bool LegalOperations = !DCI.isBeforeLegalizeOps();
	    if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
	                                                       CodeSize)) {
	      V = NegV;
	      return true;
	    }
	    // Look through extract_vector_elts. If it comes from an FNEG, create a
	    // new extract from the FNEG input.
	    if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	        isNullConstant(V.getOperand(1))) {
	      SDValue Vec = V.getOperand(0);
	      if (SDValue NegV = TLI.getCheaperNegatedExpression(
	              Vec, DAG, LegalOperations, CodeSize)) {
	        V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
	                        NegV, V.getOperand(1));
	        return true;
	      }
	    }

	    return false;
	  };

	  // Do not convert the passthru input of scalar intrinsics.
	  // FIXME: We could allow negations of the lower element only.
	  bool NegA = invertIfNegative(A);
	  bool NegB = invertIfNegative(B);
	  bool NegC = invertIfNegative(C);

	  if (!NegA && !NegB && !NegC)
	    return SDValue();

	  // Negating both multiplicands cancels out, so only NegA != NegB flips the
	  // sign of the product.
	  unsigned NewOpcode =
	      negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);

	  if (IsStrict) {
	    assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
	    return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
	                       {N->getOperand(0), A, B, C});
	  }

	  // Non-strict: forward the optional fourth operand if present.
	  if (N->getNumOperands() == 4)
	    return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
	  return DAG.getNode(NewOpcode, dl, VT, A, B, C);
	}

	// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
	// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
	//
	// Operand 2 is replaced by its cheaper negated form (if there is one) and the
	// opcode is switched accordingly. An optional fourth operand is forwarded
	// unchanged. Returns an empty SDValue when operand 2 has no cheaper negation.
	static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
	                               TargetLowering::DAGCombinerInfo &DCI) {
	  SDLoc Loc(N);
	  EVT ResVT = N->getValueType(0);
	  const TargetLowering &Lowering = DAG.getTargetLoweringInfo();
	  const bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
	  const bool AfterLegalizeOps = !DCI.isBeforeLegalizeOps();

	  SDValue Addend = N->getOperand(2);
	  SDValue NegAddend = Lowering.getCheaperNegatedExpression(
	      Addend, DAG, AfterLegalizeOps, OptForSize);
	  if (!NegAddend)
	    return SDValue();

	  unsigned FlippedOpc = negateFMAOpcode(N->getOpcode(), false, true, false);
	  SDValue MulLHS = N->getOperand(0);
	  SDValue MulRHS = N->getOperand(1);

	  if (N->getNumOperands() != 4)
	    return DAG.getNode(FlippedOpc, Loc, ResVT, MulLHS, MulRHS, NegAddend);
	  return DAG.getNode(FlippedOpc, Loc, ResVT, MulLHS, MulRHS, NegAddend,
	                     N->getOperand(3));
	}

	/// Target combine for zero-extend nodes; the first fold also checks for
	/// ANY_EXTEND. Tries, in order: widening a SETCC_CARRY operand, extending a
	/// CMOV of constants, folding into a vector setcc, bool-vector in-register
	/// extension, mask arithmetic promotion, extension-before-add promotion, the
	/// or/cmp-eq-zero to ctlz/srl fold, and looking through a 128-bit PACKUS whose
	/// inputs already have zero upper halves. Returns an empty SDValue when none
	/// of them apply.
	static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  SDValue N0 = N->getOperand(0);
	  EVT VT = N->getValueType(0);

	  // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
	  // FIXME: Is this needed? We don't seem to have any tests for it.
	  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
	      N0.getOpcode() == X86ISD::SETCC_CARRY) {
	    SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
	                                N0->getOperand(1));
	    // The use count is read before N is replaced.
	    bool ReplaceOtherUses = !N0.hasOneUse();
	    DCI.CombineTo(N, Setcc);
	    // Replace other uses with a truncate of the widened setcc_carry.
	    if (ReplaceOtherUses) {
	      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
	                                  N0.getValueType(), Setcc);
	      DCI.CombineTo(N0.getNode(), Trunc);
	    }

	    return SDValue(N, 0);
	  }

	  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
	    return NewCMov;

	  if (DCI.isBeforeLegalizeOps())
	    if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
	      return V;

	  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
	    return V;

	  if (VT.isVector())
	    if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
	      return R;

	  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
	    return NewAdd;

	  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
	    return R;

	  // TODO: Combine with any target/faux shuffle.
	  if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
	      VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
	    SDValue N00 = N0.getOperand(0);
	    SDValue N01 = N0.getOperand(1);
	    unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
	    // Upper half of every source element must be known zero (or undef) so
	    // that the pack followed by zero-extend is a no-op on the values.
	    APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
	    if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
	        (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
	      return concatSubVectors(N00, N01, DAG, dl);
	    }
	  }

	  return SDValue();
	}

	/// Recursive helper for combineVectorSizedSetCCEquality() that recognizes a
	/// memcmp expansion: a tree of ORs whose leaves are all XORs. The root itself
	/// must be an OR; a bare XOR at the root does not match.
	static bool isOrXorXorTree(SDValue X, bool Root = true) {
	  const unsigned Opc = X.getOpcode();
	  // Leaf: only an XOR below the root is acceptable.
	  if (Opc != ISD::OR)
	    return !Root && Opc == ISD::XOR;
	  // Interior OR: both sub-trees have to match.
	  return isOrXorXorTree(X.getOperand(0), false) &&
	         isOrXorXorTree(X.getOperand(1), false);
	}

	/// Recursive helper for combineVectorSizedSetCCEquality() that emits the
	/// vector form of an OR/XOR memcmp tree. Leaves (XOR) become a SETNE mask
	/// compare, a vector XOR, or a SETEQ compare; interior ORs combine their
	/// children with OR, or with AND in the SETEQ flavour. SToV converts a scalar
	/// leaf operand to a vector.
	template<typename F>
	static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
	                                EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
	  SDValue LHS = X.getOperand(0);
	  SDValue RHS = X.getOperand(1);
	  // A distinct compare type means the result is a mask-register compare.
	  const bool UseMaskCompare = VecVT != CmpVT;

	  switch (X.getOpcode()) {
	  case ISD::OR: {
	    SDValue SubL = emitOrXorXorTree(LHS, DL, DAG, VecVT, CmpVT, HasPT, SToV);
	    SDValue SubR = emitOrXorXorTree(RHS, DL, DAG, VecVT, CmpVT, HasPT, SToV);
	    // Equality results (neither mask compare nor PTEST) are AND-ed together.
	    if (!UseMaskCompare && !HasPT)
	      return DAG.getNode(ISD::AND, DL, CmpVT, SubL, SubR);
	    return DAG.getNode(ISD::OR, DL, UseMaskCompare ? CmpVT : VecVT, SubL,
	                       SubR);
	  }
	  case ISD::XOR: {
	    SDValue VecL = SToV(LHS);
	    SDValue VecR = SToV(RHS);
	    if (UseMaskCompare)
	      return DAG.getSetCC(DL, CmpVT, VecL, VecR, ISD::SETNE);
	    if (HasPT)
	      return DAG.getNode(ISD::XOR, DL, VecVT, VecL, VecR);
	    return DAG.getSetCC(DL, CmpVT, VecL, VecR, ISD::SETEQ);
	  }
	  default:
	    break;
	  }
	  llvm_unreachable("Impossible");
	}

	/// Try to map a 128-bit or larger integer comparison to vector instructions
	/// before type legalization splits it up into chunks.
	///
	/// SetCC must be an equality (SETEQ/SETNE) comparison; this is asserted.
	/// Returns the replacement value, or an empty SDValue when the operands are
	/// not oversized scalar integers, building the vectors would be expensive, or
	/// the subtarget lacks a suitable vector width.
	static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
	                                               const X86Subtarget &Subtarget) {
	  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
	  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

	  // We're looking for an oversized integer equality comparison.
	  SDValue X = SetCC->getOperand(0);
	  SDValue Y = SetCC->getOperand(1);
	  EVT OpVT = X.getValueType();
	  unsigned OpSize = OpVT.getSizeInBits();
	  if (!OpVT.isScalarInteger() || OpSize < 128)
	    return SDValue();

	  // Ignore a comparison with zero because that gets special treatment in
	  // EmitTest(). But make an exception for the special case of a pair of
	  // logically-combined vector-sized operands compared to zero. This pattern may
	  // be generated by the memcmp expansion pass with oversized integer compares
	  // (see PR33325).
	  bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
	  if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
	    return SDValue();

	  // Don't perform this combine if constructing the vector will be expensive.
	  auto IsVectorBitCastCheap = [](SDValue X) {
	    X = peekThroughBitcasts(X);
	    return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
	           X.getOpcode() == ISD::LOAD;
	  };
	  if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
	      !IsOrXorXorTreeCCZero)
	    return SDValue();

	  EVT VT = SetCC->getValueType(0);
	  SDLoc DL(SetCC);

	  // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
	  // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
	  // Otherwise use PCMPEQ (plus AND) and mask testing.
	  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
	      (OpSize == 256 && Subtarget.hasAVX()) ||
	      (OpSize == 512 && Subtarget.useAVX512Regs())) {
	    bool HasPT = Subtarget.hasSSE41();

	    // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
	    // vector registers are essentially free. (Technically, widening registers
	    // prevents load folding, but the tradeoff is worth it.)
	    bool PreferKOT = Subtarget.preferMaskRegisters();
	    bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;

	    // VecVT is the type the comparison operates on, CmpVT the type of its
	    // result, and CastVT the type the scalar operands are bitcast to.
	    EVT VecVT = MVT::v16i8;
	    EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
	    if (OpSize == 256) {
	      VecVT = MVT::v32i8;
	      CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
	    }
	    EVT CastVT = VecVT;
	    bool NeedsAVX512FCast = false;
	    if (OpSize == 512 || NeedZExt) {
	      if (Subtarget.hasBWI()) {
	        VecVT = MVT::v64i8;
	        CmpVT = MVT::v64i1;
	        if (OpSize == 512)
	          CastVT = VecVT;
	      } else {
	        VecVT = MVT::v16i32;
	        CmpVT = MVT::v16i1;
	        CastVT = OpSize == 512 ? VecVT :
	                 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
	        NeedsAVX512FCast = true;
	      }
	    }

	    // Bitcast a scalar operand to a vector. A zero_extend from a 128/256-bit
	    // source is looked through and replaced by inserting the narrower vector
	    // into a zero vector of type VecVT.
	    auto ScalarToVector = [&](SDValue X) -> SDValue {
	      bool TmpZext = false;
	      EVT TmpCastVT = CastVT;
	      if (X.getOpcode() == ISD::ZERO_EXTEND) {
	        SDValue OrigX = X.getOperand(0);
	        unsigned OrigSize = OrigX.getScalarValueSizeInBits();
	        if (OrigSize < OpSize) {
	          if (OrigSize == 128) {
	            TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
	            X = OrigX;
	            TmpZext = true;
	          } else if (OrigSize == 256) {
	            TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
	            X = OrigX;
	            TmpZext = true;
	          }
	        }
	      }
	      X = DAG.getBitcast(TmpCastVT, X);
	      if (!NeedZExt && !TmpZext)
	        return X;
	      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
	                         DAG.getConstant(0, DL, VecVT), X,
	                         DAG.getVectorIdxConstant(0, DL));
	    };

	    SDValue Cmp;
	    if (IsOrXorXorTreeCCZero) {
	      // This is a bitwise-combined equality comparison of 2 pairs of vectors:
	      // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
	      // Use 2 vector equality compares and 'and' the results before doing a
	      // MOVMSK.
	      Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
	    } else {
	      SDValue VecX = ScalarToVector(X);
	      SDValue VecY = ScalarToVector(Y);
	      if (VecVT != CmpVT) {
	        Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
	      } else if (HasPT) {
	        Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
	      } else {
	        Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
	      }
	    }
	    // AVX512 should emit a setcc that will lower to kortest.
	    if (VecVT != CmpVT) {
	      EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
	                   CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
	      return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
	                          DAG.getConstant(0, DL, KRegVT), CC);
	    }
	    if (HasPT) {
	      SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
	                                     Cmp);
	      SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
	      X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
	      SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
	      return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
	    }
	    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
	    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
	    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
	    assert(Cmp.getValueType() == MVT::v16i8 &&
	           "Non 128-bit vector on pre-SSE41 target");
	    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
	    SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
	    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
	  }

	  return SDValue();
	}

	/// Target combine for ISD::SETCC. Handles oversized integer equality
	/// compares, all-zero vector tests, compares of a sign-extended vXi1 against
	/// zero, pre-promotion of vXi8/vXi16 compare results on AVX512 without BWI,
	/// and early lowering of v4f32 compares on SSE1-only targets. Returns an empty
	/// SDValue when nothing applies.
	static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
	  const SDValue LHS = N->getOperand(0);
	  const SDValue RHS = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  EVT OpVT = LHS.getValueType();
	  SDLoc DL(N);

	  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
	    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
	      return V;

	    if (VT == MVT::i1 && isNullConstant(RHS)) {
	      SDValue X86CC;
	      if (SDValue V =
	              MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
	        return DAG.getNode(ISD::TRUNCATE, DL, VT,
	                           DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
	    }
	  }

	  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
	      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
	    // Using temporaries to avoid messing up operand ordering for later
	    // transformations if this doesn't work.
	    SDValue Op0 = LHS;
	    SDValue Op1 = RHS;
	    ISD::CondCode TmpCC = CC;
	    // Put build_vector on the right.
	    if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
	      std::swap(Op0, Op1);
	      TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
	    }

	    bool IsSEXT0 =
	        (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
	        (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
	    bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());

	    // A sign-extended i1 is either 0 or -1, so comparing it against zero
	    // folds to a constant, the bool itself, or its inverse.
	    if (IsSEXT0 && IsVZero1) {
	      assert(VT == Op0.getOperand(0).getValueType() &&
	             "Unexpected operand type");
	      if (TmpCC == ISD::SETGT)
	        return DAG.getConstant(0, DL, VT);
	      if (TmpCC == ISD::SETLE)
	        return DAG.getConstant(1, DL, VT);
	      if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
	        return DAG.getNOT(DL, Op0.getOperand(0), VT);

	      assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
	             "Unexpected condition code!");
	      return Op0.getOperand(0);
	    }
	  }

	  // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
	  // pre-promote its result type since vXi1 vectors don't get promoted
	  // during type legalization.
	  // NOTE: The element count check is to ignore operand types that need to
	  // go through type promotion to a 128-bit vector.
	  // NOTE(review): the condition below contains no element count check; the
	  // note above may be stale - confirm.
	  if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
	      VT.getVectorElementType() == MVT::i1 &&
	      (OpVT.getVectorElementType() == MVT::i8 ||
	       OpVT.getVectorElementType() == MVT::i16)) {
	    SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
	    return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
	  }

	  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
	  // to avoid scalarization via legalization because v4i32 is not a legal type.
	  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
	      LHS.getValueType() == MVT::v4f32)
	    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

	  return SDValue();
	}

	/// Target combine for X86ISD::MOVMSK: constant-folds build vectors of
	/// constants, looks through bitcasts that keep the element width, pulls a NOT
	/// out of the operand, and finally tries to simplify the demanded bits.
	/// Returns an empty SDValue when nothing changed.
	static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI,
	                             const X86Subtarget &Subtarget) {
	  SDValue Input = N->getOperand(0);
	  MVT InputVT = Input.getSimpleValueType();
	  MVT ResVT = N->getSimpleValueType(0);
	  unsigned ResBits = ResVT.getScalarSizeInBits();
	  unsigned InputElts = InputVT.getVectorNumElements();

	  // Perform constant folding: bit I of the result is set when element I is a
	  // negative (non-undef) constant.
	  if (ISD::isBuildVectorOfConstantSDNodes(Input.getNode())) {
	    assert(ResVT == MVT::i32 && "Unexpected result type");
	    APInt SignBits(32, 0);
	    const unsigned NumOps = Input.getNumOperands();
	    for (unsigned I = 0; I < NumOps; ++I) {
	      if (Input.getOperand(I).isUndef())
	        continue;
	      if (Input.getConstantOperandAPInt(I).isNegative())
	        SignBits.setBit(I);
	    }
	    return DAG.getConstant(SignBits, SDLoc(N), ResVT);
	  }

	  // Look through int->fp bitcasts that don't change the element width.
	  unsigned InputEltBits = InputVT.getScalarSizeInBits();
	  if (Subtarget.hasSSE2() && Input.getOpcode() == ISD::BITCAST) {
	    SDValue CastSrc = Input.getOperand(0);
	    if (CastSrc.getScalarValueSizeInBits() == InputEltBits)
	      return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), ResVT, CastSrc);
	  }

	  // Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results
	  // with scalar comparisons.
	  if (SDValue Inverted = IsNOT(Input, DAG)) {
	    SDLoc Loc(N);
	    // Only the low InputElts bits of the result are flipped.
	    APInt LaneMask = APInt::getLowBitsSet(ResBits, InputElts);
	    Inverted = DAG.getBitcast(InputVT, Inverted);
	    SDValue Msk = DAG.getNode(X86ISD::MOVMSK, Loc, ResVT, Inverted);
	    SDValue FlipBits = DAG.getConstant(LaneMask, Loc, ResVT);
	    return DAG.getNode(ISD::XOR, Loc, ResVT, Msk, FlipBits);
	  }

	  // Simplify the inputs.
	  const TargetLowering &Lowering = DAG.getTargetLoweringInfo();
	  APInt AllBits(APInt::getAllOnesValue(ResBits));
	  if (!Lowering.SimplifyDemandedBits(SDValue(N, 0), AllBits, DCI))
	    return SDValue();
	  return SDValue(N, 0);
	}

	/// Target combine for X86 masked gather/scatter nodes: when the mask elements
	/// are wider than one bit, only their sign bit is demanded, so try to
	/// simplify the mask on that basis.
	static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
	                                       TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue MaskOp = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
	  const unsigned MaskEltBits = MaskOp.getScalarValueSizeInBits();

	  // Nothing to simplify for i1 mask elements.
	  if (MaskEltBits == 1)
	    return SDValue();

	  // With vector masks we only demand the upper bit of the mask.
	  const TargetLowering &Lowering = DAG.getTargetLoweringInfo();
	  APInt SignOnly(APInt::getSignMask(MaskOp.getScalarValueSizeInBits()));
	  if (!Lowering.SimplifyDemandedBits(MaskOp, SignOnly, DCI))
	    return SDValue();

	  // Revisit N unless it has been deleted.
	  if (N->getOpcode() != ISD::DELETED_NODE)
	    DCI.AddToWorklist(N);
	  return SDValue(N, 0);
	}

	/// Recreate a masked gather or scatter node with new Index, Base and Scale
	/// operands. The chain, pass-through/value, mask, VT list, memory VT, memory
	/// operand and index type are taken from the existing node.
	static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
	                                    SDValue Index, SDValue Base, SDValue Scale,
	                                    SelectionDAG &DAG) {
	  SDLoc Loc(GorS);

	  if (auto *G = dyn_cast<MaskedGatherSDNode>(GorS)) {
	    SDValue GatherOps[] = {G->getChain(), G->getPassThru(), G->getMask(),
	                           Base,          Index,            Scale};
	    return DAG.getMaskedGather(G->getVTList(), G->getMemoryVT(), Loc,
	                               GatherOps, G->getMemOperand(),
	                               G->getIndexType());
	  }

	  // Anything that is not a gather has to be a scatter.
	  auto *S = cast<MaskedScatterSDNode>(GorS);
	  SDValue ScatterOps[] = {S->getChain(), S->getValue(), S->getMask(),
	                          Base,          Index,         Scale};
	  return DAG.getMaskedScatter(S->getVTList(), S->getMemoryVT(), Loc,
	                              ScatterOps, S->getMemOperand(),
	                              S->getIndexType());
	}

	/// Target combine for generic masked gather/scatter nodes. Before type
	/// legalization it shrinks 64-bit-or-wider indices to i32 when enough sign
	/// bits are known; before op legalization it forces the index element type to
	/// i32 or i64; finally it simplifies the mask given that only its sign bit is
	/// demanded. Returns an empty SDValue when nothing changed.
	static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
	                                    TargetLowering::DAGCombinerInfo &DCI) {
	  SDLoc DL(N);
	  auto *GorS = cast<MaskedGatherScatterSDNode>(N);
	  SDValue Index = GorS->getIndex();
	  SDValue Base = GorS->getBasePtr();
	  SDValue Scale = GorS->getScale();

	  if (DCI.isBeforeLegalize()) {
	    unsigned IndexWidth = Index.getScalarValueSizeInBits();

	    // Shrink constant indices if they are larger than 32-bits.
	    // Only do this before legalize types since v2i64 could become v2i32.
	    // FIXME: We could check that the type is legal if we're after legalize
	    // types, but then we would need to construct test cases where that happens.
	    // FIXME: We could support more than just constant vectors, but we need to
	    // careful with costing. A truncate that can be optimized out would be fine.
	    // Otherwise we might only want to create a truncate if it avoids a split.
	    if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
	      if (BV->isConstant() && IndexWidth > 32 &&
	          DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
	        unsigned NumElts = Index.getValueType().getVectorNumElements();
	        EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
	        Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
	        return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
	      }
	    }

	    // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
	    // there are sufficient sign bits. Only do this before legalize types to
	    // avoid creating illegal types in truncate.
	    if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
	         Index.getOpcode() == ISD::ZERO_EXTEND) &&
	        IndexWidth > 32 &&
	        Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
	        DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
	      unsigned NumElts = Index.getValueType().getVectorNumElements();
	      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
	      Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
	      return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
	    }
	  }

	  if (DCI.isBeforeLegalizeOps()) {
	    unsigned IndexWidth = Index.getScalarValueSizeInBits();

	    // Make sure the index is either i32 or i64
	    if (IndexWidth != 32 && IndexWidth != 64) {
	      // Narrower indices are sign-extended to i32; wider ones go to i64.
	      MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
	      EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
	                                     Index.getValueType().getVectorNumElements());
	      Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
	      return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
	    }
	  }

	  // With vector masks we only demand the upper bit of the mask.
	  SDValue Mask = GorS->getMask();
	  if (Mask.getScalarValueSizeInBits() != 1) {
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
	    if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
	      // Revisit N unless it has been deleted.
	      if (N->getOpcode() != ISD::DELETED_NODE)
	        DCI.AddToWorklist(N);
	      return SDValue(N, 0);
	    }
	  }

	  return SDValue();
	}

	// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
	//
	// Operand 0 holds the condition code and operand 1 the EFLAGS value. If
	// combineSetCCEFLAGS produces a replacement flags value, a new SETCC is built
	// from it and the condition code variable that was handed to that call.
	static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDLoc Loc(N);
	  X86::CondCode CondCode = X86::CondCode(N->getConstantOperandVal(0));
	  SDValue FlagsIn = N->getOperand(1);

	  // Try to simplify the EFLAGS and condition code operands.
	  SDValue FlagsOut = combineSetCCEFLAGS(FlagsIn, CondCode, DAG, Subtarget);
	  if (!FlagsOut)
	    return SDValue();
	  return getSETCC(CondCode, FlagsOut, Loc, DAG);
	}

	/// Optimize branch condition evaluation.
	///
	/// Operand 2 holds the condition code and operand 3 the EFLAGS value. When
	/// combineSetCCEFLAGS yields a replacement flags value, the BRCOND is rebuilt
	/// with it and a fresh target constant for the condition code.
	static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
	                             const X86Subtarget &Subtarget) {
	  SDLoc Loc(N);
	  SDValue FlagsIn = N->getOperand(3);
	  X86::CondCode CondCode = X86::CondCode(N->getConstantOperandVal(2));

	  // Try to simplify the EFLAGS and condition code operands.
	  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
	  // RAUW them under us.
	  SDValue FlagsOut = combineSetCCEFLAGS(FlagsIn, CondCode, DAG, Subtarget);
	  if (!FlagsOut)
	    return SDValue();

	  // Operands 0 and 1 are re-read from N only after the call above.
	  SDValue CondOp = DAG.getTargetConstant(CondCode, Loc, MVT::i8);
	  return DAG.getNode(X86ISD::BRCOND, Loc, N->getVTList(), N->getOperand(0),
	                     N->getOperand(1), CondOp, FlagsOut);
	}

	// TODO: Could we move this to DAGCombine?
	//
	// Fold a unary op applied to AND(all-sign-bits value, constant) into an AND of
	// that value with the op applied to the constant. For strict FP nodes operand
	// 0 is the chain and the result is merged with the new chain. Returns an empty
	// SDValue when the pattern does not match.
	static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
	                                                  SelectionDAG &DAG) {
	  // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
	  // to optimize away operation when it's from a constant.
	  //
	  // The general transformation is:
	  // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
	  // AND(VECTOR_CMP(x,y), constant2)
	  // constant2 = UNARYOP(constant)

	  // Early exit if this isn't a vector operation, the operand of the
	  // unary operation isn't a bitwise AND, or if the sizes of the operations
	  // aren't the same.
	  EVT VT = N->getValueType(0);
	  bool IsStrict = N->isStrictFPOpcode();
	  unsigned NumEltBits = VT.getScalarSizeInBits();
	  SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
	  if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
	      DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
	      VT.getSizeInBits() != Op0.getValueSizeInBits())
	    return SDValue();

	  // Now check that the other operand of the AND is a constant. We could
	  // make the transformation for non-constant splats as well, but it's unclear
	  // that would be a benefit as it would not eliminate any operations, just
	  // perform one more step in scalar code before moving to the vector unit.
	  if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
	    // Bail out if the vector isn't a constant.
	    if (!BV->isConstant())
	      return SDValue();

	    // Everything checks out. Build up the new and improved node.
	    SDLoc DL(N);
	    EVT IntVT = BV->getValueType(0);
	    // Create a new constant of the appropriate type for the transformed
	    // DAG.
	    SDValue SourceConst;
	    if (IsStrict)
	      SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
	                                {N->getOperand(0), SDValue(BV, 0)});
	    else
	      SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
	    // The AND node needs bitcasts to/from an integer vector type around it.
	    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
	    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
	                                 MaskConst);
	    SDValue Res = DAG.getBitcast(VT, NewAnd);
	    // Strict nodes also return the chain produced by the new unary op.
	    if (IsStrict)
	      return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
	    return Res;
	  }

	  return SDValue();
	}

	/// If we are converting a value to floating-point, try to replace scalar
	/// truncate of an extracted vector element with a bitcast. This tries to keep
	/// the sequence on XMM registers rather than moving between vector and GPRs.
	///
	/// Only matches a single-use truncate of a single-use extract of element 0,
	/// where the source width is a multiple of the truncated width. Returns an
	/// empty SDValue otherwise.
	static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
	  // TODO: This is currently only used by combineSIntToFP, but it is generalized
	  // to allow being called by any similar cast opcode.
	  // TODO: Consider merging this into lowering: vectorizeExtractedCast().
	  SDValue Trunc = N->getOperand(0);
	  if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
	    return SDValue();

	  SDValue ExtElt = Trunc.getOperand(0);
	  if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	      !isNullConstant(ExtElt.getOperand(1)))
	    return SDValue();

	  EVT TruncVT = Trunc.getValueType();
	  EVT SrcVT = ExtElt.getValueType();
	  unsigned DestWidth = TruncVT.getSizeInBits();
	  unsigned SrcWidth = SrcVT.getSizeInBits();
	  if (SrcWidth % DestWidth != 0)
	    return SDValue();

	  // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
	  EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
	  unsigned VecWidth = SrcVecVT.getSizeInBits();
	  // Reinterpret the source vector as elements of the truncated type.
	  unsigned NumElts = VecWidth / DestWidth;
	  EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
	  SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
	  SDLoc DL(N);
	  SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
	                                  BitcastVec, ExtElt.getOperand(1));
	  return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
	}

	/// Target combine for (STRICT_)UINT_TO_FP: rewrites the conversion as a
	/// signed one when the input is a narrow vector (after zero-extending it to
	/// i32 elements) or when the sign bit of the input is known to be zero. For
	/// strict nodes operand 0 is the chain and is forwarded to the new node.
	static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  const bool Strict = N->isStrictFPOpcode();
	  SDValue Src = N->getOperand(Strict ? 1 : 0);
	  EVT ResVT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();

	  // Emit the signed conversion of Val, in strict or non-strict form.
	  auto EmitSIntToFP = [&](SDValue Val, const SDLoc &Loc) -> SDValue {
	    if (Strict)
	      return DAG.getNode(ISD::STRICT_SINT_TO_FP, Loc, {ResVT, MVT::Other},
	                         {N->getOperand(0), Val});
	    return DAG.getNode(ISD::SINT_TO_FP, Loc, ResVT, Val);
	  };

	  // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
	  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
	  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
	  if (SrcVT.isVector() && SrcVT.getScalarSizeInBits() < 32) {
	    SDLoc Loc(N);
	    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	                                  SrcVT.getVectorNumElements());
	    SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, Loc, WideVT, Src);

	    // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
	    return EmitSIntToFP(Widened, Loc);
	  }

	  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
	  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
	  // the optimization here.
	  if (!DAG.SignBitIsZero(Src))
	    return SDValue();
	  return EmitSIntToFP(Src, SDLoc(N));
	}

	/// Target combine for (STRICT_)SINT_TO_FP nodes.
	///
	/// Tries the following in order and returns the first replacement found:
	///   1. combineVectorCompareAndMaskUnaryOp (vectors only).
	///   2. Sign-extend vXi1/vXi8/vXi16 sources to vXi32 before converting.
	///   3. Without AVX512DQ, truncate a wider-than-32-bit source to i32 when
	///      enough sign bits are known.
	///   4. On 32-bit targets with x87, turn a simple one-use i64 load feeding
	///      the conversion into a FILD.
	///   5. For non-strict nodes only, combineToFPTruncExtElt.
	/// Returns an empty SDValue when nothing applies.
	static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
	                               TargetLowering::DAGCombinerInfo &DCI,
	                               const X86Subtarget &Subtarget) {
	  // First try to optimize away the conversion entirely when it's
	  // conditionally from a constant. Vectors only.
	  bool IsStrict = N->isStrictFPOpcode();
	  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
	    return Res;

	  // Now move on to more general possibilities.
	  // For strict nodes operand 0 is forwarded unchanged to any replacement and
	  // the integer source is operand 1.
	  SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
	  EVT VT = N->getValueType(0);
	  EVT InVT = Op0.getValueType();

	  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
	  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
	  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
	  if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
	    SDLoc dl(N);
	    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	                                 InVT.getVectorNumElements());
	    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
	    if (IsStrict)
	      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
	                         {N->getOperand(0), P});
	    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
	  }

	  // Without AVX512DQ we only support i64 to float scalar conversion. For both
	  // vectors and scalars, see if we know that the upper bits are all the sign
	  // bit, in which case we can truncate the input to i32 and convert from that.
	  if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
	    unsigned BitWidth = InVT.getScalarSizeInBits();
	    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
	    // At least BitWidth - 31 sign bits means the value fits in a signed i32.
	    if (NumSignBits >= (BitWidth - 31)) {
	      EVT TruncVT = MVT::i32;
	      if (InVT.isVector())
	        TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
	                                   InVT.getVectorNumElements());
	      SDLoc dl(N);
	      if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
	        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
	        if (IsStrict)
	          return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
	                             {N->getOperand(0), Trunc});
	        return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
	      }
	      // If we're after legalize and the type is v2i32 we need to shuffle and
	      // use CVTSI2P.
	      assert(InVT == MVT::v2i64 && "Unexpected VT!");
	      SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
	      // Gather the low 32-bit half of each i64 lane into elements 0 and 1.
	      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
	                                          { 0, 2, -1, -1 });
	      if (IsStrict)
	        return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
	                           {N->getOperand(0), Shuf});
	      return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
	    }
	  }

	  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
	  // a 32-bit target where SSE doesn't support i64->FP operations.
	  if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
	      Op0.getOpcode() == ISD::LOAD) {
	    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());

	    // This transformation is not supported if the result type is f16 or f128.
	    if (VT == MVT::f16 || VT == MVT::f128)
	      return SDValue();

	    // If we have AVX512DQ we can use packed conversion instructions unless
	    // the VT is f80.
	    if (Subtarget.hasDQI() && VT != MVT::f80)
	      return SDValue();

	    if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
	        Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
	      std::pair<SDValue, SDValue> Tmp =
	          Subtarget.getTargetLowering()->BuildFILD(
	              VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
	              Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
	      // Redirect users of the load's second result to the FILD's second
	      // result so the original load is no longer needed.
	      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
	      return Tmp.first;
	    }
	  }

	  // The remaining combine is only attempted for non-strict nodes.
	  if (IsStrict)
	    return SDValue();

	  if (SDValue V = combineToFPTruncExtElt(N, DAG))
	    return V;

	  return SDValue();
	}

	/// Returns true if any user of the node producing \p Flags might depend on
	/// the carry or overflow flag.
	///
	/// Every use of the node is inspected. Users other than SETCC, SETCC_CARRY,
	/// BRCOND and CMOV are conservatively assumed to need those flags. For the
	/// known users, the condition code operand decides: the unsigned-ordering
	/// (A/AE/B/BE), overflow (O/NO) and signed-ordering (G/GE/L/LE) conditions
	/// count as needing carry or overflow.
	static bool needCarryOrOverflowFlag(SDValue Flags) {
	  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");

	  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
	       UI != UE; ++UI) {
	    SDNode *User = *UI;

	    // Index of the operand holding the condition code for this user.
	    unsigned CCOpNo;
	    switch (User->getOpcode()) {
	    default:
	      // Be conservative.
	      return true;
	    case X86ISD::SETCC:
	    case X86ISD::SETCC_CARRY:
	      CCOpNo = 0;
	      break;
	    case X86ISD::BRCOND:
	    case X86ISD::CMOV:
	      CCOpNo = 2;
	      break;
	    }

	    X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
	    switch (CC) {
	    default: break;
	    case X86::COND_A: case X86::COND_AE:
	    case X86::COND_B: case X86::COND_BE:
	    case X86::COND_O: case X86::COND_NO:
	    case X86::COND_G: case X86::COND_GE:
	    case X86::COND_L: case X86::COND_LE:
	      return true;
	    }
	  }

	  return false;
	}

	/// Returns true if every user of the node producing \p Flags is a SETCC,
	/// SETCC_CARRY, BRCOND or CMOV whose condition code is COND_E or COND_NE.
	/// Any other user, or any other condition code, yields false.
	static bool onlyZeroFlagUsed(SDValue Flags) {
	  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");

	  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
	       UI != UE; ++UI) {
	    SDNode *User = *UI;

	    // Index of the operand holding the condition code for this user.
	    unsigned CCOpNo;
	    switch (User->getOpcode()) {
	    default:
	      // Be conservative.
	      return false;
	    case X86ISD::SETCC:
	    case X86ISD::SETCC_CARRY:
	      CCOpNo = 0;
	      break;
	    case X86ISD::BRCOND:
	    case X86ISD::CMOV:
	      CCOpNo = 2;
	      break;
	    }

	    X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
	    if (CC != X86::COND_E && CC != X86::COND_NE)
	      return false;
	  }

	  return true;
	}

	/// Target combine for X86ISD::CMP against zero ("test" patterns).
	///
	/// Two rewrites are attempted:
	///   - A one-use constant SRL/SHL whose compare only feeds E/NE conditions
	///     becomes an AND with the mask of bits that survive the shift.
	///   - A one-use truncate of a one-use AND/OR/XOR/ADD/SUB becomes a narrower
	///     X86ISD flag-producing op, and its flags are used directly.
	/// Returns an empty SDValue when neither applies.
	static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
	  // Only handle test patterns.
	  if (!isNullConstant(N->getOperand(1)))
	    return SDValue();

	  // If we have a CMP of a truncated binop, see if we can make a smaller binop
	  // and use its flags directly.
	  // TODO: Maybe we should try promoting compares that only use the zero flag
	  // first if we can prove the upper bits with computeKnownBits?
	  SDLoc dl(N);
	  SDValue Op = N->getOperand(0);
	  EVT VT = Op.getValueType();

	  // If we have a constant logical shift that's only used in a comparison
	  // against zero turn it into an equivalent AND. This allows turning it into
	  // a TEST instruction later.
	  if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
	      Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
	      onlyZeroFlagUsed(SDValue(N, 0))) {
	    unsigned BitWidth = VT.getSizeInBits();
	    const APInt &ShAmt = Op.getConstantOperandAPInt(1);
	    if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
	      // A right shift keeps the high bits of the input; a left shift keeps
	      // the low bits.
	      unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
	      APInt Mask = Op.getOpcode() == ISD::SRL
	                       ? APInt::getHighBitsSet(BitWidth, MaskBits)
	                       : APInt::getLowBitsSet(BitWidth, MaskBits);
	      // Only rewrite when the mask fits in a signed 32-bit value.
	      if (Mask.isSignedIntN(32)) {
	        Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
	                         DAG.getConstant(Mask, dl, VT));
	        return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	                           DAG.getConstant(0, dl, VT));
	      }
	    }
	  }

	  // Look for a truncate with a single use.
	  if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
	    return SDValue();

	  Op = Op.getOperand(0);

	  // Arithmetic op can only have one use.
	  if (!Op.hasOneUse())
	    return SDValue();

	  unsigned NewOpc;
	  switch (Op.getOpcode()) {
	  default: return SDValue();
	  case ISD::AND:
	    // Skip and with constant. We have special handling for and with immediate
	    // during isel to generate test instructions.
	    if (isa<ConstantSDNode>(Op.getOperand(1)))
	      return SDValue();
	    NewOpc = X86ISD::AND;
	    break;
	  case ISD::OR:  NewOpc = X86ISD::OR;  break;
	  case ISD::XOR: NewOpc = X86ISD::XOR; break;
	  case ISD::ADD:
	    // If the carry or overflow flag is used, we can't truncate.
	    if (needCarryOrOverflowFlag(SDValue(N, 0)))
	      return SDValue();
	    NewOpc = X86ISD::ADD;
	    break;
	  case ISD::SUB:
	    // If the carry or overflow flag is used, we can't truncate.
	    if (needCarryOrOverflowFlag(SDValue(N, 0)))
	      return SDValue();
	    NewOpc = X86ISD::SUB;
	    break;
	  }

	  // We found an op we can narrow. Truncate its inputs.
	  SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
	  SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));

	  // Use a X86 specific opcode to avoid DAG combine messing with it.
	  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
	  Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);

	  // For AND, keep a CMP so that we can match the test pattern.
	  if (NewOpc == X86ISD::AND)
	    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	                       DAG.getConstant(0, dl, VT));

	  // Return the flags.
	  return Op.getValue(1);
	}

	/// Target combine for X86ISD::ADD / X86ISD::SUB (result 0 is the value,
	/// result 1 is the i32 flags).
	///
	/// If the flags result is unused, the node is replaced by the generic
	/// ISD::ADD/ISD::SUB, with a zero constant standing in for the dead flags
	/// result. Otherwise, any existing generic ADD/SUB on the same operands is
	/// redirected to reuse this node's value result, and an empty SDValue is
	/// returned.
	static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
	                                TargetLowering::DAGCombinerInfo &DCI) {
	  assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
	         "Expected X86ISD::ADD or X86ISD::SUB");

	  SDLoc DL(N);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  MVT VT = LHS.getSimpleValueType();
	  unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;

	  // If we don't use the flag result, simplify back to a generic ADD/SUB.
	  if (!N->hasAnyUseOfValue(1)) {
	    SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
	    return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
	  }

	  // Fold any similar generic ADD/SUB opcodes to reuse this node.
	  auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
	    SDValue Ops[] = {N0, N1};
	    SDVTList VTs = DAG.getVTList(N->getValueType(0));
	    if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
	      SDValue Op(N, 0);
	      // sub(N1, N0) == 0 - sub(N0, N1), so negate when operands are swapped.
	      if (Negate)
	        Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
	      DCI.CombineTo(GenericAddSub, Op);
	    }
	  };
	  MatchGeneric(LHS, RHS, false);
	  // ADD is commutative, so the swapped form needs no negation; SUB does.
	  MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());

	  return SDValue();
	}

	/// Target combine for X86ISD::SBB (operands: LHS, RHS, incoming flags).
	/// Either swaps in a simplified carry producer found by
	/// combineCarryThroughADD, or folds a feeding SUB into the SBB when the SBB's
	/// own flags result is unused. Returns an empty SDValue otherwise.
	static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
	  SDValue CarryIn = N->getOperand(2);

	  SDValue NewCarry = combineCarryThroughADD(CarryIn, DAG);
	  if (NewCarry) {
	    SDVTList ResultTys = DAG.getVTList(N->getSimpleValueType(0), MVT::i32);
	    return DAG.getNode(X86ISD::SBB, SDLoc(N), ResultTys, N->getOperand(0),
	                       N->getOperand(1), NewCarry);
	  }

	  // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
	  // iff the flag result is dead.
	  SDValue Minuend = N->getOperand(0);
	  SDValue Subtrahend = N->getOperand(1);
	  if (Minuend.getOpcode() != ISD::SUB)
	    return SDValue();
	  if (!isNullConstant(Subtrahend))
	    return SDValue();
	  if (N->hasAnyUseOfValue(1))
	    return SDValue();

	  SDValue X = Minuend.getOperand(0);
	  SDValue Y = Minuend.getOperand(1);
	  return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), X, Y, CarryIn);
	}

	// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
	//
	// Handles two cases: an ADC of two zeros whose flags result is unused, and an
	// ADC whose incoming carry can be simplified by combineCarryThroughADD.
	static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue CarryIn = N->getOperand(2);

	  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
	  // the result is either zero or one (depending on the input carry bit).
	  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
	  // We don't have a good way to replace an EFLAGS use, so only do this when
	  // dead right now.
	  const bool BothOperandsZero =
	      X86::isZeroNode(N->getOperand(0)) && X86::isZeroNode(N->getOperand(1));
	  if (BothOperandsZero && SDValue(N, 1).use_empty()) {
	    SDLoc Loc(N);
	    EVT ResVT = N->getValueType(0);
	    SDValue DeadFlags = DAG.getConstant(0, Loc, N->getValueType(1));
	    SDValue CondB = DAG.getTargetConstant(X86::COND_B, Loc, MVT::i8);
	    SDValue CarryMask =
	        DAG.getNode(X86ISD::SETCC_CARRY, Loc, ResVT, CondB, CarryIn);
	    SDValue LowBit = DAG.getNode(ISD::AND, Loc, ResVT, CarryMask,
	                                 DAG.getConstant(1, Loc, ResVT));
	    return DCI.CombineTo(N, LowBit, DeadFlags);
	  }

	  SDValue NewCarry = combineCarryThroughADD(CarryIn, DAG);
	  if (!NewCarry)
	    return SDValue();

	  SDVTList ResultTys = DAG.getVTList(N->getSimpleValueType(0), MVT::i32);
	  return DAG.getNode(X86ISD::ADC, SDLoc(N), ResultTys, N->getOperand(0),
	                     N->getOperand(1), NewCarry);
	}

	/// If this is an add or subtract where one operand is produced by a cmp+setcc,
	/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
	/// with CMP+{ADC, SBB}.
	///
	/// \p N is treated as a subtract when its opcode is ISD::SUB and as an add
	/// otherwise. In the comments below, X is the non-setcc operand and Y is the
	/// one-use X86ISD::SETCC (possibly seen through a one-use zero-extend).
	/// Returns an empty SDValue when no rewrite applies.
	static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
	  bool IsSub = N->getOpcode() == ISD::SUB;
	  SDValue X = N->getOperand(0);
	  SDValue Y = N->getOperand(1);

	  // If this is an add, canonicalize a zext operand to the RHS.
	  // TODO: Incomplete? What if both sides are zexts?
	  if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
	      Y.getOpcode() != ISD::ZERO_EXTEND)
	    std::swap(X, Y);

	  // Look through a one-use zext.
	  bool PeekedThroughZext = false;
	  if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
	    Y = Y.getOperand(0);
	    PeekedThroughZext = true;
	  }

	  // If this is an add, canonicalize a setcc operand to the RHS.
	  // TODO: Incomplete? What if both sides are setcc?
	  // TODO: Should we allow peeking through a zext of the other operand?
	  if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
	      Y.getOpcode() != X86ISD::SETCC)
	    std::swap(X, Y);

	  if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
	    return SDValue();

	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);
	  X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

	  // If X is -1 or 0, then we have an opportunity to avoid constants required in
	  // the general case below.
	  auto *ConstantX = dyn_cast<ConstantSDNode>(X);
	  if (ConstantX) {
	    if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
	        (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
	      // This is a complicated way to get -1 or 0 from the carry flag:
	      // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
	      //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
	      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	                         DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
	                         Y.getOperand(1));
	    }

	    if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
	        (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
	      SDValue EFLAGS = Y->getOperand(1);
	      // NOTE: the one-use check here is on the flags value, whereas the
	      // COND_A / COND_BE paths further down check the whole node.
	      if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
	          EFLAGS.getValueType().isInteger() &&
	          !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
	        // Swap the operands of a SUB, and we have the same pattern as above.
	        // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
	        //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
	        SDValue NewSub = DAG.getNode(
	            X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
	            EFLAGS.getOperand(1), EFLAGS.getOperand(0));
	        SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
	        return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	                           DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
	                           NewEFLAGS);
	      }
	    }
	  }

	  if (CC == X86::COND_B) {
	    // X + SETB Z --> adc X, 0
	    // X - SETB Z --> sbb X, 0
	    return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
	                       DAG.getVTList(VT, MVT::i32), X,
	                       DAG.getConstant(0, DL, VT), Y.getOperand(1));
	  }

	  if (CC == X86::COND_A) {
	    SDValue EFLAGS = Y.getOperand(1);
	    // Try to convert COND_A into COND_B in an attempt to facilitate
	    // materializing "setb reg".
	    //
	    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
	    // cannot take an immediate as its first operand.
	    //
	    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
	        EFLAGS.getValueType().isInteger() &&
	        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
	      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
	                                   EFLAGS.getNode()->getVTList(),
	                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
	      SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
	      return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
	                         DAG.getVTList(VT, MVT::i32), X,
	                         DAG.getConstant(0, DL, VT), NewEFLAGS);
	    }
	  }

	  if (CC == X86::COND_AE) {
	    // X + SETAE --> sbb X, -1
	    // X - SETAE --> adc X, -1
	    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
	                       DAG.getVTList(VT, MVT::i32), X,
	                       DAG.getConstant(-1, DL, VT), Y.getOperand(1));
	  }

	  if (CC == X86::COND_BE) {
	    // X + SETBE --> sbb X, -1
	    // X - SETBE --> adc X, -1
	    SDValue EFLAGS = Y.getOperand(1);
	    // Try to convert COND_BE into COND_AE in an attempt to facilitate
	    // materializing "setae reg".
	    //
	    // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
	    // cannot take an immediate as its first operand.
	    //
	    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
	        EFLAGS.getValueType().isInteger() &&
	        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
	      SDValue NewSub = DAG.getNode(
	          X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
	          EFLAGS.getOperand(1), EFLAGS.getOperand(0));
	      SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
	      return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
	                         DAG.getVTList(VT, MVT::i32), X,
	                         DAG.getConstant(-1, DL, VT), NewEFLAGS);
	    }
	  }

	  // Everything below handles only equality tests against zero.
	  if (CC != X86::COND_E && CC != X86::COND_NE)
	    return SDValue();

	  SDValue Cmp = Y.getOperand(1);
	  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
	      !X86::isZeroNode(Cmp.getOperand(1)) ||
	      !Cmp.getOperand(0).getValueType().isInteger())
	    return SDValue();

	  SDValue Z = Cmp.getOperand(0);
	  EVT ZVT = Z.getValueType();

	  // If X is -1 or 0, then we have an opportunity to avoid constants required in
	  // the general case below.
	  if (ConstantX) {
	    // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
	    // fake operands:
	    //  0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
	    // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
	    if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
	        (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
	      SDValue Zero = DAG.getConstant(0, DL, ZVT);
	      SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
	      SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
	      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	                         DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
	                         SDValue(Neg.getNode(), 1));
	    }

	    // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
	    // with fake operands:
	    //  0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
	    // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
	    if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
	        (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
	      SDValue One = DAG.getConstant(1, DL, ZVT);
	      SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
	      SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
	      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	                         DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
	                         Cmp1.getValue(1));
	    }
	  }

	  // (cmp Z, 1) sets the carry flag if Z is 0.
	  SDValue One = DAG.getConstant(1, DL, ZVT);
	  SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
	  SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);

	  // Add the flags type for ADC/SBB nodes.
	  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

	  // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
	  // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
	  if (CC == X86::COND_NE)
	    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
	                       DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));

	  // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
	  // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
	  return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
	                     DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
	}

	/// Try to match an add of two BUILD_VECTORs that pick out the even and odd
	/// elements of a single vector multiply, and turn it into X86ISD::VPMADDWD.
	///
	/// \p Op0 and \p Op1 are the add operands and \p VT is the add's result
	/// type, which must be a vXi32 vector with a power-of-two element count of
	/// at least 4. Requires SSE2. Returns an empty SDValue if the pattern does
	/// not match or the multiply cannot be shrunk to signed 16-bit operands.
	static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
	                            const SDLoc &DL, EVT VT,
	                            const X86Subtarget &Subtarget) {
	  // Example of pattern we try to detect:
	  // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
	  //(add (build_vector (extract_elt t, 0),
	  //                   (extract_elt t, 2),
	  //                   (extract_elt t, 4),
	  //                   (extract_elt t, 6)),
	  //     (build_vector (extract_elt t, 1),
	  //                   (extract_elt t, 3),
	  //                   (extract_elt t, 5),
	  //                   (extract_elt t, 7)))

	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
	      Op1.getOpcode() != ISD::BUILD_VECTOR)
	    return SDValue();

	  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
	      VT.getVectorNumElements() < 4 ||
	      !isPowerOf2_32(VT.getVectorNumElements()))
	    return SDValue();

	  // Check if one of Op0,Op1 is of the form:
	  // (build_vector (extract_elt Mul, 0),
	  //               (extract_elt Mul, 2),
	  //               (extract_elt Mul, 4),
	  //                   ...
	  // the other is of the form:
	  // (build_vector (extract_elt Mul, 1),
	  //               (extract_elt Mul, 3),
	  //               (extract_elt Mul, 5),
	  //                   ...
	  // and identify Mul.
	  SDValue Mul;
	  // Two result elements are checked per iteration, hence the step of 2.
	  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
	    SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
	            Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
	    // TODO: Be more tolerant to undefs.
	    if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	        Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	        Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	        Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();
	    auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
	    auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
	    auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
	    auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
	    if (!Const0L || !Const1L || !Const0H || !Const1H)
	      return SDValue();
	    unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
	             Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
	    // Commutativity of mul allows factors of a product to reorder.
	    if (Idx0L > Idx1L)
	      std::swap(Idx0L, Idx1L);
	    if (Idx0H > Idx1H)
	      std::swap(Idx0H, Idx1H);
	    // Commutativity of add allows pairs of factors to reorder.
	    if (Idx0L > Idx0H) {
	      std::swap(Idx0L, Idx0H);
	      std::swap(Idx1L, Idx1H);
	    }
	    // After canonicalization the four indices must be consecutive,
	    // starting at 2 * i.
	    if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
	        Idx1H != 2 * i + 3)
	      return SDValue();
	    if (!Mul) {
	      // First time an extract_elt's source vector is visited. Must be a MUL
	      // with twice as many vector elements as the BUILD_VECTOR.
	      // Both extracts must be from same MUL.
	      Mul = Op0L->getOperand(0);
	      if (Mul->getOpcode() != ISD::MUL ||
	          Mul.getValueType().getVectorNumElements() != 2 * e)
	        return SDValue();
	    }
	    // Check that the extract is from the same MUL previously seen.
	    if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
	        Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
	      return SDValue();
	  }

	  // Check if the Mul source can be safely shrunk.
	  ShrinkMode Mode;
	  if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
	      Mode == ShrinkMode::MULU16)
	    return SDValue();

	  EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
	                                 VT.getVectorNumElements() * 2);
	  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
	  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));

	  // Builds one VPMADDWD; the result has half as many (i32) elements as the
	  // vXi16 inputs.
	  auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	                         ArrayRef<SDValue> Ops) {
	    EVT InVT = Ops[0].getValueType();
	    assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
	    EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	                                 InVT.getVectorNumElements() / 2);
	    return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
	  };
	  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
	}

	// Attempt to turn this pattern into PMADDWD.
	// (add (mul (sext (build_vector)), (sext (build_vector))),
	// (mul (sext (build_vector)), (sext (build_vector)))
	static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
	const SDLoc &DL, EVT VT,
	const X86Subtarget &Subtarget) {
	if (!Subtarget.hasSSE2())
	return SDValue();

	if (N0.getOpcode() != ISD::MUL \|\| N1.getOpcode() != ISD::MUL)
	return SDValue();

	if (!VT.isVector() \|\| VT.getVectorElementType() != MVT::i32 \|\|
	VT.getVectorNumElements() < 4 \|\|
	!isPowerOf2_32(VT.getVectorNumElements()))
	return SDValue();

	SDValue N00 = N0.getOperand(0);
	SDValue N01 = N0.getOperand(1);
	SDValue N10 = N1.getOperand(0);
	SDValue N11 = N1.getOperand(1);

	// All inputs need to be sign extends.
	// TODO: Support ZERO_EXTEND from known positive?
	if (N00.getOpcode() != ISD::SIGN_EXTEND \|\|
	N01.getOpcode() != ISD::SIGN_EXTEND \|\|
	N10.getOpcode() != ISD::SIGN_EXTEND \|\|
	N11.getOpcode() != ISD::SIGN_EXTEND)
	return SDValue();

	// Peek through the extends.
	N00 = N00.getOperand(0);
	N01 = N01.getOperand(0);
	N10 = N10.getOperand(0);
	N11 = N11.getOperand(0);

	// Must be extending from vXi16.
	EVT InVT = N00.getValueType();
	if (InVT.getVectorElementType() != MVT::i16 \|\| N01.getValueType() != InVT \|\|
	N10.getValueType() != InVT \|\| N11.getValueType() != InVT)
	return SDValue();

	// All inputs should be build_vectors.
	if (N00.getOpcode() != ISD::BUILD_VECTOR \|\|
	N01.getOpcode() != ISD::BUILD_VECTOR \|\|
	N10.getOpcode() != ISD::BUILD_VECTOR \|\|
	N11.getOpcode() != ISD::BUILD_VECTOR)
	return SDValue();

	// For each element, we need to ensure we have an odd element from one vector
	// multiplied by the odd element of another vector and the even element from
	// one of the same vectors being multiplied by the even element from the
	// other vector. So we need to make sure for each element i, this operator
	// is being performed:
	// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
	SDValue In0, In1;
	for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
	SDValue N00Elt = N00.getOperand(i);
	SDValue N01Elt = N01.getOperand(i);
	SDValue N10Elt = N10.getOperand(i);
	SDValue N11Elt = N11.getOperand(i);
	// TODO: Be more tolerant to undefs.
	if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	return SDValue();
	auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
	auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
	auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
	auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
	if (!ConstN00Elt \|\| !ConstN01Elt \|\| !ConstN10Elt \|\| !ConstN11Elt)
	return SDValue();
	unsigned IdxN00 = ConstN00Elt->getZExtValue();
	unsigned IdxN01 = ConstN01Elt->getZExtValue();
	unsigned IdxN10 = ConstN10Elt->getZExtValue();
	unsigned IdxN11 = ConstN11Elt->getZExtValue();
	// Add is commutative so indices can be reordered.
	if (IdxN00 > IdxN10) {
	std::swap(IdxN00, IdxN10);
	std::swap(IdxN01, IdxN11);
	}
	// N0 indices be the even element. N1 indices must be the next odd element.
	if (IdxN00 != 2 * i \|\| IdxN10 != 2 * i + 1 \|\|
	IdxN01 != 2 * i \|\| IdxN11 != 2 * i + 1)
	return SDValue();
	SDValue N00In = N00Elt.getOperand(0);
	SDValue N01In = N01Elt.getOperand(0);
	SDValue N10In = N10Elt.getOperand(0);
	SDValue N11In = N11Elt.getOperand(0);
	// First time we find an input capture it.
	if (!In0) {
	In0 = N00In;
	In1 = N01In;
	}
	// Mul is commutative so the input vectors can be in any order.
	// Canonicalize to make the compares easier.
	if (In0 != N00In)
	std::swap(N00In, N01In);
	if (In0 != N10In)
	std::swap(N10In, N11In);
	if (In0 != N00In \|\| In1 != N01In \|\| In0 != N10In \|\| In1 != N11In)
	return SDValue();
	}

	auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	ArrayRef<SDValue> Ops) {
	// Shrink by adding truncate nodes and let DAGCombine fold with the
	// sources.
	EVT OpVT = Ops[0].getValueType();
	assert(OpVT.getScalarType() == MVT::i16 &&
	"Unexpected scalar element type");
	assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
	EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	OpVT.getVectorNumElements() / 2);
	return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
	};
	return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
	PMADDBuilder);
	}

	/// Try to turn an integer vector ADD/SUB into X86ISD::HADD/HSUB.
	///
	/// Applies only to v8i16, v4i32, v16i16 and v8i32 on SSSE3-capable
	/// subtargets, and only when isHorizontalBinOp accepts the operands. If that
	/// helper fills in a post-shuffle mask, the horizontal op is followed by a
	/// shuffle with that mask. Returns an empty SDValue otherwise.
	static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
	                                           const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);
	  bool IsAdd = N->getOpcode() == ISD::ADD;
	  assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");

	  SmallVector<int, 8> PostShuffleMask;
	  if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
	       VT == MVT::v8i32) &&
	      Subtarget.hasSSSE3() &&
	      isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) {
	    auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
	                              ArrayRef<SDValue> Ops) {
	      return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL,
	                         Ops[0].getValueType(), Ops);
	    };
	    SDValue HorizBinOp =
	        SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
	    if (!PostShuffleMask.empty())
	      HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
	                                        DAG.getUNDEF(VT), PostShuffleMask);
	    return HorizBinOp;
	  }

	  return SDValue();
	}

	/// Target combine for ISD::ADD. Tries the PMADDWD matchers, then horizontal
	/// add formation, then the vXi1 zext-to-sext rewrite, and finally defers to
	/// combineAddOrSubToADCOrSBB.
	static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI,
	                          const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  SDValue MAdd = matchPMADDWD(DAG, LHS, RHS, SDLoc(N), VT, Subtarget);
	  if (MAdd)
	    return MAdd;
	  MAdd = matchPMADDWD_2(DAG, LHS, RHS, SDLoc(N), VT, Subtarget);
	  if (MAdd)
	    return MAdd;

	  // Try to synthesize horizontal adds from adds of shuffles.
	  SDValue HAdd = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget);
	  if (HAdd)
	    return HAdd;

	  // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
	  // (sub Y, (sext (vXi1 X))).
	  // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
	  // generic DAG combine without a legal type check, but adding this there
	  // caused regressions.
	  if (VT.isVector()) {
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	    // True if V is a zero-extend whose source is a legal vXi1 vector.
	    auto IsZExtOfLegalMask = [&TLI](SDValue V) {
	      if (V.getOpcode() != ISD::ZERO_EXTEND)
	        return false;
	      EVT MaskVT = V.getOperand(0).getValueType();
	      return MaskVT.getVectorElementType() == MVT::i1 &&
	             TLI.isTypeLegal(MaskVT);
	    };

	    // Build (sub Other, (sext Mask)) from the zext's source mask.
	    auto SubOfSExt = [&](SDValue ZExt, SDValue Other) {
	      SDLoc DL(N);
	      SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, ZExt.getOperand(0));
	      return DAG.getNode(ISD::SUB, DL, VT, Other, SExt);
	    };

	    if (IsZExtOfLegalMask(LHS))
	      return SubOfSExt(LHS, RHS);
	    if (IsZExtOfLegalMask(RHS))
	      return SubOfSExt(RHS, LHS);
	  }

	  return combineAddOrSubToADCOrSBB(N, DAG);
	}

	/// Try to turn a vector SUB into an unsigned saturating subtract.
	///
	/// Matches umax(a,b) - b and a - umin(a,b) (including a umin whose result has
	/// been truncated to i8/i16 elements) and emits ISD::USUBSAT. For v8i32, v8i64
	/// and v16i32 the subtract is done on a narrower vector type and the result is
	/// extended back, which is only attempted when enough leading bits of the
	/// saturating-subtract LHS are known to be zero.
	///
	/// \returns the replacement value, or an empty SDValue if nothing matched.
	static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);
	EVT VT = N->getValueType(0);

	// Only vector subtracts are considered.
	if (!VT.isVector())
	return SDValue();

	// PSUBUS is supported, starting from SSE2, but truncation for v8i32
	// is only worth it with SSSE3 (PSHUFB).
	EVT EltVT = VT.getVectorElementType();
	// Bail out unless the type/subtarget combination is one handled below.
	if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 \|\| EltVT == MVT::i16)) &&
	!(Subtarget.hasSSSE3() && (VT == MVT::v8i32 \|\| VT == MVT::v8i64)) &&
	!(Subtarget.useBWIRegs() && (VT == MVT::v16i32)))
	return SDValue();

	SDValue SubusLHS, SubusRHS;
	// Try to find umax(a,b) - b or a - umin(a,b) patterns
	// they may be converted to subus(a,b).
	// TODO: Need to add IR canonicalization for this code.
	if (Op0.getOpcode() == ISD::UMAX) {
	// umax(a,b) - b: the subtrahend must be one of the umax operands; the other
	// umax operand becomes the saturating-subtract LHS.
	SubusRHS = Op1;
	SDValue MaxLHS = Op0.getOperand(0);
	SDValue MaxRHS = Op0.getOperand(1);
	if (MaxLHS == Op1)
	SubusLHS = MaxRHS;
	else if (MaxRHS == Op1)
	SubusLHS = MaxLHS;
	else
	return SDValue();
	} else if (Op1.getOpcode() == ISD::UMIN) {
	// a - umin(a,b): the minuend must be one of the umin operands; the other
	// umin operand becomes the saturating-subtract RHS.
	SubusLHS = Op0;
	SDValue MinLHS = Op1.getOperand(0);
	SDValue MinRHS = Op1.getOperand(1);
	if (MinLHS == Op0)
	SubusRHS = MinRHS;
	else if (MinRHS == Op0)
	SubusRHS = MinLHS;
	else
	return SDValue();
	} else if (Op1.getOpcode() == ISD::TRUNCATE &&
	Op1.getOperand(0).getOpcode() == ISD::UMIN &&
	(EltVT == MVT::i8 \|\| EltVT == MVT::i16)) {
	// Special case where the UMIN has been truncated. Try to push the truncate
	// further up. This is similar to the i32/i64 special processing.
	SubusLHS = Op0;
	SDValue MinLHS = Op1.getOperand(0).getOperand(0);
	SDValue MinRHS = Op1.getOperand(0).getOperand(1);
	EVT TruncVT = Op1.getOperand(0).getValueType();
	// The wide umin type must be one of the types handled for this subtarget.
	if (!(Subtarget.hasSSSE3() && (TruncVT == MVT::v8i32 \|\|
	TruncVT == MVT::v8i64)) &&
	!(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32)))
	return SDValue();
	// One umin operand must be zext(Op0); the other one is the value that gets
	// saturated and truncated.
	SDValue OpToSaturate;
	if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
	MinLHS.getOperand(0) == Op0)
	OpToSaturate = MinRHS;
	else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND &&
	MinRHS.getOperand(0) == Op0)
	OpToSaturate = MinLHS;
	else
	return SDValue();

	// Saturate the non-extended input and then truncate it.
	SDLoc DL(N);
	// Constant with the low VT-element-width bits set, in the wide element type.
	SDValue SaturationConst =
	DAG.getConstant(APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(),
	VT.getScalarSizeInBits()),
	DL, TruncVT);
	SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate,
	SaturationConst);
	SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin);
	} else
	return SDValue();

	// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
	// special preprocessing in some cases.
	if (EltVT == MVT::i8 \|\| EltVT == MVT::i16)
	return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS);

	assert((VT == MVT::v8i32 \|\| VT == MVT::v16i32 \|\| VT == MVT::v8i64) &&
	"Unexpected VT!");

	// Special preprocessing case can be only applied
	// if the value was zero extended from 16 bit,
	// so we require first 16 bits to be zeros for 32 bit
	// values, or first 48 bits for 64 bit values.
	KnownBits Known = DAG.computeKnownBits(SubusLHS);
	unsigned NumZeros = Known.countMinLeadingZeros();
	if ((VT == MVT::v8i64 && NumZeros < 48) \|\| NumZeros < 16)
	return SDValue();

	// Pick the narrow type to perform the saturating subtract in. For v16i32,
	// i8 elements are used when at least 24 leading bits are known zero.
	EVT ExtType = SubusLHS.getValueType();
	EVT ShrinkedType;
	if (VT == MVT::v8i32 \|\| VT == MVT::v8i64)
	ShrinkedType = MVT::v8i16;
	else
	ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;

	// If SubusLHS is zero-extended, clamp SubusRHS to its narrow
	// size: SubusRHS = umin(0xFFF.., SubusRHS).
	SDValue SaturationConst =
	DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
	ShrinkedType.getScalarSizeInBits()),
	SDLoc(SubusLHS), ExtType);
	SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
	SaturationConst);
	// Narrow both operands and do the saturating subtract in the narrow type.
	SDValue NewSubusLHS =
	DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
	SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
	SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType,
	NewSubusLHS, NewSubusRHS);

	// Zero extend the result, it may be used somewhere as 32 bit,
	// if not zext and following trunc will shrink.
	return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
	}

	/// Target-specific DAG combine for a SUB node.
	///
	/// The folds are attempted in this order:
	///   1. constant - (X ^ constant), where the XOR has a single use;
	///   2. horizontal-subtract formation (combineAddOrSubToHADDorHSUB);
	///   3. saturating-subtract formation (combineSubToSubus);
	///   4. the ADC/SBB combine (combineAddOrSubToADCOrSBB).
	///
	/// \returns the value produced by the first fold that succeeds; the result of
	/// the last step is returned as-is.
	static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI,
	                          const X86Subtarget &Subtarget) {
	  SDValue Minuend = N->getOperand(0);
	  SDValue Subtrahend = N->getOperand(1);

	  // X86 has no encoding for a subtract with an immediate on the left. When
	  // the right-hand side is a single-use XOR with a constant, use
	  //   C - (X ^ K)  ==  (X ^ ~K) + (C + 1)
	  // so the negation is absorbed into the XOR immediate and a register is saved.
	  auto *ImmLHS = dyn_cast<ConstantSDNode>(Minuend);
	  const bool RHSIsOneUseXorImm =
	      ImmLHS && Subtrahend->hasOneUse() &&
	      Subtrahend.getOpcode() == ISD::XOR &&
	      isa<ConstantSDNode>(Subtrahend.getOperand(1));
	  if (RHSIsOneUseXorImm) {
	    EVT Ty = Minuend.getValueType();
	    const APInt &XorImm = Subtrahend.getConstantOperandAPInt(1);

	    SDValue FlippedImm = DAG.getConstant(~XorImm, SDLoc(Subtrahend), Ty);
	    SDValue FlippedXor = DAG.getNode(ISD::XOR, SDLoc(Subtrahend), Ty,
	                                     Subtrahend.getOperand(0), FlippedImm);
	    SDValue BumpedImm =
	        DAG.getConstant(ImmLHS->getAPIntValue() + 1, SDLoc(N), Ty);
	    return DAG.getNode(ISD::ADD, SDLoc(N), Ty, FlippedXor, BumpedImm);
	  }

	  // Subtracts of shuffles may be expressible as a horizontal subtract.
	  if (SDValue HorizSub = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget))
	    return HorizSub;

	  // A subtract fed by umax/umin may be expressible as PSUBUS.
	  if (SDValue SatSub = combineSubToSubus(N, DAG, Subtarget))
	    return SatSub;

	  return combineAddOrSubToADCOrSBB(N, DAG);
	}

	/// Fold an X86 packed compare whose two operands are the same value.
	///
	/// PCMPEQ of a value with itself becomes the constant -1 and PCMPGT of a value
	/// with itself becomes the constant 0. Any other input yields an empty SDValue.
	static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  MVT ResultVT = N->getSimpleValueType(0);
	  SDLoc Loc(N);

	  // Nothing to fold unless both inputs are the same value.
	  if (N->getOperand(0) != N->getOperand(1))
	    return SDValue();

	  switch (N->getOpcode()) {
	  case X86ISD::PCMPEQ:
	    return DAG.getConstant(-1, Loc, ResultVT);
	  case X86ISD::PCMPGT:
	    return DAG.getConstant(0, Loc, ResultVT);
	  default:
	    return SDValue();
	  }
	}

	/// Helper that combines an array of subvector ops as if they were the operands
	/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
	/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
	///
	/// \param DL   Location used for every node created here.
	/// \param VT   Type of the concatenated result.
	/// \param Ops  The subvector operands being concatenated.
	/// \returns the folded value, or an empty SDValue if no fold applied.
	///
	/// Asserts that the subtarget has AVX.
	static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
	ArrayRef<SDValue> Ops, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
	unsigned EltSizeInBits = VT.getScalarSizeInBits();

	// Concatenating only undefs gives undef.
	if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
	return DAG.getUNDEF(VT);

	// Concatenating only all-zero build vectors gives a zero vector.
	if (llvm::all_of(Ops, [](SDValue Op) {
	return ISD::isBuildVectorAllZeros(Op.getNode());
	}))
	return getZeroVector(VT, Subtarget, DAG, DL);

	// IsSplat is true when every operand is the same value as the first one.
	SDValue Op0 = Ops[0];
	bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });

	// Fold subvector loads into one.
	// If needed, look through bitcasts to get to the load.
	if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
	bool Fast;
	const X86TargetLowering *TLI = Subtarget.getTargetLowering();
	// Only attempt the merge when a VT-sized access through the first load's
	// memory operand is both allowed and reported as fast.
	if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
	*FirstLd->getMemOperand(), &Fast) &&
	Fast) {
	if (SDValue Ld =
	EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
	return Ld;
	}
	}

	// Repeated subvectors.
	if (IsSplat) {
	// If this broadcast/subv_broadcast is inserted into both halves, use a
	// larger broadcast/subv_broadcast.
	if (Op0.getOpcode() == X86ISD::VBROADCAST \|\|
	Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
	return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));

	// If this broadcast_load is inserted into both halves, use a larger
	// broadcast_load. Update other uses to use an extracted subvector.
	if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
	auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
	SDVTList Tys = DAG.getVTList(VT, MVT::Other);
	// Note: this local array shadows the Ops parameter inside this scope.
	SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
	SDValue BcastLd = DAG.getMemIntrinsicNode(
	X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
	MemIntr->getMemOperand());
	// Redirect users of the narrow broadcast value to the low subvector of
	// the wide one, and users of its chain result to the new chain result.
	DAG.ReplaceAllUsesOfValueWith(
	Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
	DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
	return BcastLd;
	}

	// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
	// Requires AVX2, or a source operand that MayFoldLoad accepts.
	if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
	(Subtarget.hasAVX2() \|\| MayFoldLoad(Op0.getOperand(0))))
	return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
	Op0.getOperand(0),
	DAG.getIntPtrConstant(0, DL)));

	// concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
	// The scalar must already have the result's element type.
	if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	(Subtarget.hasAVX2() \|\|
	(EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
	Op0.getOperand(0).getValueType() == VT.getScalarType())
	return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));

	// concat_vectors(extract_subvector(broadcast(x)),
	// extract_subvector(broadcast(x))) -> broadcast(x)
	// Only when the broadcast being extracted from already has type VT.
	if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	Op0.getOperand(0).getValueType() == VT) {
	if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST \|\|
	Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
	return Op0.getOperand(0);
	}
	}

	// Repeated opcode.
	// TODO - combineX86ShufflesRecursively should handle shuffle concatenation
	// but it currently struggles with different vector widths.
	if (llvm::all_of(Ops, [Op0](SDValue Op) {
	return Op.getOpcode() == Op0.getOpcode();
	})) {
	unsigned NumOps = Ops.size();
	switch (Op0.getOpcode()) {
	case X86ISD::SHUFP: {
	// Add SHUFPD support if/when necessary.
	// All operands must share the same third operand (operand 2); the two
	// sources are concatenated separately and shuffled once at full width.
	if (!IsSplat && VT.getScalarType() == MVT::f32 &&
	llvm::all_of(Ops, [Op0](SDValue Op) {
	return Op.getOperand(2) == Op0.getOperand(2);
	})) {
	SmallVector<SDValue, 2> LHS, RHS;
	for (unsigned i = 0; i != NumOps; ++i) {
	LHS.push_back(Ops[i].getOperand(0));
	RHS.push_back(Ops[i].getOperand(1));
	}
	return DAG.getNode(Op0.getOpcode(), DL, VT,
	DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
	DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
	Op0.getOperand(2));
	}
	break;
	}
	case X86ISD::PSHUFHW:
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFD:
	// Two 128-bit halves with the same operand 1 become one 256-bit node
	// when the subtarget reports hasInt256().
	if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
	Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
	SmallVector<SDValue, 2> Src;
	for (unsigned i = 0; i != NumOps; ++i)
	Src.push_back(Ops[i].getOperand(0));
	return DAG.getNode(Op0.getOpcode(), DL, VT,
	DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
	Op0.getOperand(1));
	}
	// Otherwise deliberately fall through and try the VPERMILPI form below.
	LLVM_FALLTHROUGH;
	case X86ISD::VPERMILPI:
	// TODO - add support for vXf64/vXi64 shuffles.
	// The sources are bitcast to v4f32, concatenated to v8f32, shuffled with
	// VPERMILPI using the shared operand 1, and bitcast back to VT.
	if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 \|\| VT == MVT::v8i32) &&
	Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
	SmallVector<SDValue, 2> Src;
	for (unsigned i = 0; i != NumOps; ++i)
	Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
	Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
	Op0.getOperand(1));
	return DAG.getBitcast(VT, Res);
	}
	break;
	case X86ISD::VSHLI:
	case X86ISD::VSRAI:
	case X86ISD::VSRLI:
	// Shifts with the same operand 1 on every piece: concatenate the sources
	// and shift once. 512-bit results with elements narrower than 32 bits
	// additionally require useBWIRegs().
	if (((VT.is256BitVector() && Subtarget.hasInt256()) \|\|
	(VT.is512BitVector() && Subtarget.useAVX512Regs() &&
	(EltSizeInBits >= 32 \|\| Subtarget.useBWIRegs()))) &&
	llvm::all_of(Ops, [Op0](SDValue Op) {
	return Op0.getOperand(1) == Op.getOperand(1);
	})) {
	SmallVector<SDValue, 2> Src;
	for (unsigned i = 0; i != NumOps; ++i)
	Src.push_back(Ops[i].getOperand(0));
	return DAG.getNode(Op0.getOpcode(), DL, VT,
	DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
	Op0.getOperand(1));
	}
	break;
	case X86ISD::VPERMI:
	case X86ISD::VROTLI:
	case X86ISD::VROTRI:
	// Only for 512-bit results with useAVX512Regs(), and only when every piece
	// uses the same operand 1.
	if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
	llvm::all_of(Ops, [Op0](SDValue Op) {
	return Op0.getOperand(1) == Op.getOperand(1);
	})) {
	SmallVector<SDValue, 2> Src;
	for (unsigned i = 0; i != NumOps; ++i)
	Src.push_back(Ops[i].getOperand(0));
	return DAG.getNode(Op0.getOpcode(), DL, VT,
	DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
	Op0.getOperand(1));
	}
	break;
	case X86ISD::PACKSS:
	case X86ISD::PACKUS:
	// Two packs become one 256-bit pack. The concatenated sources use the
	// source element type with NumOps times as many elements.
	if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
	Subtarget.hasInt256()) {
	SmallVector<SDValue, 2> LHS, RHS;
	for (unsigned i = 0; i != NumOps; ++i) {
	LHS.push_back(Ops[i].getOperand(0));
	RHS.push_back(Ops[i].getOperand(1));
	}
	MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
	SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
	NumOps * SrcVT.getVectorNumElements());
	return DAG.getNode(Op0.getOpcode(), DL, VT,
	DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
	DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
	}
	break;
	case X86ISD::PALIGNR:
	// Every piece must share operand 2; 512-bit results need useBWIRegs().
	if (!IsSplat &&
	((VT.is256BitVector() && Subtarget.hasInt256()) \|\|
	(VT.is512BitVector() && Subtarget.useBWIRegs())) &&
	llvm::all_of(Ops, [Op0](SDValue Op) {
	return Op0.getOperand(2) == Op.getOperand(2);
	})) {
	SmallVector<SDValue, 2> LHS, RHS;
	for (unsigned i = 0; i != NumOps; ++i) {
	LHS.push_back(Ops[i].getOperand(0));
	RHS.push_back(Ops[i].getOperand(1));
	}
	return DAG.getNode(Op0.getOpcode(), DL, VT,
	DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
	DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
	Op0.getOperand(2));
	}
	break;
	}
	}

	// No fold applied.
	return SDValue();
	}

	/// Target-specific DAG combine for a CONCAT_VECTORS node.
	///
	/// Vectors of i1 are left untouched. Otherwise, when the subtarget has AVX and
	/// both the result type and the type of the first operand are legal, the
	/// operand list is handed to combineConcatVectorOps and its result (possibly
	/// empty) is returned. In every other case an empty SDValue is returned.
	static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    const X86Subtarget &Subtarget) {
	  EVT ResultVT = N->getValueType(0);
	  EVT PieceVT = N->getOperand(0).getValueType();
	  const TargetLowering &Lowering = DAG.getTargetLoweringInfo();

	  // Mask (i1) vectors are not handled here.
	  if (ResultVT.getVectorElementType() == MVT::i1)
	    return SDValue();

	  const bool CanCombine = Subtarget.hasAVX() &&
	                          Lowering.isTypeLegal(ResultVT) &&
	                          Lowering.isTypeLegal(PieceVT);
	  if (!CanCombine)
	    return SDValue();

	  SmallVector<SDValue, 4> Pieces(N->op_begin(), N->op_end());
	  return combineConcatVectorOps(SDLoc(N), ResultVT.getSimpleVT(), Pieces, DAG,
	                                DCI, Subtarget);
	}

	/// Target-specific DAG combine for an INSERT_SUBVECTOR node.
	///
	/// Does nothing before operation legalization. Afterwards it folds inserts
	/// that only involve undef/zero vectors, simplifies nested inserts into zero
	/// vectors, turns an insert of an extract from a same-typed vector into a
	/// shuffle, tries the concat-style folds of combineConcatVectorOps, and widens
	/// broadcasts that are inserted into an undef vector. For i1 vectors only the
	/// undef/zero folds are attempted.
	///
	/// \returns the replacement value, or an empty SDValue if no fold applied.
	static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	MVT OpVT = N->getSimpleValueType(0);

	bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;

	SDLoc dl(N);
	// Operand 0 is the vector inserted into, operand 1 the inserted subvector and
	// operand 2 the constant insertion index.
	SDValue Vec = N->getOperand(0);
	SDValue SubVec = N->getOperand(1);

	uint64_t IdxVal = N->getConstantOperandVal(2);
	MVT SubVecVT = SubVec.getSimpleValueType();

	// Inserting undef into undef is undef.
	if (Vec.isUndef() && SubVec.isUndef())
	return DAG.getUNDEF(OpVT);

	// Inserting undefs/zeros into zeros/undefs is a zero vector.
	if ((Vec.isUndef() \|\| ISD::isBuildVectorAllZeros(Vec.getNode())) &&
	(SubVec.isUndef() \|\| ISD::isBuildVectorAllZeros(SubVec.getNode())))
	return getZeroVector(OpVT, Subtarget, DAG, dl);

	if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
	// If we're inserting into a zero vector and then into a larger zero vector,
	// just insert into the larger zero vector directly.
	if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
	ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
	uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
	// The new index is the sum of the outer and inner insertion indices.
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
	getZeroVector(OpVT, Subtarget, DAG, dl),
	SubVec.getOperand(1),
	DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
	}

	// If we're inserting into a zero vector and our input was extracted from an
	// insert into a zero vector of the same type and the extraction was at
	// least as large as the original insertion. Just insert the original
	// subvector into a zero vector.
	if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
	isNullConstant(SubVec.getOperand(1)) &&
	SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
	SDValue Ins = SubVec.getOperand(0);
	if (isNullConstant(Ins.getOperand(2)) &&
	ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
	Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
	getZeroVector(OpVT, Subtarget, DAG, dl),
	Ins.getOperand(1), N->getOperand(2));
	}
	}

	// Stop here if this is an i1 vector.
	if (IsI1Vector)
	return SDValue();

	// If this is an insert of an extract, combine to a shuffle. Don't do this
	// if the insert or extract can be represented with a subregister operation.
	if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	SubVec.getOperand(0).getSimpleValueType() == OpVT &&
	(IdxVal != 0 \|\|
	!(Vec.isUndef() \|\| ISD::isBuildVectorAllZeros(Vec.getNode())))) {
	int ExtIdxVal = SubVec.getConstantOperandVal(1);
	if (ExtIdxVal != 0) {
	int VecNumElts = OpVT.getVectorNumElements();
	int SubVecNumElts = SubVecVT.getVectorNumElements();
	SmallVector<int, 64> Mask(VecNumElts);
	// First create an identity shuffle mask.
	for (int i = 0; i != VecNumElts; ++i)
	Mask[i] = i;
	// Now insert the extracted portion.
	// The entries are offset by VecNumElts so that they refer to the second
	// shuffle input, i.e. the vector the subvector was extracted from.
	for (int i = 0; i != SubVecNumElts; ++i)
	Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

	return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
	}
	}

	// Match concat_vector style patterns.
	SmallVector<SDValue, 2> SubVectorOps;
	if (collectConcatOps(N, SubVectorOps)) {
	if (SDValue Fold =
	combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
	return Fold;

	// If we're inserting all zeros into the upper half, change this to
	// a concat with zero. We will match this to a move
	// with implicit upper bit zeroing during isel.
	// We do this here because we don't want combineConcatVectorOps to
	// create INSERT_SUBVECTOR from CONCAT_VECTORS.
	if (SubVectorOps.size() == 2 &&
	ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
	getZeroVector(OpVT, Subtarget, DAG, dl),
	SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
	}

	// If this is a broadcast insert into an upper undef, use a larger broadcast.
	if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
	return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));

	// If this is a broadcast load inserted into an upper undef, use a larger
	// broadcast load.
	if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
	SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
	auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
	SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
	SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
	SDValue BcastLd =
	DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
	MemIntr->getMemoryVT(),
	MemIntr->getMemOperand());
	// Users of the old load's chain result now use the new load's chain result.
	DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
	return BcastLd;
	}

	return SDValue();
	}

	/// Narrow a vector select that is only consumed through a 128-bit subvector
	/// extract.
	///
	/// Applies when the extract source (looking through bitcasts) is a VSELECT
	/// whose condition is built from concatenated vectors and is 256 or 512 bits
	/// wide. The condition and both select arms are then extracted at 128 bits and
	/// a narrow select is formed, bitcast to the type of the extract. This is a
	/// common pattern for AVX1 integer code because 256-bit selects may be legal
	/// while there is almost no 256-bit integer math/logic available.
	///
	/// Must only be called with legal types: the simple-value-type accessors used
	/// here assert otherwise.
	///
	/// \returns the narrowed select, or an empty SDValue if the pattern does not
	/// match.
	static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
	  SDValue Select = peekThroughBitcasts(Ext->getOperand(0));
	  if (Select.getOpcode() != ISD::VSELECT)
	    return SDValue();

	  // The select condition has to decompose into concatenated pieces.
	  SmallVector<SDValue, 4> CondPieces;
	  if (!collectConcatOps(Select.getOperand(0).getNode(), CondPieces))
	    return SDValue();

	  // Simple value types are assumed from here on (legal operations/types only).
	  // TODO: This can be extended to handle extraction to 256-bits.
	  MVT ResultVT = Ext->getSimpleValueType(0);
	  if (!ResultVT.is128BitVector())
	    return SDValue();

	  MVT CondVT = Select.getOperand(0).getSimpleValueType();
	  const bool CondIsWide = CondVT.is256BitVector() || CondVT.is512BitVector();
	  if (!CondIsWide)
	    return SDValue();

	  MVT SourceVT = Ext->getOperand(0).getSimpleValueType();
	  MVT SelectVT = Select.getSimpleValueType();
	  assert((SelectVT.is256BitVector() || SelectVT.is512BitVector()) &&
	         "Unexpected vector type with legal operations");

	  // Re-express the extraction index in units of the select's elements.
	  const unsigned NumSelectElts = SelectVT.getVectorNumElements();
	  const unsigned NumSourceElts = SourceVT.getVectorNumElements();
	  unsigned SubIdx = Ext->getConstantOperandVal(1);
	  if (NumSelectElts % NumSourceElts != 0) {
	    if (NumSourceElts % NumSelectElts != 0)
	      llvm_unreachable("Element count of simple vector types are not divisible?");
	    // The select has fewer (wider) elements than the extract source, so the
	    // index must divide evenly by the ratio.
	    const unsigned Ratio = NumSourceElts / NumSelectElts;
	    if (SubIdx % Ratio != 0)
	      return SDValue();
	    SubIdx /= Ratio;
	  } else {
	    // The select has the same number of, or more (narrower), elements than
	    // the extract source, so the index scales up by the ratio.
	    SubIdx *= (NumSelectElts / NumSourceElts);
	  }

	  unsigned NarrowingFactor = SourceVT.getSizeInBits() / ResultVT.getSizeInBits();
	  unsigned NumNarrowElts = NumSelectElts / NarrowingFactor;
	  MVT NarrowSelectVT =
	      MVT::getVectorVT(SelectVT.getVectorElementType(), NumNarrowElts);

	  SDLoc Loc(Ext);
	  SDValue NarrowCond = extract128BitVector(Select.getOperand(0), SubIdx, DAG, Loc);
	  SDValue NarrowTrue = extract128BitVector(Select.getOperand(1), SubIdx, DAG, Loc);
	  SDValue NarrowFalse = extract128BitVector(Select.getOperand(2), SubIdx, DAG, Loc);
	  SDValue NarrowSelect =
	      DAG.getSelect(Loc, NarrowSelectVT, NarrowCond, NarrowTrue, NarrowFalse);
	  return DAG.getBitcast(ResultVT, NarrowSelect);
	}

	/// Target-specific DAG combine for an EXTRACT_SUBVECTOR node.
	///
	/// Only handles simple result types. Before operation legalization the only
	/// fold is the AVX1 split of a 256-bit AND with a concatenated NOT operand.
	/// After legalization it additionally narrows selects, folds extracts of
	/// constants/build vectors, inserts into zero, broadcasts, target shuffles and
	/// (for index 0 with a single use) conversions, extends and 256-bit selects.
	///
	/// \returns the replacement value, or an empty SDValue if no fold applied.
	static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	// For AVX1 only, if we are extracting from a 256-bit and+not (which will
	// eventually get combined/lowered into ANDNP) with a concatenated operand,
	// split the 'and' into 128-bit ops to avoid the concatenate and extract.
	// We let generic combining take over from there to simplify the
	// insert/extract and 'not'.
	// This pattern emerges during AVX1 legalization. We handle it before lowering
	// to avoid complications like splitting constant vector loads.

	// Capture the original wide type in the likely case that we need to bitcast
	// back to this type.
	if (!N->getValueType(0).isSimple())
	return SDValue();

	// Operand 0 is the vector extracted from and operand 1 the constant index.
	MVT VT = N->getSimpleValueType(0);
	SDValue InVec = N->getOperand(0);
	unsigned IdxVal = N->getConstantOperandVal(1);
	SDValue InVecBC = peekThroughBitcasts(InVec);
	EVT InVecVT = InVec.getValueType();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
	TLI.isTypeLegal(InVecVT) &&
	InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) {
	// True when V (through bitcasts) is a bitwise NOT whose operand (through
	// bitcasts) is a CONCAT_VECTORS.
	auto isConcatenatedNot = [] (SDValue V) {
	V = peekThroughBitcasts(V);
	if (!isBitwiseNot(V))
	return false;
	SDValue NotOp = V->getOperand(0);
	return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
	};
	if (isConcatenatedNot(InVecBC.getOperand(0)) \|\|
	isConcatenatedNot(InVecBC.getOperand(1))) {
	// extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
	SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
	DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
	}
	}

	// Everything below runs only once operations have been legalized.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	if (SDValue V = narrowExtractedVectorSelect(N, DAG))
	return V;

	// Extracting from an all-zeros build vector gives a zero vector.
	if (ISD::isBuildVectorAllZeros(InVec.getNode()))
	return getZeroVector(VT, Subtarget, DAG, SDLoc(N));

	// Extracting from an all-ones build vector gives all ones; i1 vectors use
	// the constant 1 instead of getOnesVector.
	if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
	if (VT.getScalarType() == MVT::i1)
	return DAG.getConstant(1, SDLoc(N), VT);
	return getOnesVector(VT, DAG, SDLoc(N));
	}

	// Extracting from any other BUILD_VECTOR: build a smaller vector from the
	// corresponding slice of its operands.
	if (InVec.getOpcode() == ISD::BUILD_VECTOR)
	return DAG.getBuildVector(
	VT, SDLoc(N),
	InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));

	// If we are extracting from an insert into a zero vector, replace with a
	// smaller insert into zero if we don't access less than the original
	// subvector. Don't do this for i1 vectors.
	if (VT.getVectorElementType() != MVT::i1 &&
	InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
	InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
	ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
	InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) {
	SDLoc DL(N);
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
	getZeroVector(VT, Subtarget, DAG, DL),
	InVec.getOperand(1), InVec.getOperand(2));
	}

	// If we're extracting from a broadcast then we're better off just
	// broadcasting to the smaller type directly, assuming this is the only use.
	// As it's a broadcast we don't care about the extraction index.
	if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
	InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
	return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));

	// Same idea for a single-use broadcast load whose memory type fits in VT:
	// emit a narrower broadcast load and move the chain users over to it.
	if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) {
	auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
	if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) {
	SDVTList Tys = DAG.getVTList(VT, MVT::Other);
	SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
	SDValue BcastLd =
	DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
	MemIntr->getMemoryVT(),
	MemIntr->getMemOperand());
	DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
	return BcastLd;
	}
	}

	// If we're extracting an upper subvector from a broadcast we should just
	// extract the lowest subvector instead which should allow
	// SimplifyDemandedVectorElts do more simplifications.
	if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST \|\|
	InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
	return extractSubVector(InVec, 0, DAG, SDLoc(N), VT.getSizeInBits());

	// If we're extracting a broadcasted subvector, just use the source.
	if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST &&
	InVec.getOperand(0).getValueType() == VT)
	return InVec.getOperand(0);

	// Attempt to extract from the source of a shuffle vector.
	// Requires the input size to be a multiple of VT's size and the index to be
	// a multiple of VT's element count.
	if ((InVecVT.getSizeInBits() % VT.getSizeInBits()) == 0 &&
	(IdxVal % VT.getVectorNumElements()) == 0) {
	SmallVector<int, 32> ShuffleMask;
	SmallVector<int, 32> ScaledMask;
	SmallVector<SDValue, 2> ShuffleInputs;
	unsigned NumSubVecs = InVecVT.getSizeInBits() / VT.getSizeInBits();
	// Decode the shuffle mask and scale it so it's shuffling subvectors.
	if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
	scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
	unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
	// Undef and zero sentinels fold directly to undef / a zero vector.
	if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
	return DAG.getUNDEF(VT);
	if (ScaledMask[SubVecIdx] == SM_SentinelZero)
	return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
	// Otherwise the scaled mask entry selects both the shuffle input
	// (entry / NumSubVecs) and the subvector within it (entry % NumSubVecs).
	SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
	if (Src.getValueSizeInBits() == InVecVT.getSizeInBits()) {
	unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
	unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
	return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
	SDLoc(N), VT.getSizeInBits());
	}
	}
	}

	// If we're extracting the lowest subvector and we're the only user,
	// we may be able to perform this with a smaller vector width.
	if (IdxVal == 0 && InVec.hasOneUse()) {
	unsigned InOpcode = InVec.getOpcode();
	if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
	// v2f64 CVTDQ2PD(v4i32).
	if (InOpcode == ISD::SINT_TO_FP &&
	InVec.getOperand(0).getValueType() == MVT::v4i32) {
	return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
	}
	// v2f64 CVTUDQ2PD(v4i32).
	if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
	InVec.getOperand(0).getValueType() == MVT::v4i32) {
	return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
	}
	// v2f64 CVTPS2PD(v4f32).
	if (InOpcode == ISD::FP_EXTEND &&
	InVec.getOperand(0).getValueType() == MVT::v4f32) {
	return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
	}
	}
	// A 128-bit extract of an any/zero/sign extend (or its *_VECTOR_INREG form)
	// of a 128-bit source becomes the matching *_EXTEND_VECTOR_INREG node
	// applied directly to that source.
	if ((InOpcode == ISD::ANY_EXTEND \|\|
	InOpcode == ISD::ANY_EXTEND_VECTOR_INREG \|\|
	InOpcode == ISD::ZERO_EXTEND \|\|
	InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG \|\|
	InOpcode == ISD::SIGN_EXTEND \|\|
	InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
	VT.is128BitVector() &&
	InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
	unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
	return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0));
	}
	// A VSELECT whose three operands are all 256 bits wide: select on the low
	// 128 bits of each operand instead.
	if (InOpcode == ISD::VSELECT &&
	InVec.getOperand(0).getValueType().is256BitVector() &&
	InVec.getOperand(1).getValueType().is256BitVector() &&
	InVec.getOperand(2).getValueType().is256BitVector()) {
	SDLoc DL(N);
	SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
	SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
	SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
	return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
	}
	}

	return SDValue();
	}

	/// Target-specific DAG combine for a SCALAR_TO_VECTOR node.
	///
	/// Folds performed:
	///   * v1i1 from a single-use (and X, 1): drop the AND;
	///   * v1i1 from a single-use extract of element 0 of an i1 vector: use an
	///     EXTRACT_SUBVECTOR instead;
	///   * v2i64/v2f64 from a single-use i64 that is an any-extend of at most 32
	///     bits (or an any-extending load of at most 32 bits): build a v4i32
	///     instead and bitcast;
	///   * v2i64 from a bitcast of an x86mmx value: emit MOVQ2DQ.
	///
	/// \returns the replacement value, or an empty SDValue if no fold applied.
	static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
	  EVT ResultVT = N->getValueType(0);
	  SDValue Scalar = N->getOperand(0);
	  SDLoc Loc(N);

	  // Both v1i1 folds need the scalar to have a single use.
	  // TODO: SimplifyDemandedBits instead?
	  if (ResultVT == MVT::v1i1 && Scalar.hasOneUse()) {
	    switch (Scalar.getOpcode()) {
	    case ISD::AND: {
	      // Masking with 1 is redundant for v1i1; this shows up frequently in
	      // masked scalar intrinsic code and AVX512 floating point select
	      // lowering.
	      auto *MaskC = dyn_cast<ConstantSDNode>(Scalar.getOperand(1));
	      if (MaskC && MaskC->getAPIntValue().isOneValue())
	        return DAG.getNode(ISD::SCALAR_TO_VECTOR, Loc, MVT::v1i1,
	                           Scalar.getOperand(0));
	      break;
	    }
	    case ISD::EXTRACT_VECTOR_ELT: {
	      // scalar_to_vector(extract_vector_elt(i1 vector, 0)) is just the
	      // lowest one-element subvector.
	      EVT SourceVT = Scalar.getOperand(0).getValueType();
	      if (SourceVT.isVector() &&
	          SourceVT.getVectorElementType() == MVT::i1) {
	        auto *IdxC = dyn_cast<ConstantSDNode>(Scalar.getOperand(1));
	        if (IdxC && IdxC->isNullValue())
	          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, Loc, ResultVT,
	                             Scalar.getOperand(0), Scalar.getOperand(1));
	      }
	      break;
	    }
	    default:
	      break;
	    }
	  }

	  // Reduce v2i64 to v4i32 if we don't need the upper bits.
	  // TODO: Move to DAGCombine/SimplifyDemandedBits?
	  const bool IsTwoBy64 = ResultVT == MVT::v2i64 || ResultVT == MVT::v2f64;
	  if (IsTwoBy64) {
	    // Returns the value to narrow when Wide is a single-use i64 whose upper
	    // bits come from an any-extension, otherwise an empty SDValue.
	    auto FindAnyExtSource = [](SDValue Wide) -> SDValue {
	      if (Wide.getValueType() != MVT::i64)
	        return SDValue();
	      if (!Wide.hasOneUse())
	        return SDValue();
	      if (Wide.getOpcode() == ISD::ANY_EXTEND) {
	        SDValue Narrow = Wide.getOperand(0);
	        if (Narrow.getScalarValueSizeInBits() <= 32)
	          return Narrow;
	      }
	      auto *Load = dyn_cast<LoadSDNode>(Wide);
	      if (Load && Load->getExtensionType() == ISD::EXTLOAD &&
	          Load->getMemoryVT().getScalarSizeInBits() <= 32)
	        return Wide;
	      return SDValue();
	    };

	    if (SDValue Narrowable = FindAnyExtSource(peekThroughOneUseBitcasts(Scalar))) {
	      SDValue AsI32 = DAG.getAnyExtOrTrunc(Narrowable, Loc, MVT::i32);
	      SDValue AsV4I32 =
	          DAG.getNode(ISD::SCALAR_TO_VECTOR, Loc, MVT::v4i32, AsI32);
	      return DAG.getBitcast(ResultVT, AsV4I32);
	    }
	  }

	  // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
	  if (ResultVT == MVT::v2i64 && Scalar.getOpcode() == ISD::BITCAST) {
	    SDValue CastSource = Scalar.getOperand(0);
	    if (CastSource.getValueType() == MVT::x86mmx)
	      return DAG.getNode(X86ISD::MOVQ2DQ, Loc, ResultVT, CastSource);
	  }

	  return SDValue();
	}

	// Simplify PMULDQ and PMULUDQ operations.
	//
	// In order: move a constant operand to the right-hand side, fold a multiply by
	// an all-zeros build vector to zero, run SimplifyDemandedBits on the node, and
	// finally rewrite a single-use zero/sign extend-in-register v4i32 operand of a
	// v2i64 multiply as an explicit shuffle. Returns an empty SDValue if nothing
	// changed.
	static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI,
	                             const X86Subtarget &Subtarget) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  const unsigned Opcode = N->getOpcode();
	  EVT ResultVT = N->getValueType(0);
	  SDLoc Loc(N);

	  // Canonicalize constant to RHS.
	  const bool OnlyLHSIsConstant =
	      DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
	      !DAG.isConstantIntBuildVectorOrConstantInt(RHS);
	  if (OnlyLHSIsConstant)
	    return DAG.getNode(Opcode, Loc, ResultVT, RHS, LHS);

	  // Multiply by zero.
	  // A fresh zero constant is returned rather than RHS itself because RHS may
	  // contain UNDEFs.
	  if (ISD::isBuildVectorAllZeros(RHS.getNode()))
	    return DAG.getConstant(0, Loc, ResultVT);

	  // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
	  const TargetLowering &Lowering = DAG.getTargetLoweringInfo();
	  SDValue Self(N, 0);
	  if (Lowering.SimplifyDemandedBits(Self, APInt::getAllOnesValue(64), DCI))
	    return Self;

	  // If the input is an extend_invec and the SimplifyDemandedBits call didn't
	  // convert it to any_extend_invec, due to the LegalOperations check, do the
	  // conversion directly to a vector shuffle manually. This exposes combine
	  // opportunities missed by combineExtInVec not calling
	  // combineX86ShufflesRecursively on SSE4.1 targets.
	  // FIXME: This is basically a hack around several other issues related to
	  // ANY_EXTEND_VECTOR_INREG.
	  //
	  // The helper rewrites Operand in place and reports whether it did so.
	  auto ReplaceExtInVecWithShuffle = [&](SDValue &Operand) -> bool {
	    if (ResultVT != MVT::v2i64)
	      return false;
	    if (!Operand.hasOneUse())
	      return false;
	    const unsigned ExtOpcode = Operand.getOpcode();
	    if (ExtOpcode != ISD::ZERO_EXTEND_VECTOR_INREG &&
	        ExtOpcode != ISD::SIGN_EXTEND_VECTOR_INREG)
	      return false;
	    SDValue Narrow = Operand.getOperand(0);
	    if (Narrow.getValueType() != MVT::v4i32)
	      return false;

	    // Move elements 0 and 1 into the low half of each 64-bit lane; the upper
	    // halves are left undefined.
	    SDValue Spread = DAG.getVectorShuffle(MVT::v4i32, Loc, Narrow, Narrow,
	                                          { 0, -1, 1, -1 });
	    Operand = DAG.getBitcast(MVT::v2i64, Spread);
	    return true;
	  };

	  // The left operand is tried first; the right one only if the left did not
	  // match.
	  if (ReplaceExtInVecWithShuffle(LHS) || ReplaceExtInVecWithShuffle(RHS))
	    return DAG.getNode(Opcode, Loc, MVT::v2i64, LHS, RHS);

	  return SDValue();
	}

	/// Target-specific DAG combine for the *_EXTEND_VECTOR_INREG nodes.
	///
	/// After operation legalization, a single-use simple normal load feeding the
	/// extend is merged into one extending load when that extending load is legal.
	/// Otherwise, on AVX subtargets and for anything but the sign-extending form,
	/// the node is offered to combineX86ShufflesRecursively when both the result
	/// and input types are legal.
	///
	/// \returns the replacement value, or an empty SDValue if no fold applied.
	static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
	                               TargetLowering::DAGCombinerInfo &DCI,
	                               const X86Subtarget &Subtarget) {
	  EVT ResultVT = N->getValueType(0);
	  SDValue Input = N->getOperand(0);
	  const TargetLowering &Lowering = DAG.getTargetLoweringInfo();
	  const bool IsSignExtend = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG;

	  // Try to merge vector loads and extend_inreg to an extload.
	  const bool InputIsFoldableLoad = !DCI.isBeforeLegalizeOps() &&
	                                   ISD::isNormalLoad(Input.getNode()) &&
	                                   Input.hasOneUse();
	  if (InputIsFoldableLoad) {
	    auto *Load = cast<LoadSDNode>(Input);
	    if (Load->isSimple()) {
	      // The memory type keeps the loaded element type but has as many
	      // elements as the extended result.
	      MVT LoadedEltVT = Input.getSimpleValueType().getVectorElementType();
	      EVT MemVT = EVT::getVectorVT(*DAG.getContext(), LoadedEltVT,
	                                   ResultVT.getVectorNumElements());
	      ISD::LoadExtType ExtKind = ISD::ZEXTLOAD;
	      if (IsSignExtend)
	        ExtKind = ISD::SEXTLOAD;

	      if (Lowering.isLoadExtLegal(ExtKind, ResultVT, MemVT)) {
	        SDValue ExtLoad = DAG.getExtLoad(
	            ExtKind, SDLoc(N), ResultVT, Load->getChain(), Load->getBasePtr(),
	            Load->getPointerInfo(), MemVT, Load->getOriginalAlign(),
	            Load->getMemOperand()->getFlags());
	        // Users of the old load's chain result now use the new chain result.
	        DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), ExtLoad.getValue(1));
	        return ExtLoad;
	      }
	    }
	  }

	  // Attempt to combine as a shuffle.
	  // TODO: SSE41 support
	  if (Subtarget.hasAVX() && !IsSignExtend) {
	    SDValue Root(N, 0);
	    const bool TypesAreLegal = Lowering.isTypeLegal(ResultVT) &&
	                               Lowering.isTypeLegal(Input.getValueType());
	    if (TypesAreLegal)
	      return combineX86ShufflesRecursively(Root, DAG, Subtarget);
	  }

	  return SDValue();
	}

	/// Combine for X86ISD::KSHIFTL / KSHIFTR. Folds a shift of an all-zeros
	/// build vector to a zero constant; otherwise asks the generic
	/// demanded-elements simplifier to work on the node with every element
	/// demanded.
	static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI) {
	  EVT VT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);

	  // Shifting zeros in either direction still gives zeros.
	  if (ISD::isBuildVectorAllZeros(Src.getNode()))
	    return DAG.getConstant(0, SDLoc(N), VT);

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  APInt AllElts = APInt::getAllOnesValue(VT.getVectorNumElements());
	  APInt UndefElts, ZeroElts;
	  const bool Simplified = TLI.SimplifyDemandedVectorElts(
	      SDValue(N, 0), AllElts, UndefElts, ZeroElts, DCI);

	  // Returning N itself tells the combiner the node was updated in place.
	  return Simplified ? SDValue(N, 0) : SDValue();
	}

	// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
	// Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16
	// produce extra instructions between the conversions due to going to scalar
	// and back.
	//
	// Only fires when F16C is available, soft-float is off, and both the result
	// and the innermost source are f32. Returns an empty SDValue otherwise.
	static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
	                                 const X86Subtarget &Subtarget) {
	  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
	    return SDValue();

	  if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
	    return SDValue();

	  if (N->getValueType(0) != MVT::f32 ||
	      N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
	    return SDValue();

	  SDLoc dl(N);
	  // Move the scalar into lane 0 of a v4f32 so the packed converts can be used.
	  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
	                            N->getOperand(0).getOperand(0));
	  // NOTE(review): immediate 4 presumably selects the MXCSR rounding mode -
	  // confirm against the VCVTPS2PH instruction reference.
	  Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
	                    DAG.getTargetConstant(4, dl, MVT::i32));
	  Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
	  // The round-tripped scalar lives in element 0.
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Combine (STRICT_)FP_EXTEND from a vXf16 source to vXf32 / vXf64 into a
	/// CVTPH2PS-based sequence when F16C is usable. The f16 input is bitcast to
	/// vXi16, widened to 8 elements if needed, converted to f32, narrowed back if
	/// fewer than 4 elements were requested, and finally extended to the original
	/// result type. For strict nodes the chain is threaded through and returned
	/// alongside the value via merge_values.
	static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
	                                const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
	    return SDValue();

	  // Strict nodes carry the chain as operand 0, so the source is operand 1.
	  bool IsStrict = N->isStrictFPOpcode();
	  EVT VT = N->getValueType(0);
	  SDValue Src = N->getOperand(IsStrict ? 1 : 0);
	  EVT SrcVT = Src.getValueType();

	  if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
	    return SDValue();

	  if (VT.getVectorElementType() != MVT::f32 &&
	      VT.getVectorElementType() != MVT::f64)
	    return SDValue();

	  unsigned NumElts = VT.getVectorNumElements();
	  if (NumElts == 1 || !isPowerOf2_32(NumElts))
	    return SDValue();

	  SDLoc dl(N);

	  // Convert the input to vXi16.
	  EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
	  Src = DAG.getBitcast(IntVT, Src);

	  // Widen to at least 8 input elements.
	  if (NumElts < 8) {
	    unsigned NumConcats = 8 / NumElts;
	    // With 4 elements the padding is undef; with fewer it is zero.
	    SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
	                                : DAG.getConstant(0, dl, IntVT);
	    SmallVector<SDValue, 4> Ops(NumConcats, Fill);
	    Ops[0] = Src;
	    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
	  }

	  // Destination is vXf32 with at least 4 elements.
	  EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
	                               std::max(4U, NumElts));
	  SDValue Cvt, Chain;
	  if (IsStrict) {
	    Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
	                      {N->getOperand(0), Src});
	    Chain = Cvt.getValue(1);
	  } else {
	    Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
	  }

	  // Drop the padding lanes again when only two elements were requested.
	  if (NumElts < 4) {
	    assert(NumElts == 2 && "Unexpected size");
	    Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
	                      DAG.getIntPtrConstant(0, dl));
	  }

	  if (IsStrict) {
	    // Extend to the original VT if necessary.
	    if (Cvt.getValueType() != VT) {
	      Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
	                        {Chain, Cvt});
	      Chain = Cvt.getValue(1);
	    }
	    return DAG.getMergeValues({Cvt, Chain}, dl);
	  }

	  // Extend to the original VT if necessary.
	  return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
	}

	// Try to find a larger VBROADCAST_LOAD that we can extract from. Limit this to
	// cases where the loads have the same input chain and the output chains are
	// unused. This avoids any memory ordering issues.
	static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
	                                      TargetLowering::DAGCombinerInfo &DCI) {
	  // Only do this if the chain result is unused.
	  if (N->hasAnyUseOfValue(1))
	    return SDValue();

	  auto *ThisLoad = cast<MemIntrinsicSDNode>(N);
	  SDValue BasePtr = ThisLoad->getBasePtr();
	  SDValue InChain = ThisLoad->getChain();
	  EVT VT = N->getSimpleValueType(0);
	  EVT MemVT = ThisLoad->getMemoryVT();

	  // Look at other users of our base pointer and try to find a wider broadcast.
	  // The input chain and the size of the memory VT must match.
	  for (SDNode *Cand : BasePtr->uses()) {
	    if (Cand == N || Cand->getOpcode() != X86ISD::VBROADCAST_LOAD)
	      continue;

	    // Safe to cast only now that the opcode has been checked.
	    auto *Other = cast<MemIntrinsicSDNode>(Cand);
	    if (Other->getBasePtr() != BasePtr || Other->getChain() != InChain)
	      continue;
	    if (!(Other->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits()))
	      continue;
	    if (Cand->hasAnyUseOfValue(1))
	      continue;
	    if (!(Cand->getValueSizeInBits(0) > VT.getSizeInBits()))
	      continue;

	    // Reuse the wider broadcast: take its low subvector and retype it.
	    SDValue Narrow = extractSubVector(SDValue(Cand, 0), 0, DAG, SDLoc(N),
	                                      VT.getSizeInBits());
	    Narrow = DAG.getBitcast(VT, Narrow);
	    return DCI.CombineTo(N, Narrow, SDValue(Cand, 1));
	  }

	  return SDValue();
	}

	/// Combine FP_ROUND from vXf32 to vXf16 into CVTPS2PH when F16C is usable.
	/// The source is widened to v4f32 if it has fewer than 4 elements, converted
	/// to a vXi16 vector of at least 8 elements, trimmed back to the requested
	/// element count and bitcast to the f16 result type.
	static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);
	  EVT SrcVT = Src.getValueType();

	  if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
	      SrcVT.getVectorElementType() != MVT::f32)
	    return SDValue();

	  unsigned NumElts = VT.getVectorNumElements();
	  if (NumElts == 1 || !isPowerOf2_32(NumElts))
	    return SDValue();

	  SDLoc dl(N);

	  // Widen to at least 4 input elements (padding with zeros).
	  if (NumElts < 4)
	    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
	                      DAG.getConstantFP(0.0, dl, SrcVT));

	  // Destination is vXi16 with at least 8 elements.
	  EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
	                               std::max(8U, NumElts));
	  // NOTE(review): immediate 4 presumably selects the MXCSR rounding mode -
	  // confirm against the VCVTPS2PH instruction reference.
	  SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
	                            DAG.getTargetConstant(4, dl, MVT::i32));

	  // Extract down to real number of elements.
	  if (NumElts < 8) {
	    EVT IntVT = VT.changeVectorElementTypeToInteger();
	    Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
	                      DAG.getIntPtrConstant(0, dl));
	  }

	  return DAG.getBitcast(VT, Cvt);
	}

	/// Combine for X86ISD::MOVDQ2Q: when the operand is a one-use, simple, normal
	/// load, replace the pair with a direct x86mmx load from the same address.
	static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
	  SDValue Src = N->getOperand(0);

	  // Turn MOVDQ2Q+simple_load into an mmx load.
	  if (!ISD::isNormalLoad(Src.getNode()) || !Src.hasOneUse())
	    return SDValue();

	  auto *SrcLoad = cast<LoadSDNode>(Src.getNode());
	  if (!SrcLoad->isSimple())
	    return SDValue();

	  SDValue MMXLoad = DAG.getLoad(
	      MVT::x86mmx, SDLoc(N), SrcLoad->getChain(), SrcLoad->getBasePtr(),
	      SrcLoad->getPointerInfo(), SrcLoad->getOriginalAlign(),
	      SrcLoad->getMemOperand()->getFlags());
	  // Redirect users of the old load's chain to the new load's chain.
	  DAG.ReplaceAllUsesOfValueWith(SDValue(SrcLoad, 1), MMXLoad.getValue(1));
	  return MMXLoad;
	}

	/// Target hook that dispatches a node to its X86-specific combine routine.
	///
	/// The switch is keyed purely on N->getOpcode(); each case forwards to one
	/// combineXXX helper and returns whatever that helper returns. Opcodes with
	/// no case fall out of the switch and yield an empty SDValue.
	SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	SelectionDAG &DAG = DCI.DAG;
	switch (N->getOpcode()) {
	default: break;
	// Vector construction, extraction and insertion.
	case ISD::SCALAR_TO_VECTOR:
	return combineScalarToVector(N, DAG);
	case ISD::EXTRACT_VECTOR_ELT:
	case X86ISD::PEXTRW:
	case X86ISD::PEXTRB:
	return combineExtractVectorElt(N, DAG, DCI, Subtarget);
	case ISD::CONCAT_VECTORS:
	return combineConcatVectors(N, DAG, DCI, Subtarget);
	case ISD::INSERT_SUBVECTOR:
	return combineInsertSubvector(N, DAG, DCI, Subtarget);
	case ISD::EXTRACT_SUBVECTOR:
	return combineExtractSubvector(N, DAG, DCI, Subtarget);
	// Selects, bitcasts, and integer arithmetic / logic.
	case ISD::VSELECT:
	case ISD::SELECT:
	case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
	case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
	case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
	case X86ISD::CMP: return combineCMP(N, DAG);
	case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
	case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
	case X86ISD::ADD:
	case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
	case X86ISD::SBB: return combineSBB(N, DAG);
	case X86ISD::ADC: return combineADC(N, DAG, DCI);
	case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
	case ISD::SHL: return combineShiftLeft(N, DAG);
	case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
	case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
	case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
	case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
	case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
	case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
	// Loads and stores (plain and masked).
	case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
	case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
	case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
	case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
	case X86ISD::VEXTRACT_STORE:
	return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
	// Integer <-> floating-point conversions and FP arithmetic / logic.
	case ISD::SINT_TO_FP:
	case ISD::STRICT_SINT_TO_FP:
	return combineSIntToFP(N, DAG, DCI, Subtarget);
	case ISD::UINT_TO_FP:
	case ISD::STRICT_UINT_TO_FP:
	return combineUIntToFP(N, DAG, Subtarget);
	case ISD::FADD:
	case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
	case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
	case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
	case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
	case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
	case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
	case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
	case X86ISD::FXOR:
	case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
	case X86ISD::FMIN:
	case X86ISD::FMAX: return combineFMinFMax(N, DAG);
	case ISD::FMINNUM:
	case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
	case X86ISD::CVTSI2P:
	case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
	case X86ISD::CVTP2SI:
	case X86ISD::CVTP2UI:
	case X86ISD::STRICT_CVTTP2SI:
	case X86ISD::CVTTP2SI:
	case X86ISD::STRICT_CVTTP2UI:
	case X86ISD::CVTTP2UI:
	return combineCVTP2I_CVTTP2I(N, DAG, DCI);
	case X86ISD::STRICT_CVTPH2PS:
	case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
	case X86ISD::BT: return combineBT(N, DAG, DCI);
	// Integer extensions. ANY_EXTEND shares the zero-extend combine.
	case ISD::ANY_EXTEND:
	case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
	case ISD::ANY_EXTEND_VECTOR_INREG:
	case ISD::SIGN_EXTEND_VECTOR_INREG:
	case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
	Subtarget);
	// Comparisons and branches.
	case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
	case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
	case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
	// Vector packs, shifts and element inserts.
	case X86ISD::PACKSS:
	case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
	case X86ISD::VSHL:
	case X86ISD::VSRA:
	case X86ISD::VSRL:
	return combineVectorShiftVar(N, DAG, DCI, Subtarget);
	case X86ISD::VSHLI:
	case X86ISD::VSRAI:
	case X86ISD::VSRLI:
	return combineVectorShiftImm(N, DAG, DCI, Subtarget);
	case ISD::INSERT_VECTOR_ELT:
	case X86ISD::PINSRB:
	case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
	case X86ISD::SHUFP: // Handle all target specific shuffles
	case X86ISD::INSERTPS:
	case X86ISD::EXTRQI:
	case X86ISD::INSERTQI:
	case X86ISD::VALIGN:
	case X86ISD::PALIGNR:
	case X86ISD::VSHLDQ:
	case X86ISD::VSRLDQ:
	case X86ISD::BLENDI:
	case X86ISD::UNPCKH:
	case X86ISD::UNPCKL:
	case X86ISD::MOVHLPS:
	case X86ISD::MOVLHPS:
	case X86ISD::PSHUFB:
	case X86ISD::PSHUFD:
	case X86ISD::PSHUFHW:
	case X86ISD::PSHUFLW:
	case X86ISD::MOVSHDUP:
	case X86ISD::MOVSLDUP:
	case X86ISD::MOVDDUP:
	case X86ISD::MOVSS:
	case X86ISD::MOVSD:
	case X86ISD::VBROADCAST:
	case X86ISD::VPPERM:
	case X86ISD::VPERMI:
	case X86ISD::VPERMV:
	case X86ISD::VPERMV3:
	case X86ISD::VPERMIL2:
	case X86ISD::VPERMILPI:
	case X86ISD::VPERMILPV:
	case X86ISD::VPERM2X128:
	case X86ISD::SHUF128:
	case X86ISD::VZEXT_MOVL:
	case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
	// Fused multiply-add family.
	case X86ISD::FMADD_RND:
	case X86ISD::FMSUB:
	case X86ISD::STRICT_FMSUB:
	case X86ISD::FMSUB_RND:
	case X86ISD::FNMADD:
	case X86ISD::STRICT_FNMADD:
	case X86ISD::FNMADD_RND:
	case X86ISD::FNMSUB:
	case X86ISD::STRICT_FNMSUB:
	case X86ISD::FNMSUB_RND:
	case ISD::FMA:
	case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
	case X86ISD::FMADDSUB_RND:
	case X86ISD::FMSUBADD_RND:
	case X86ISD::FMADDSUB:
	case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
	// Remaining target nodes and half-precision conversions.
	case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
	case X86ISD::MGATHER:
	case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
	case ISD::MGATHER:
	case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
	case X86ISD::PCMPEQ:
	case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
	case X86ISD::PMULDQ:
	case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
	case X86ISD::KSHIFTL:
	case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
	case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
	case ISD::STRICT_FP_EXTEND:
	case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
	case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
	case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
	case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
	}

	// No X86-specific combine is registered for this opcode.
	return SDValue();
	}

	/// Return true if performing operation \p Opc in type \p VT is desirable on
	/// X86. Illegal types are never desirable; beyond that the function rejects
	/// vXi8 shifts-left, i8 multiply / shift-left, and a fixed list of i16
	/// operations.
	bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
	  if (!isTypeLegal(VT))
	    return false;

	  // There are no vXi8 shifts.
	  if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
	    return false;

	  // TODO: Almost no 8-bit ops are desirable because they have no actual
	  // size/speed advantages vs. 32-bit ops, but they do have a major
	  // potential disadvantage by causing partial register stalls.
	  //
	  // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
	  // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
	  // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
	  // check for a constant operand to the multiply.
	  if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
	    return false;

	  // i16 instruction encodings are longer and some i16 instructions are slow,
	  // so those are not desirable.
	  if (VT == MVT::i16) {
	    switch (Opc) {
	    default:
	      break;
	    case ISD::LOAD:
	    case ISD::SIGN_EXTEND:
	    case ISD::ZERO_EXTEND:
	    case ISD::ANY_EXTEND:
	    case ISD::SHL:
	    case ISD::SRA:
	    case ISD::SRL:
	    case ISD::SUB:
	    case ISD::ADD:
	    case ISD::MUL:
	    case ISD::AND:
	    case ISD::OR:
	    case ISD::XOR:
	      return false;
	    }
	  }

	  // Any legal type not explicitly accounted for above here is desirable.
	  return true;
	}

	/// Expand an indirect jump-table branch. When the module carries the
	/// "cf-protection-branch" flag, an X86ISD::NT_BRIND node is emitted so that
	/// instruction selection produces a jmp with the notrack prefix; otherwise
	/// the generic TargetLowering expansion is used.
	SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
	                                                  SDValue Value, SDValue Addr,
	                                                  SelectionDAG &DAG) const {
	  const Module *Mod = DAG.getMachineFunction().getMMI().getModule();

	  // No control-flow branch protection requested: nothing special to do.
	  if (!Mod->getModuleFlag("cf-protection-branch"))
	    return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);

	  // In case control-flow branch protection is enabled, we need to add
	  // notrack prefix to the indirect branch. To do that we create an NT_BRIND
	  // SDNode; upon ISEL the pattern converts it to jmp with NoTrack prefix.
	  return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
	}

	/// Decide whether \p Op should be promoted to a wider type. Only i16
	/// operations and i8 multiplies by a constant are considered. Promotion is
	/// refused when it would block folding a load into the operation or folding
	/// the operation into a read-modify-write store. On success \p PVT is set to
	/// MVT::i32 and true is returned.
	bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
	  EVT VT = Op.getValueType();
	  bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
	                             isa<ConstantSDNode>(Op.getOperand(1));

	  // i16 is legal, but undesirable since i16 instruction encodings are longer
	  // and some i16 instructions are slow.
	  // 8-bit multiply-by-constant can usually be expanded to something cheaper
	  // using LEA and/or other ALU ops.
	  if (VT != MVT::i16 && !Is8BitMulByConstant)
	    return false;

	  // True when Op's single user is a normal store to the same base pointer
	  // that Load reads from, i.e. a (store (op (load p)), p) pattern.
	  auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
	    if (!Op.hasOneUse())
	      return false;
	    SDNode *User = *Op->use_begin();
	    if (!ISD::isNormalStore(User))
	      return false;
	    auto *Ld = cast<LoadSDNode>(Load);
	    auto *St = cast<StoreSDNode>(User);
	    return Ld->getBasePtr() == St->getBasePtr();
	  };

	  // Same pattern as above, but for a one-use ATOMIC_LOAD feeding an
	  // ATOMIC_STORE to the same base pointer.
	  auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
	    if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
	      return false;
	    if (!Op.hasOneUse())
	      return false;
	    SDNode *User = *Op->use_begin();
	    if (User->getOpcode() != ISD::ATOMIC_STORE)
	      return false;
	    auto *Ld = cast<AtomicSDNode>(Load);
	    auto *St = cast<AtomicSDNode>(User);
	    return Ld->getBasePtr() == St->getBasePtr();
	  };

	  bool Commute = false;
	  switch (Op.getOpcode()) {
	  default: return false;
	  case ISD::SIGN_EXTEND:
	  case ISD::ZERO_EXTEND:
	  case ISD::ANY_EXTEND:
	    break;
	  case ISD::SHL:
	  case ISD::SRA:
	  case ISD::SRL: {
	    SDValue N0 = Op.getOperand(0);
	    // Look out for (store (shl (load), x)).
	    if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
	      return false;
	    break;
	  }
	  case ISD::ADD:
	  case ISD::MUL:
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	    Commute = true;
	    LLVM_FALLTHROUGH;
	  case ISD::SUB: {
	    SDValue N0 = Op.getOperand(0);
	    SDValue N1 = Op.getOperand(1);
	    // Avoid disabling potential load folding opportunities.
	    if (MayFoldLoad(N1) &&
	        (!Commute || !isa<ConstantSDNode>(N0) ||
	         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
	      return false;
	    if (MayFoldLoad(N0) &&
	        ((Commute && !isa<ConstantSDNode>(N1)) ||
	         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
	      return false;
	    if (IsFoldableAtomicRMW(N0, Op) ||
	        (Commute && IsFoldableAtomicRMW(N1, Op)))
	      return false;
	  }
	  }

	  PVT = MVT::i32;
	  return true;
	}

	//===----------------------------------------------------------------------===//
	// X86 Inline Assembly Support
	//===----------------------------------------------------------------------===//

	// Helper to match a string separated by whitespace.
	//
	// Returns true when S, after leading blanks are skipped, consists of exactly
	// the given Pieces in order. Every piece must be followed by at least one
	// space/tab or by the end of the string; anything left over after the last
	// piece makes the match fail.
	static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
	  const char *const Blanks = " \t";

	  // Skip leading whitespace.
	  S = S.substr(S.find_first_not_of(Blanks));

	  for (StringRef Piece : Pieces) {
	    // Check if the piece matches.
	    if (!S.startswith(Piece))
	      return false;

	    StringRef Rest = S.substr(Piece.size());
	    StringRef::size_type Skip = Rest.find_first_not_of(Blanks);
	    // A non-blank character right after the piece means we only matched a
	    // prefix of a longer token.
	    if (Skip == 0)
	      return false;

	    S = Rest.substr(Skip);
	  }

	  return S.empty();
	}

	static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

	if (AsmPieces.size() == 3 \|\| AsmPieces.size() == 4) {
	if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
	std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
	std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

	if (AsmPieces.size() == 3)
	return true;
	else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
	return true;
	}
	}
	return false;
	}

	/// Recognise a few hand-written byte-swap inline-asm idioms and replace the
	/// call with the llvm.bswap intrinsic. Only calls returning an integer whose
	/// width is a multiple of 16 are considered. Returns whatever
	/// IntrinsicLowering::LowerToByteSwap returns when an idiom matches, and
	/// false otherwise.
	bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
	  InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());

	  const std::string &AsmStr = IA->getAsmString();

	  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
	  if (!Ty || Ty->getBitWidth() % 16 != 0)
	    return false;

	  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
	  SmallVector<StringRef, 4> AsmPieces;
	  SplitString(AsmStr, AsmPieces, ";\n");

	  switch (AsmPieces.size()) {
	  default: return false;
	  case 1:
	    // FIXME: this should verify that we are targeting a 486 or better. If not,
	    // we will turn this bswap into something that will be lowered to logical
	    // ops instead of emitting the bswap asm. For now, we don't support 486 or
	    // lower so don't worry about this.
	    // bswap $0
	    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
	        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
	        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
	        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
	        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
	        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
	      // No need to check constraints, nothing other than the equivalent of
	      // "=r,0" would be valid here.
	      return IntrinsicLowering::LowerToByteSwap(CI);
	    }

	    // rorw $$8, ${0:w} --> llvm.bswap.i16
	    if (CI->getType()->isIntegerTy(16) &&
	        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
	        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
	         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
	      // Reuse AsmPieces to hold the clobbers that follow "=r,0,".
	      AsmPieces.clear();
	      StringRef ConstraintsStr = IA->getConstraintString();
	      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
	      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
	      if (clobbersFlagRegisters(AsmPieces))
	        return IntrinsicLowering::LowerToByteSwap(CI);
	    }
	    break;
	  case 3:
	    // rorw $$8, ${0:w} ; rorl $$16, $0 ; rorw $$8, ${0:w} on an i32.
	    if (CI->getType()->isIntegerTy(32) &&
	        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
	        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
	        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
	        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
	      // Reuse AsmPieces to hold the clobbers that follow "=r,0,".
	      AsmPieces.clear();
	      StringRef ConstraintsStr = IA->getConstraintString();
	      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
	      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
	      if (clobbersFlagRegisters(AsmPieces))
	        return IntrinsicLowering::LowerToByteSwap(CI);
	    }

	    if (CI->getType()->isIntegerTy(64)) {
	      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
	      if (Constraints.size() >= 2 &&
	          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
	          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
	        // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
	        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
	            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
	            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
	          return IntrinsicLowering::LowerToByteSwap(CI);
	      }
	    }
	    break;
	  }
	  return false;
	}

	/// Map a flag-output constraint string of the form "{@cc<cond>}" to the
	/// corresponding X86 condition code. Returns X86::COND_INVALID when the
	/// string is not one of the recognised spellings.
	static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
	  X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
	                           .Case("{@cca}", X86::COND_A)
	                           .Case("{@ccae}", X86::COND_AE)
	                           .Case("{@ccb}", X86::COND_B)
	                           .Case("{@ccbe}", X86::COND_BE)
	                           .Case("{@ccc}", X86::COND_B)
	                           .Case("{@cce}", X86::COND_E)
	                           .Case("{@ccz}", X86::COND_E)
	                           .Case("{@ccg}", X86::COND_G)
	                           .Case("{@ccge}", X86::COND_GE)
	                           .Case("{@ccl}", X86::COND_L)
	                           .Case("{@ccle}", X86::COND_LE)
	                           .Case("{@ccna}", X86::COND_BE)
	                           .Case("{@ccnae}", X86::COND_B)
	                           .Case("{@ccnb}", X86::COND_AE)
	                           .Case("{@ccnbe}", X86::COND_A)
	                           .Case("{@ccnc}", X86::COND_AE)
	                           .Case("{@ccne}", X86::COND_NE)
	                           .Case("{@ccnz}", X86::COND_NE)
	                           .Case("{@ccng}", X86::COND_LE)
	                           .Case("{@ccnge}", X86::COND_L)
	                           .Case("{@ccnl}", X86::COND_GE)
	                           .Case("{@ccnle}", X86::COND_G)
	                           .Case("{@ccno}", X86::COND_NO)
	                           // "np" is "not parity"; it must not alias "p".
	                           .Case("{@ccnp}", X86::COND_NP)
	                           .Case("{@ccns}", X86::COND_NS)
	                           .Case("{@cco}", X86::COND_O)
	                           .Case("{@ccp}", X86::COND_P)
	                           .Case("{@ccs}", X86::COND_S)
	                           .Default(X86::COND_INVALID);
	  return Cond;
	}

	/// Given a constraint letter, return the type of constraint for this target.
	///
	/// One-letter constraints and the two-letter 'Y?' family are classified
	/// here; longer strings are checked against the "{@cc<cond>}" flag-output
	/// spellings. Anything unrecognised is deferred to the generic
	/// TargetLowering implementation.
	X86TargetLowering::ConstraintType
	X86TargetLowering::getConstraintType(StringRef Constraint) const {
	  switch (Constraint.size()) {
	  case 1:
	    switch (Constraint[0]) {
	    case 'R':
	    case 'q':
	    case 'Q':
	    case 'f':
	    case 't':
	    case 'u':
	    case 'y':
	    case 'x':
	    case 'v':
	    case 'l':
	    case 'k': // AVX512 masking registers.
	      return C_RegisterClass;
	    case 'a':
	    case 'b':
	    case 'c':
	    case 'd':
	    case 'S':
	    case 'D':
	    case 'A':
	      return C_Register;
	    case 'I':
	    case 'J':
	    case 'K':
	    case 'N':
	    case 'G':
	    case 'L':
	    case 'M':
	      return C_Immediate;
	    case 'C':
	    case 'e':
	    case 'Z':
	      return C_Other;
	    default:
	      break;
	    }
	    break;

	  case 2:
	    // Only the 'Y'-prefixed two-letter constraints are handled here.
	    if (Constraint[0] == 'Y') {
	      switch (Constraint[1]) {
	      case 'z':
	        return C_Register;
	      case 'i':
	      case 'm':
	      case 'k':
	      case 't':
	      case '2':
	        return C_RegisterClass;
	      default:
	        break;
	      }
	    }
	    break;

	  default:
	    // Flag-output constraints such as "{@ccz}".
	    if (parseConstraintCode(Constraint) != X86::COND_INVALID)
	      return C_Other;
	    break;
	  }

	  return TargetLowering::getConstraintType(Constraint);
	}

	/// Examine constraint type and operand type and determine a weight value.
	/// This object must already have been set up with the operand type
	/// and the current alternative constraint selected.
	///
	/// Returns CW_Default when the operand has no value, otherwise a weight
	/// derived from the first constraint character (and the second one for the
	/// 'Y?' family), the operand's IR type and the subtarget features.
	TargetLowering::ConstraintWeight
	X86TargetLowering::getSingleConstraintMatchWeight(
	    AsmOperandInfo &info, const char *constraint) const {
	  ConstraintWeight weight = CW_Invalid;
	  Value *CallOperandVal = info.CallOperandVal;
	  // If we don't have a value, we can't do a match,
	  // but allow it at the lowest weight.
	  if (!CallOperandVal)
	    return CW_Default;
	  Type *type = CallOperandVal->getType();
	  // Look at the constraint type.
	  switch (*constraint) {
	  default:
	    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
	    // Deliberately falls into the integer-register check below, which may
	    // override the generic weight for integer operands.
	    LLVM_FALLTHROUGH;
	  case 'R':
	  case 'q':
	  case 'Q':
	  case 'a':
	  case 'b':
	  case 'c':
	  case 'd':
	  case 'S':
	  case 'D':
	  case 'A':
	    if (type->isIntegerTy())
	      weight = CW_SpecificReg;
	    break;
	  case 'f':
	  case 't':
	  case 'u':
	    if (type->isFloatingPointTy())
	      weight = CW_SpecificReg;
	    break;
	  case 'y':
	    if (type->isX86_MMXTy() && Subtarget.hasMMX())
	      weight = CW_SpecificReg;
	    break;
	  case 'Y':
	    if (StringRef(constraint).size() != 2)
	      break;
	    switch (constraint[1]) {
	    default:
	      return CW_Invalid;
	    // XMM0
	    case 'z':
	      if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
	          ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
	          ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
	        return CW_SpecificReg;
	      return CW_Invalid;
	    // Conditional OpMask regs (AVX512)
	    case 'k':
	      if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
	        return CW_Register;
	      return CW_Invalid;
	    // Any MMX reg
	    case 'm':
	      // NOTE(review): weight is still CW_Invalid here, so both paths return
	      // CW_Invalid - confirm whether a register weight was intended.
	      if (type->isX86_MMXTy() && Subtarget.hasMMX())
	        return weight;
	      return CW_Invalid;
	    // Any SSE reg when ISA >= SSE2, same as 'x'
	    case 'i':
	    case 't':
	    case '2':
	      if (!Subtarget.hasSSE2())
	        return CW_Invalid;
	      // NOTE(review): this leaves weight at CW_Invalid rather than sharing
	      // the 'x' logic below - confirm against the intent stated above.
	      break;
	    }
	    break;
	  case 'v':
	    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
	      weight = CW_Register;
	    LLVM_FALLTHROUGH;
	  case 'x':
	    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
	        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
	      weight = CW_Register;
	    break;
	  case 'k':
	    // Enable conditional vector operations using %k<#> registers.
	    if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
	      weight = CW_Register;
	    break;
	  case 'I':
	    // Constant in [0, 31].
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 31)
	        weight = CW_Constant;
	    }
	    break;
	  case 'J':
	    // Constant in [0, 63].
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 63)
	        weight = CW_Constant;
	    }
	    break;
	  case 'K':
	    // Signed 8-bit constant.
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
	        weight = CW_Constant;
	    }
	    break;
	  case 'L':
	    // Exactly 0xff or 0xffff.
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
	        weight = CW_Constant;
	    }
	    break;
	  case 'M':
	    // Constant in [0, 3].
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 3)
	        weight = CW_Constant;
	    }
	    break;
	  case 'N':
	    // Unsigned 8-bit constant.
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 0xff)
	        weight = CW_Constant;
	    }
	    break;
	  case 'G':
	  case 'C':
	    // Any floating-point constant.
	    if (isa<ConstantFP>(CallOperandVal)) {
	      weight = CW_Constant;
	    }
	    break;
	  case 'e':
	    // Signed 32-bit constant.
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if ((C->getSExtValue() >= -0x80000000LL) &&
	          (C->getSExtValue() <= 0x7fffffffLL))
	        weight = CW_Constant;
	    }
	    break;
	  case 'Z':
	    // Unsigned 32-bit constant.
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 0xffffffff)
	        weight = CW_Constant;
	    }
	    break;
	  }
	  return weight;
	}

	/// Try to replace an X constraint, which matches anything, with another that
	/// has more specific requirements based on the type of the corresponding
	/// operand.
	const char *X86TargetLowering::
	LowerXConstraint(EVT ConstraintVT) const {
	  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
	  // 'f' like normal targets.
	  const bool UseSSEReg =
	      ConstraintVT.isFloatingPoint() && Subtarget.hasSSE1();
	  return UseSSEReg ? "x" : TargetLowering::LowerXConstraint(ConstraintVT);
	}

	// Lower @cc targets via setcc.
	//
	// Returns an empty SDValue when the constraint is not a flag-output
	// constraint. Otherwise reads EFLAGS, materialises the requested condition
	// with getSETCC and zero-extends it to the constraint's value type. Aborts
	// via report_fatal_error if that type is a vector, non-integer, or narrower
	// than 8 bits. Flag (and Chain, when the copy is glued) are updated in place.
	SDValue X86TargetLowering::LowerAsmOutputForConstraint(
	    SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
	    SelectionDAG &DAG) const {
	  X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
	  if (Cond == X86::COND_INVALID)
	    return SDValue();
	  // Check that return type is valid.
	  if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
	      OpInfo.ConstraintVT.getSizeInBits() < 8)
	    report_fatal_error("Flag output operand is of invalid type");

	  // Get EFLAGS register. Only update chain when copyfrom is glued.
	  if (Flag.getNode()) {
	    Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
	    Chain = Flag.getValue(1);
	  } else
	    Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
	  // Extract CC code.
	  SDValue CC = getSETCC(Cond, Flag, DL, DAG);
	  // Zero-extend the setcc result to the constraint's value type.
	  SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);

	  return Result;
	}

	/// Lower the specified operand into the Ops vector.
	/// If it is invalid, don't add anything to Ops.
	///
	/// Only single-letter constraints are handled. For the immediate constraints
	/// the operand must be a constant within the range checked below; a matching
	/// constant is pushed as a target constant. A constraint letter with no case
	/// here, or an 'i' operand that is not a literal constant and passes the PIC
	/// checks, is forwarded to the generic TargetLowering implementation.
	void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
	                                                     std::string &Constraint,
	                                                     std::vector<SDValue>&Ops,
	                                                     SelectionDAG &DAG) const {
	  SDValue Result;

	  // Only support length 1 constraints for now.
	  if (Constraint.length() > 1) return;

	  char ConstraintLetter = Constraint[0];
	  switch (ConstraintLetter) {
	  default: break;
	  case 'I':
	    // Constant in [0, 31].
	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	      if (C->getZExtValue() <= 31) {
	        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	                                       Op.getValueType());
	        break;
	      }
	    }
	    return;
	  case 'J':
	    // Constant in [0, 63].
	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	      if (C->getZExtValue() <= 63) {
	        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	                                       Op.getValueType());
	        break;
	      }
	    }
	    return;
	  case 'K':
	    // Signed 8-bit constant.
	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	      if (isInt<8>(C->getSExtValue())) {
	        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	                                       Op.getValueType());
	        break;
	      }
	    }
	    return;
	  case 'L':
	    // 0xff, 0xffff, or (64-bit targets only) 0xffffffff.
	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
	          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
	        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
	                                       Op.getValueType());
	        break;
	      }
	    }
	    return;
	  case 'M':
	    // Constant in [0, 3].
	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	      if (C->getZExtValue() <= 3) {
	        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	                                       Op.getValueType());
	        break;
	      }
	    }
	    return;
	  case 'N':
	    // Constant in [0, 255].
	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	      if (C->getZExtValue() <= 255) {
	        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	                                       Op.getValueType());
	        break;
	      }
	    }
	    return;
	  case 'O':
	    // Constant in [0, 127].
	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	      if (C->getZExtValue() <= 127) {
	        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	                                       Op.getValueType());
	        break;
	      }
	    }
	    return;
	  case 'e': {
	    // 32-bit signed value
	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
	                                           C->getSExtValue())) {
	        // Widen to 64 bits here to get it sign extended.
	        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
	        break;
	      }
	      // FIXME gcc accepts some relocatable values here too, but only in certain
	      // memory models; it's complicated.
	    }
	    return;
	  }
	  case 'Z': {
	    // 32-bit unsigned value
	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
	                                           C->getZExtValue())) {
	        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	                                       Op.getValueType());
	        break;
	      }
	    }
	    // FIXME gcc accepts some relocatable values here too, but only in certain
	    // memory models; it's complicated.
	    return;
	  }
	  case 'i': {
	    // Literal immediates are always ok.
	    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
	      // i1 constants are extended according to the target's boolean
	      // contents; everything else is sign-extended to 64 bits.
	      bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
	      BooleanContent BCont = getBooleanContents(MVT::i64);
	      ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
	                                    : ISD::SIGN_EXTEND;
	      int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
	                                                  : CST->getSExtValue();
	      Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
	      break;
	    }

	    // In any sort of PIC mode addresses need to be computed at runtime by
	    // adding in a register or some sort of table lookup. These can't
	    // be used as immediates.
	    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
	      return;

	    // If we are in non-pic codegen mode, we allow the address of a global (with
	    // an optional displacement) to be used with 'i'.
	    if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
	      // If we require an extra load to get this address, as in PIC mode, we
	      // can't accept it.
	      if (isGlobalStubReference(
	              Subtarget.classifyGlobalReference(GA->getGlobal())))
	        return;
	    break;
	  }
	  }

	  if (Result.getNode()) {
	    Ops.push_back(Result);
	    return;
	  }
	  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	/// Check if \p RC is a general purpose register class.
	/// I.e., GR* or one of their variants: the result is true when
	/// RC.hasSuperClassEq() holds for GR8, GR16, GR32, GR64 or
	/// LOW32_ADDR_ACCESS_RBP.
	static bool isGRClass(const TargetRegisterClass &RC) {
	  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
	         RC.hasSuperClassEq(&X86::GR16RegClass) ||
	         RC.hasSuperClassEq(&X86::GR32RegClass) ||
	         RC.hasSuperClassEq(&X86::GR64RegClass) ||
	         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
	}

	/// Check if \p RC is a vector register class.
	/// I.e., FR* / VR* or one of their variants: the result is true when
	/// RC.hasSuperClassEq() holds for FR32X, FR64X, VR128X, VR256X or VR512.
	static bool isFRClass(const TargetRegisterClass &RC) {
	  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
	         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
	         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
	         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
	         RC.hasSuperClassEq(&X86::VR512RegClass);
	}

	/// Check if \p RC is a mask register class.
	/// I.e., VK* or one of their variants: the result is true when
	/// RC.hasSuperClassEq() holds for any of VK1, VK2, VK4, VK8, VK16, VK32 or
	/// VK64.
	static bool isVKClass(const TargetRegisterClass &RC) {
	  return RC.hasSuperClassEq(&X86::VK1RegClass) ||
	         RC.hasSuperClassEq(&X86::VK2RegClass) ||
	         RC.hasSuperClassEq(&X86::VK4RegClass) ||
	         RC.hasSuperClassEq(&X86::VK8RegClass) ||
	         RC.hasSuperClassEq(&X86::VK16RegClass) ||
	         RC.hasSuperClassEq(&X86::VK32RegClass) ||
	         RC.hasSuperClassEq(&X86::VK64RegClass);
	}

	std::pair<unsigned, const TargetRegisterClass *>
	X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	StringRef Constraint,
	MVT VT) const {
	// First, see if this is a constraint that directly corresponds to an LLVM
	// register class.
	if (Constraint.size() == 1) {
	// GCC Constraint Letters
	switch (Constraint[0]) {
	default: break;
	// 'A' means [ER]AX + [ER]DX.
	case 'A':
	if (Subtarget.is64Bit())
	return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
	assert((Subtarget.is32Bit() \|\| Subtarget.is16Bit()) &&
	"Expecting 64, 32 or 16 bit subtarget");
	return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);

	// TODO: Slight differences here in allocation order and leaving
	// RIP in the class. Do they matter any more here than they do
	// in the normal allocation?
	case 'k':
	if (Subtarget.hasAVX512()) {
	if (VT == MVT::i1)
	return std::make_pair(0U, &X86::VK1RegClass);
	if (VT == MVT::i8)
	return std::make_pair(0U, &X86::VK8RegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::VK16RegClass);
	}
	if (Subtarget.hasBWI()) {
	if (VT == MVT::i32)
	return std::make_pair(0U, &X86::VK32RegClass);
	if (VT == MVT::i64)
	return std::make_pair(0U, &X86::VK64RegClass);
	}
	break;
	case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
	if (Subtarget.is64Bit()) {
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8RegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16RegClass);
	if (VT == MVT::i32 \|\| VT == MVT::f32)
	return std::make_pair(0U, &X86::GR32RegClass);
	if (VT != MVT::f80)
	return std::make_pair(0U, &X86::GR64RegClass);
	break;
	}
	LLVM_FALLTHROUGH;
	// 32-bit fallthrough
	case 'Q': // Q_REGS
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16_ABCDRegClass);
	if (VT == MVT::i32 \|\| VT == MVT::f32 \|\| !Subtarget.is64Bit())
	return std::make_pair(0U, &X86::GR32_ABCDRegClass);
	if (VT != MVT::f80)
	return std::make_pair(0U, &X86::GR64_ABCDRegClass);
	break;
	case 'r': // GENERAL_REGS
	case 'l': // INDEX_REGS
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8RegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16RegClass);
	if (VT == MVT::i32 \|\| VT == MVT::f32 \|\| !Subtarget.is64Bit())
	return std::make_pair(0U, &X86::GR32RegClass);
	if (VT != MVT::f80)
	return std::make_pair(0U, &X86::GR64RegClass);
	break;
	case 'R': // LEGACY_REGS
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8_NOREXRegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16_NOREXRegClass);
	if (VT == MVT::i32 \|\| VT == MVT::f32 \|\| !Subtarget.is64Bit())
	return std::make_pair(0U, &X86::GR32_NOREXRegClass);
	if (VT != MVT::f80)
	return std::make_pair(0U, &X86::GR64_NOREXRegClass);
	break;
	case 'f': // FP Stack registers.
	// If SSE is enabled for this VT, use f80 to ensure the isel moves the
	// value to the correct fpstack register class.
	if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
	return std::make_pair(0U, &X86::RFP32RegClass);
	if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
	return std::make_pair(0U, &X86::RFP64RegClass);
	if (VT == MVT::f32 \|\| VT == MVT::f64 \|\| VT == MVT::f80)
	return std::make_pair(0U, &X86::RFP80RegClass);
	break;
	case 'y': // MMX_REGS if MMX allowed.
	if (!Subtarget.hasMMX()) break;
	return std::make_pair(0U, &X86::VR64RegClass);
	case 'v':
	case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
	if (!Subtarget.hasSSE1()) break;
	bool VConstraint = (Constraint[0] == 'v');

	switch (VT.SimpleTy) {
	default: break;
	// Scalar SSE types.
	case MVT::f32:
	case MVT::i32:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::FR32XRegClass);
	return std::make_pair(0U, &X86::FR32RegClass);
	case MVT::f64:
	case MVT::i64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::FR64XRegClass);
	return std::make_pair(0U, &X86::FR64RegClass);
	case MVT::i128:
	if (Subtarget.is64Bit()) {
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::VR128XRegClass);
	return std::make_pair(0U, &X86::VR128RegClass);
	}
	break;
	// Vector types and fp128.
	case MVT::f128:
	case MVT::v16i8:
	case MVT::v8i16:
	case MVT::v4i32:
	case MVT::v2i64:
	case MVT::v4f32:
	case MVT::v2f64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::VR128XRegClass);
	return std::make_pair(0U, &X86::VR128RegClass);
	// AVX types.
	case MVT::v32i8:
	case MVT::v16i16:
	case MVT::v8i32:
	case MVT::v4i64:
	case MVT::v8f32:
	case MVT::v4f64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::VR256XRegClass);
	if (Subtarget.hasAVX())
	return std::make_pair(0U, &X86::VR256RegClass);
	break;
	case MVT::v64i8:
	case MVT::v32i16:
	case MVT::v8f64:
	case MVT::v16f32:
	case MVT::v16i32:
	case MVT::v8i64:
	if (!Subtarget.hasAVX512()) break;
	if (VConstraint)
	return std::make_pair(0U, &X86::VR512RegClass);
	return std::make_pair(0U, &X86::VR512_0_15RegClass);
	}
	break;
	}
	} else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
	switch (Constraint[1]) {
	default:
	break;
	case 'i':
	case 't':
	case '2':
	return getRegForInlineAsmConstraint(TRI, "x", VT);
	case 'm':
	if (!Subtarget.hasMMX()) break;
	return std::make_pair(0U, &X86::VR64RegClass);
	case 'z':
	if (!Subtarget.hasSSE1()) break;
	switch (VT.SimpleTy) {
	default: break;
	// Scalar SSE types.
	case MVT::f32:
	case MVT::i32:
	return std::make_pair(X86::XMM0, &X86::FR32RegClass);
	case MVT::f64:
	case MVT::i64:
	return std::make_pair(X86::XMM0, &X86::FR64RegClass);
	case MVT::f128:
	case MVT::v16i8:
	case MVT::v8i16:
	case MVT::v4i32:
	case MVT::v2i64:
	case MVT::v4f32:
	case MVT::v2f64:
	return std::make_pair(X86::XMM0, &X86::VR128RegClass);
	// AVX types.
	case MVT::v32i8:
	case MVT::v16i16:
	case MVT::v8i32:
	case MVT::v4i64:
	case MVT::v8f32:
	case MVT::v4f64:
	if (Subtarget.hasAVX())
	return std::make_pair(X86::YMM0, &X86::VR256RegClass);
	break;
	case MVT::v64i8:
	case MVT::v32i16:
	case MVT::v8f64:
	case MVT::v16f32:
	case MVT::v16i32:
	case MVT::v8i64:
	if (Subtarget.hasAVX512())
	return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
	break;
	}
	break;
	case 'k':
	// This register class doesn't allocate k0 for masked vector operation.
	if (Subtarget.hasAVX512()) {
	if (VT == MVT::i1)
	return std::make_pair(0U, &X86::VK1WMRegClass);
	if (VT == MVT::i8)
	return std::make_pair(0U, &X86::VK8WMRegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::VK16WMRegClass);
	}
	if (Subtarget.hasBWI()) {
	if (VT == MVT::i32)
	return std::make_pair(0U, &X86::VK32WMRegClass);
	if (VT == MVT::i64)
	return std::make_pair(0U, &X86::VK64WMRegClass);
	}
	break;
	}
	}

	if (parseConstraintCode(Constraint) != X86::COND_INVALID)
	return std::make_pair(0U, &X86::GR32RegClass);

	// Use the default implementation in TargetLowering to convert the register
	// constraint into a member of a register class.
	std::pair<Register, const TargetRegisterClass*> Res;
	Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

	// Not found as a standard register?
	if (!Res.second) {
	// Map st(0) -> st(7) -> ST0
	if (Constraint.size() == 7 && Constraint[0] == '{' &&
	tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
	Constraint[3] == '(' &&
	(Constraint[4] >= '0' && Constraint[4] <= '7') &&
	Constraint[5] == ')' && Constraint[6] == '}') {
	// st(7) is not allocatable and thus not a member of RFP80. Return
	// singleton class in cases where we have a reference to it.
	if (Constraint[4] == '7')
	return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
	return std::make_pair(X86::FP0 + Constraint[4] - '0',
	&X86::RFP80RegClass);
	}

	// GCC allows "st(0)" to be called just plain "st".
	if (StringRef("{st}").equals_lower(Constraint))
	return std::make_pair(X86::FP0, &X86::RFP80RegClass);

	// flags -> EFLAGS
	if (StringRef("{flags}").equals_lower(Constraint))
	return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);

	// dirflag -> DF
	if (StringRef("{dirflag}").equals_lower(Constraint))
	return std::make_pair(X86::DF, &X86::DFCCRRegClass);

	// fpsr -> FPSW
	if (StringRef("{fpsr}").equals_lower(Constraint))
	return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);

	return Res;
	}

	// Make sure it isn't a register that requires 64-bit mode.
	if (!Subtarget.is64Bit() &&
	(isFRClass(Res.second) \|\| isGRClass(Res.second)) &&
	TRI->getEncodingValue(Res.first) >= 8) {
	// Register requires REX prefix, but we're in 32-bit mode.
	return std::make_pair(0, nullptr);
	}

	// Make sure it isn't a register that requires AVX512.
	if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
	TRI->getEncodingValue(Res.first) & 0x10) {
	// Register requires EVEX prefix.
	return std::make_pair(0, nullptr);
	}

	// Otherwise, check to see if this is a register class of the wrong value
	// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
	// turn into {ax},{dx}.
	// MVT::Other is used to specify clobber names.
	if (TRI->isTypeLegalForClass(*Res.second, VT) \|\| VT == MVT::Other)
	return Res; // Correct type already, nothing to do.

	// Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
	// return "eax". This should even work for things like getting 64bit integer
	// registers when given an f64 type.
	const TargetRegisterClass *Class = Res.second;
	// The generic code will match the first register class that contains the
	// given register. Thus, based on the ordering of the tablegened file,
	// the "plain" GR classes might not come first.
	// Therefore, use a helper method.
	if (isGRClass(*Class)) {
	unsigned Size = VT.getSizeInBits();
	if (Size == 1) Size = 8;
	Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
	if (DestReg > 0) {
	bool is64Bit = Subtarget.is64Bit();
	const TargetRegisterClass *RC =
	Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
	: Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
	: Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
	: Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
	: nullptr;
	if (Size == 64 && !is64Bit) {
	// Model GCC's behavior here and select a fixed pair of 32-bit
	// registers.
	switch (DestReg) {
	case X86::RAX:
	return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
	case X86::RDX:
	return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
	case X86::RCX:
	return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
	case X86::RBX:
	return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
	case X86::RSI:
	return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
	case X86::RDI:
	return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
	case X86::RBP:
	return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
	default:
	return std::make_pair(0, nullptr);
	}
	}
	if (RC && RC->contains(DestReg))
	return std::make_pair(DestReg, RC);
	return Res;
	}
	// No register found/type mismatch.
	return std::make_pair(0, nullptr);
	} else if (isFRClass(*Class)) {
	// Handle references to XMM physical registers that got mapped into the
	// wrong class. This can happen with constraints like {xmm0} where the
	// target independent register mapper will just pick the first match it can
	// find, ignoring the required type.

	// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
	if (VT == MVT::f32 \|\| VT == MVT::i32)
	Res.second = &X86::FR32XRegClass;
	else if (VT == MVT::f64 \|\| VT == MVT::i64)
	Res.second = &X86::FR64XRegClass;
	else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
	Res.second = &X86::VR128XRegClass;
	else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
	Res.second = &X86::VR256XRegClass;
	else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
	Res.second = &X86::VR512RegClass;
	else {
	// Type mismatch and not a clobber: Return an error;
	Res.first = 0;
	Res.second = nullptr;
	}
	} else if (isVKClass(*Class)) {
	if (VT == MVT::i1)
	Res.second = &X86::VK1RegClass;
	else if (VT == MVT::i8)
	Res.second = &X86::VK8RegClass;
	else if (VT == MVT::i16)
	Res.second = &X86::VK16RegClass;
	else if (VT == MVT::i32)
	Res.second = &X86::VK32RegClass;
	else if (VT == MVT::i64)
	Res.second = &X86::VK64RegClass;
	else {
	// Type mismatch and not a clobber: Return an error;
	Res.first = 0;
	Res.second = nullptr;
	}
	}

	return Res;
	}

	/// Report the extra cost of using the addressing mode \p AM for an access of
	/// type \p Ty in address space \p AS.
	///
	/// \returns -1 when isLegalAddressingMode() rejects the mode, otherwise 1 if
	/// the mode has a non-zero scale and 0 if it does not.
	int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
	                                            const AddrMode &AM, Type *Ty,
	                                            unsigned AS) const {
	  // An illegal addressing mode has no meaningful cost.
	  if (!isLegalAddressingMode(DL, AM, Ty, AS))
	    return -1;

	  // Scaling factors are not free at all.
	  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
	  // will take 2 allocations in the out of order engine instead of 1
	  // for plain addressing mode, i.e. inst (reg1).
	  // E.g.,
	  // vaddps (%rsi,%rdx), %ymm0, %ymm1
	  // Requires two allocations (one for the load, one for the computation)
	  // whereas:
	  // vaddps (%rsi), %ymm0, %ymm1
	  // Requires just 1 allocation, i.e., freeing allocations for other operations
	  // and having less micro operations to execute.
	  //
	  // For some X86 architectures, this is even worse because for instance for
	  // stores, the complex addressing mode forces the instruction to use the
	  // "load" ports instead of the dedicated "store" port.
	  // E.g., on Haswell:
	  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
	  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
	  //
	  // Scale represents reg2 * scale, thus account for 1 as soon as we use a
	  // second register.
	  const bool UsesScaledReg = AM.Scale != 0;
	  return UsesScaledReg ? 1 : 0;
	}

	/// Decide whether an integer division of type \p VT should be kept as a
	/// division instruction.
	///
	/// \returns true only for non-vector types in functions whose attribute list
	/// \p Attr carries MinSize.
	bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
	  // Integer division on x86 is expensive. However, when aggressively
	  // optimizing for code size, we prefer to use a div instruction, as it is
	  // usually smaller than the alternative sequence.
	  const bool MinSizeRequested = Attr.hasFnAttribute(Attribute::MinSize);
	  if (!MinSizeRequested)
	    return false;

	  // The exception to this is vector division. Since x86 doesn't have vector
	  // integer division, leaving the division as-is is a loss even in terms of
	  // size, because it will have to be scalarized, while the alternative code
	  // sequence can be performed in vector form.
	  return !VT.isVector();
	}

	/// Mark the function containing \p Entry as using split callee-saved
	/// registers. Does nothing unless the subtarget is 64-bit.
	void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
	  if (Subtarget.is64Bit()) {
	    // Update IsSplitCSR in X86MachineFunctionInfo.
	    MachineFunction *ParentFn = Entry->getParent();
	    ParentFn->getInfo<X86MachineFunctionInfo>()->setIsSplitCSR(true);
	  }
	}

	/// Preserve the callee-saved registers that are handled via copies.
	///
	/// For every register in the zero-terminated list returned by
	/// getCalleeSavedRegsViaCopy(), a COPY into a fresh virtual register is
	/// inserted at the start of \p Entry, and a COPY back into the physical
	/// register is inserted before the first terminator of each block in
	/// \p Exits. Nothing is done when that list is null.
	void X86TargetLowering::insertCopiesSplitCSR(
	    MachineBasicBlock *Entry,
	    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
	  if (!IStart)
	    return;

	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
	  MachineBasicBlock::iterator MBBI = Entry->begin();
	  // Walk the register list until its terminating zero entry.
	  for (const MCPhysReg *I = IStart; *I; ++I) {
	    const TargetRegisterClass *RC = nullptr;
	    if (X86::GR64RegClass.contains(*I))
	      RC = &X86::GR64RegClass;
	    else
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

	    Register NewVR = MRI->createVirtualRegister(RC);
	    // Create copy from CSR to a virtual register.
	    // FIXME: this currently does not emit CFI pseudo-instructions, it works
	    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
	    // nounwind. If we want to generalize this later, we may need to emit
	    // CFI pseudo-instructions.
	    assert(
	        Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
	        "Function should be nounwind in insertCopiesSplitCSR!");
	    Entry->addLiveIn(*I);
	    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
	        .addReg(*I);

	    // Insert the copy-back instructions right before the terminator.
	    for (auto *Exit : Exits)
	      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
	              TII->get(TargetOpcode::COPY), *I)
	          .addReg(NewVR);
	  }
	}

	/// Returns true exactly when the subtarget is 64-bit.
	bool X86TargetLowering::supportSwiftError() const {
	  const bool Is64Bit = Subtarget.is64Bit();
	  return Is64Bit;
	}

	/// Returns true if stack probing through a function call is requested, i.e.
	/// when getStackProbeSymbolName() yields a non-empty symbol for \p MF.
	bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
	  const StringRef ProbeSymbol = getStackProbeSymbolName(MF);
	  return !ProbeSymbol.empty();
	}

	/// Returns true if stack probing through inline assembly is requested.
	///
	/// This is never the case on Windows or for functions carrying the
	/// "no-stack-arg-probe" attribute. Otherwise the answer is true only when the
	/// function has a "probe-stack" attribute whose value is "inline-asm".
	bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {

	  // No inline stack probe for Windows, they have their own mechanism.
	  if (Subtarget.isOSWindows() ||
	      MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
	    return false;

	  // If the function specifically requests inline stack probes, emit them.
	  if (MF.getFunction().hasFnAttribute("probe-stack"))
	    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
	           "inline-asm";

	  return false;
	}

	/// Returns the name of the symbol used to emit stack probes or the empty
	/// string if not applicable.
	///
	/// Inline stack probes take precedence and yield the empty string. An explicit
	/// "probe-stack" attribute supplies the symbol name verbatim. Otherwise a
	/// symbol is only chosen for Windows, non-MachO targets without the
	/// "no-stack-arg-probe" attribute, based on bitness and whether the target is
	/// Cygwin/MinGW.
	StringRef
	X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
	  // Inline Stack probes disable stack probe call
	  if (hasInlineStackProbe(MF))
	    return "";

	  // If the function specifically requests stack probes, emit them.
	  if (MF.getFunction().hasFnAttribute("probe-stack"))
	    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();

	  // Generally, if we aren't on Windows, the platform ABI does not include
	  // support for stack probes, so don't emit them.
	  if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
	      MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
	    return "";

	  // We need a stack probe to conform to the Windows ABI. Choose the right
	  // symbol.
	  if (Subtarget.is64Bit())
	    return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
	  return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
	}

	/// Returns the stack probe size for \p MF: the integer value of the
	/// function's "stack-probe-size" attribute when present, starting from a
	/// default of 4096.
	unsigned
	X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
	  // The default stack probe size is 4096 if the function has no stackprobesize
	  // attribute.
	  unsigned ProbeSize = 4096;
	  const Function &F = MF.getFunction();
	  if (!F.hasFnAttribute("stack-probe-size"))
	    return ProbeSize;

	  // The parse result is deliberately not inspected, as in the original code.
	  StringRef SizeText = F.getFnAttribute("stack-probe-size").getValueAsString();
	  SizeText.getAsInteger(0, ProbeSize);
	  return ProbeSize;
	}
	diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp
	index d9fb820f7cb5..9524d9a36204 100644
	--- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp
	+++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp
	@@ -1,3202 +1,3200 @@
	//===- GlobalOpt.cpp - Optimize Global Variables --------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass transforms simple global variables that never have their address
	// taken. If obviously true, it marks read/write globals as constant, deletes
	// variables only stored to, etc.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Transforms/IPO/GlobalOpt.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/ADT/iterator_range.h"
	#include "llvm/Analysis/BlockFrequencyInfo.h"
	#include "llvm/Analysis/ConstantFolding.h"
	#include "llvm/Analysis/MemoryBuiltins.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/BinaryFormat/Dwarf.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugInfoMetadata.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GetElementPtrTypeIterator.h"
	#include "llvm/IR/GlobalAlias.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Operator.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Use.h"
	#include "llvm/IR/User.h"
	#include "llvm/IR/Value.h"
	#include "llvm/IR/ValueHandle.h"
	#include "llvm/InitializePasses.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/AtomicOrdering.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Transforms/IPO.h"
	#include "llvm/Transforms/Utils/CtorUtils.h"
	#include "llvm/Transforms/Utils/Evaluator.h"
	#include "llvm/Transforms/Utils/GlobalStatus.h"
	#include "llvm/Transforms/Utils/Local.h"
	#include <cassert>
	#include <cstdint>
	#include <utility>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "globalopt"

	// Counters reported by this pass (DEBUG_TYPE "globalopt").
	STATISTIC(NumMarked, "Number of globals marked constant");
	STATISTIC(NumUnnamed, "Number of globals marked unnamed_addr");
	STATISTIC(NumSRA, "Number of aggregate globals broken into scalars");
	STATISTIC(NumHeapSRA, "Number of heap objects SRA'd");
	STATISTIC(NumSubstitute,
	          "Number of globals with initializers stored into them");
	STATISTIC(NumDeleted, "Number of globals deleted");
	STATISTIC(NumGlobUses, "Number of global uses devirtualized");
	STATISTIC(NumLocalized, "Number of globals localized");
	STATISTIC(NumShrunkToBool, "Number of global vars shrunk to booleans");
	STATISTIC(NumFastCallFns, "Number of functions converted to fastcc");
	STATISTIC(NumCtorsEvaluated, "Number of static ctors evaluated");
	STATISTIC(NumNestRemoved, "Number of nest attributes removed");
	STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
	STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
	STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
	STATISTIC(NumInternalFunc, "Number of internal functions");
	STATISTIC(NumColdCC, "Number of functions marked coldcc");

	// Hidden command-line flag "enable-coldcc-stress-test"; off by default.
	static cl::opt<bool> EnableColdCCStressTest(
	    "enable-coldcc-stress-test", cl::Hidden, cl::init(false),
	    cl::desc("Enable stress test of coldcc by adding "
	             "calling conv to all internal functions."));

	// Hidden command-line option "coldcc-rel-freq"; defaults to 2. See the
	// description string for its meaning. The trailing space after "enabling"
	// keeps the concatenated help text from reading "enablingcoldcc".
	static cl::opt<int> ColdCCRelFreq(
	    "coldcc-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
	    cl::desc(
	        "Maximum block frequency, expressed as a percentage of caller's "
	        "entry frequency, for a call site to be considered cold for enabling "
	        "coldcc"));

	/// Is this global variable possibly used by a leak checker as a root? If so,
	/// we might not really want to eliminate the stores to it.
	///
	/// \returns false for globals with private linkage. Otherwise true when the
	/// value type contains a pointer, a vector of pointers or an opaque struct, or
	/// when the type walk exhausts its iteration budget; false when the walk
	/// finishes without finding any of those.
	static bool isLeakCheckerRoot(GlobalVariable *GV) {
	  // A global variable is a root if it is a pointer, or could plausibly contain
	  // a pointer. There are two challenges; one is that we could have a struct
	  // that has an inner member which is a pointer. We recurse through the type
	  // to detect these (up to a point). The other is that we may actually be a
	  // union of a pointer and another type, and so our LLVM type is an integer
	  // which gets converted into a pointer, or our type is an [i8 x #] with a
	  // pointer potentially contained here.

	  if (GV->hasPrivateLinkage())
	    return false;

	  // Worklist of types still to be inspected.
	  SmallVector<Type *, 4> Types;
	  Types.push_back(GV->getValueType());

	  // Bound the number of types examined; answer conservatively when exceeded.
	  unsigned Limit = 20;
	  do {
	    Type *Ty = Types.pop_back_val();
	    switch (Ty->getTypeID()) {
	      default: break;
	      case Type::PointerTyID:
	        return true;
	      case Type::FixedVectorTyID:
	      case Type::ScalableVectorTyID:
	        if (cast<VectorType>(Ty)->getElementType()->isPointerTy())
	          return true;
	        break;
	      case Type::ArrayTyID:
	        Types.push_back(cast<ArrayType>(Ty)->getElementType());
	        break;
	      case Type::StructTyID: {
	        StructType *STy = cast<StructType>(Ty);
	        if (STy->isOpaque()) return true;
	        for (StructType::element_iterator I = STy->element_begin(),
	                 E = STy->element_end(); I != E; ++I) {
	          Type *InnerTy = *I;
	          if (isa<PointerType>(InnerTy)) return true;
	          // Aggregates and vectors are queued for a later iteration.
	          if (isa<StructType>(InnerTy) || isa<ArrayType>(InnerTy) ||
	              isa<VectorType>(InnerTy))
	            Types.push_back(InnerTy);
	        }
	        break;
	      }
	    }
	    if (--Limit == 0) return true;
	  } while (!Types.empty());
	  return false;
	}

	/// Given a value that is stored to a global but never read, determine whether
	/// it's safe to remove the store and the chain of computation that feeds the
	/// store.
	///
	/// The chain is followed through operand 0 of each instruction. It is accepted
	/// when it ends in a constant or in a call recognised by isAllocationFn(), and
	/// rejected as soon as a link has more than one use, is a load, invoke,
	/// argument or global value, may have side effects, is a GEP with a
	/// non-constant index, or is a non-GEP instruction with more than one operand.
	static bool IsSafeComputationToRemove(
	    Value *V, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
	  do {
	    if (isa<Constant>(V))
	      return true;
	    if (!V->hasOneUse())
	      return false;
	    if (isa<LoadInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V) ||
	        isa<GlobalValue>(V))
	      return false;
	    if (isAllocationFn(V, GetTLI))
	      return true;

	    Instruction *I = cast<Instruction>(V);
	    if (I->mayHaveSideEffects())
	      return false;
	    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
	      if (!GEP->hasAllConstantIndices())
	        return false;
	    } else if (I->getNumOperands() != 1) {
	      return false;
	    }

	    // Step to the value this instruction was computed from.
	    V = I->getOperand(0);
	  } while (true);
	}

	/// This GV is a pointer root. Loop over all users of the global and clean up
	/// any that obviously don't assign the global a value that isn't dynamically
	/// allocated.
	///
	/// Stores, memsets and memory transfers of constant data into the global are
	/// erased immediately. Writes whose value comes from a single-use instruction
	/// are collected and erased afterwards, together with the computation feeding
	/// them, when IsSafeComputationToRemove() approves.
	///
	/// \returns true if anything was erased or destroyed.
	static bool
	CleanupPointerRootUsers(GlobalVariable *GV,
	                        function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
	  // A brief explanation of leak checkers. The goal is to find bugs where
	  // pointers are forgotten, causing an accumulating growth in memory
	  // usage over time. The common strategy for leak checkers is to explicitly
	  // allow the memory pointed to by globals at exit. This is popular because it
	  // also solves another problem where the main thread of a C++ program may shut
	  // down before other threads that are still expecting to use those globals. To
	  // handle that case, we expect the program may create a singleton and never
	  // destroy it.

	  bool Changed = false;

	  // If Dead[n].first is the only use of a malloc result, we can delete its
	  // chain of computation and the store to the global in Dead[n].second.
	  SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead;

	  // Constants can't be pointers to dynamically allocated memory.
	  for (Value::user_iterator UI = GV->user_begin(), E = GV->user_end();
	       UI != E;) {
	    // Advance before touching U so that erasing U does not disturb UI.
	    User *U = *UI++;
	    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
	      Value *V = SI->getValueOperand();
	      if (isa<Constant>(V)) {
	        Changed = true;
	        SI->eraseFromParent();
	      } else if (Instruction *I = dyn_cast<Instruction>(V)) {
	        if (I->hasOneUse())
	          Dead.push_back(std::make_pair(I, SI));
	      }
	    } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(U)) {
	      if (isa<Constant>(MSI->getValue())) {
	        Changed = true;
	        MSI->eraseFromParent();
	      } else if (Instruction *I = dyn_cast<Instruction>(MSI->getValue())) {
	        if (I->hasOneUse())
	          Dead.push_back(std::make_pair(I, MSI));
	      }
	    } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U)) {
	      GlobalVariable *MemSrc = dyn_cast<GlobalVariable>(MTI->getSource());
	      if (MemSrc && MemSrc->isConstant()) {
	        Changed = true;
	        MTI->eraseFromParent();
	      } else if (Instruction *I = dyn_cast<Instruction>(MTI->getSource())) {
	        // Inspect the transfer's source itself: MemSrc is null whenever the
	        // source is not a global variable, and a GlobalVariable can never be
	        // an Instruction, so casting MemSrc here could not succeed.
	        if (I->hasOneUse())
	          Dead.push_back(std::make_pair(I, MTI));
	      }
	    } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
	      if (CE->use_empty()) {
	        CE->destroyConstant();
	        Changed = true;
	      }
	    } else if (Constant *C = dyn_cast<Constant>(U)) {
	      if (isSafeToDestroyConstant(C)) {
	        C->destroyConstant();
	        // This could have invalidated UI, start over from scratch.
	        Dead.clear();
	        CleanupPointerRootUsers(GV, GetTLI);
	        return true;
	      }
	    }
	  }

	  for (int i = 0, e = Dead.size(); i != e; ++i) {
	    if (IsSafeComputationToRemove(Dead[i].first, GetTLI)) {
	      Dead[i].second->eraseFromParent();
	      Instruction *I = Dead[i].first;
	      // Erase the feeding chain, stopping at the allocation call or at the
	      // first link whose operand 0 is not an instruction.
	      do {
	        if (isAllocationFn(I, GetTLI))
	          break;
	        Instruction *J = dyn_cast<Instruction>(I->getOperand(0));
	        if (!J)
	          break;
	        I->eraseFromParent();
	        I = J;
	      } while (true);
	      I->eraseFromParent();
	      // Instructions were removed, so the IR has been modified.
	      Changed = true;
	    }
	  }

	  return Changed;
	}

	/// We just marked GV constant. Loop over all users of the global, cleaning up
	/// the obvious ones. This is largely just a quick scan over the use list to
	/// clean up the easy and obvious cruft. This returns true if it made a change.
	static bool CleanupConstantGlobalUsers(
	Value V, Constant Init, const DataLayout &DL,
	function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
	bool Changed = false;
	// Note that we need to use a weak value handle for the worklist items. When
	// we delete a constant array, we may also be holding pointer to one of its
	// elements (or an element of one of its elements if we're dealing with an
	// array of arrays) in the worklist.
	SmallVector<WeakTrackingVH, 8> WorkList(V->user_begin(), V->user_end());
	while (!WorkList.empty()) {
	Value *UV = WorkList.pop_back_val();
	if (!UV)
	continue;

	User *U = cast<User>(UV);

	if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
	if (Init) {
	// Replace the load with the initializer.
	LI->replaceAllUsesWith(Init);
	LI->eraseFromParent();
	Changed = true;
	}
	} else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
	// Store must be unreachable or storing Init into the global.
	SI->eraseFromParent();
	Changed = true;
	} else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
	if (CE->getOpcode() == Instruction::GetElementPtr) {
	Constant *SubInit = nullptr;
	if (Init)
	SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
	Changed \|= CleanupConstantGlobalUsers(CE, SubInit, DL, GetTLI);
	} else if ((CE->getOpcode() == Instruction::BitCast &&
	CE->getType()->isPointerTy()) \|\|
	CE->getOpcode() == Instruction::AddrSpaceCast) {
	// Pointer cast, delete any stores and memsets to the global.
	Changed \|= CleanupConstantGlobalUsers(CE, nullptr, DL, GetTLI);
	}

	if (CE->use_empty()) {
	CE->destroyConstant();
	Changed = true;
	}
	} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
	// Do not transform "gepinst (gep constexpr (GV))" here, because forming
	// "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold
	// and will invalidate our notion of what Init is.
	Constant *SubInit = nullptr;
	if (!isa<ConstantExpr>(GEP->getOperand(0))) {
	ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(
	ConstantFoldInstruction(GEP, DL, &GetTLI(*GEP->getFunction())));
	if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
	SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);

	// If the initializer is an all-null value and we have an inbounds GEP,
	// we already know what the result of any load from that GEP is.
	// TODO: Handle splats.
	if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds())
	SubInit = Constant::getNullValue(GEP->getResultElementType());
	}
	Changed \|= CleanupConstantGlobalUsers(GEP, SubInit, DL, GetTLI);

	if (GEP->use_empty()) {
	GEP->eraseFromParent();
	Changed = true;
	}
	} else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv
	if (MI->getRawDest() == V) {
	MI->eraseFromParent();
	Changed = true;
	}

	} else if (Constant *C = dyn_cast<Constant>(U)) {
	// If we have a chain of dead constantexprs or other things dangling from
	// us, and if they are all dead, nuke them without remorse.
	if (isSafeToDestroyConstant(C)) {
	C->destroyConstant();
	CleanupConstantGlobalUsers(V, Init, DL, GetTLI);
	return true;
	}
	}
	}
	return Changed;
	}

	static bool isSafeSROAElementUse(Value *V);

	/// Return true if the specified GEP is a safe user of a derived
	/// expression from a global that we want to SROA.
	///
	/// \param U A GEP instruction or GEP constant expression. It is accepted only
	///          when it has at least three operands, its first index is a null
	///          constant, every non-struct index is a ConstantInt that is within
	///          bounds of a bounded sequential type, and all of its users pass
	///          isSafeSROAElementUse.
	static bool isSafeSROAGEP(User *U) {
	  // Check to see if this ConstantExpr GEP is SRA'able. In particular, we
	  // don't like < 3 operand CE's, and we don't like non-constant integer
	  // indices. This enforces that all uses are 'gep GV, 0, C, ...' for some
	  // value of C.
	  if (U->getNumOperands() < 3 || !isa<Constant>(U->getOperand(1)) ||
	      !cast<Constant>(U->getOperand(1))->isNullValue())
	    return false;

	  gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U);
	  ++GEPI; // Skip over the pointer index.

	  // For all other level we require that the indices are constant and inrange.
	  // In particular, consider: A[0][i]. We cannot know that the user isn't doing
	  // invalid things like allowing i to index an out-of-range subscript that
	  // accesses A[1]. This can also happen between different members of a struct
	  // in llvm IR.
	  for (; GEPI != E; ++GEPI) {
	    if (GEPI.isStruct())
	      continue;

	    ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand());
	    if (!IdxVal || (GEPI.isBoundedSequential() &&
	                    IdxVal->getZExtValue() >= GEPI.getSequentialNumElements()))
	      return false;
	  }

	  return llvm::all_of(U->users(),
	                      [](User *UU) { return isSafeSROAElementUse(UU); });
	}

	/// Decide whether \p V, a user of an expression derived from a global we want
	/// to SROA, is harmless. Accepted users are: constants that are safe to
	/// destroy, loads, stores (see the note below), and GEP instructions that
	/// themselves satisfy isSafeSROAGEP.
	static bool isSafeSROAElementUse(Value *V) {
	  // A constant user is fine only if it is dead and can simply be destroyed.
	  if (auto *DeadC = dyn_cast<Constant>(V))
	    return isSafeToDestroyConstant(DeadC);

	  // Anything that is neither a constant nor an instruction is rejected.
	  auto *Inst = dyn_cast<Instruction>(V);
	  if (!Inst)
	    return false;

	  // Reading through the pointer is always acceptable.
	  if (isa<LoadInst>(Inst))
	    return true;

	  // Stores to the pointer are ok.
	  // NOTE(review): V is the store instruction itself here, so this compares
	  // the stored value against the store and looks like it is always true.
	  // Presumably the intent was to reject stores *of* the derived pointer;
	  // confirm against the callers before relying on it.
	  if (auto *Store = dyn_cast<StoreInst>(Inst))
	    return Store->getOperand(0) != V;

	  // The only remaining acceptable user is a GEP whose own users are safe.
	  if (!isa<GetElementPtrInst>(Inst))
	    return false;
	  return isSafeSROAGEP(Inst);
	}

	/// Look at all uses of the global and decide whether it is safe for us to
	/// perform this transformation.
	///
	/// \returns true only if every user of \p GV is a GEP instruction or a GEP
	///          constant expression that passes isSafeSROAGEP.
	static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
	  for (User *U : GV->users()) {
	    // The user of the global must be a GEP Inst or a ConstantExpr GEP.
	    if (!isa<GetElementPtrInst>(U) &&
	        (!isa<ConstantExpr>(U) ||
	         cast<ConstantExpr>(U)->getOpcode() != Instruction::GetElementPtr))
	      return false;

	    // Check the gep and its users are safe to SRA.
	    if (!isSafeSROAGEP(U))
	      return false;
	  }

	  return true;
	}

	/// Return true if \p T is an array type or a vector type.
	static bool IsSRASequential(Type *T) {
	  return isa<ArrayType>(T) || isa<VectorType>(T);
	}
	/// Return the element count of \p T. \p T must be an ArrayType or a
	/// FixedVectorType; the cast<> below rejects anything else.
	static uint64_t GetSRASequentialNumElements(Type *T) {
	  auto *ArrTy = dyn_cast<ArrayType>(T);
	  return ArrTy ? ArrTy->getNumElements()
	               : cast<FixedVectorType>(T)->getNumElements();
	}
	/// Return the element type of \p T. \p T must be an ArrayType or a
	/// VectorType; the cast<> below rejects anything else.
	static Type *GetSRASequentialElementType(Type *T) {
	  if (ArrayType *AT = dyn_cast<ArrayType>(T))
	    return AT->getElementType();
	  return cast<VectorType>(T)->getElementType();
	}
	/// Return true if \p GV can be split into per-element globals: its
	/// initializer must be a struct, array or vector, a sequential type with more
	/// than 16 elements must not also have 16 or more uses, and every user must be
	/// accepted by GlobalUsersSafeToSRA.
	static bool CanDoGlobalSRA(GlobalVariable *GV) {
	  Type *InitTy = GV->getInitializer()->getType();

	  // Structs need no extra check; everything else must be sequential.
	  if (!isa<StructType>(InitTy)) {
	    if (!IsSRASequential(InitTy))
	      return false;

	    // It's not worth it for big, heavily used arrays/vectors.
	    if (GetSRASequentialNumElements(InitTy) > 16 && GV->hasNUsesOrMore(16))
	      return false;
	  }

	  return GlobalUsersSafeToSRA(GV);
	}

	/// Copy over the debug info for a variable to its SRA replacements.
	///
	/// For every debug-info expression attached to GV, attach an expression for
	/// the same variable to NGV. A fragment expression covering
	/// [FragmentOffsetInBits, FragmentOffsetInBits + FragmentSizeInBits) is used
	/// when the fragment is smaller than the variable.
	///
	/// This block is shown as a diff: "-" lines are the removed version (variable
	/// size read from the debug info), "+" lines are the added version (variable
	/// size passed in by the caller as VarSize).
	///
	/// NOTE(review): GV and NGV are dereferenced with "->" below, so the parameter
	/// declarations on the next line appear to have lost their '*' in extraction.
	static void transferSRADebugInfo(GlobalVariable GV, GlobalVariable NGV,
	uint64_t FragmentOffsetInBits,
	- uint64_t FragmentSizeInBits) {
	+ uint64_t FragmentSizeInBits,
	+ uint64_t VarSize) {
	SmallVector<DIGlobalVariableExpression *, 1> GVs;
	GV->getDebugInfo(GVs);
	for (auto *GVE : GVs) {
	DIVariable *Var = GVE->getVariable();
	- Optional<uint64_t> VarSize = Var->getSizeInBits();
	-
	DIExpression *Expr = GVE->getExpression();
	// If the FragmentSize is smaller than the variable,
	// emit a fragment expression.
	- // If the variable size is unknown a fragment must be
	- // emitted to be safe.
	- if (!VarSize \|\| FragmentSizeInBits < *VarSize) {
	+ if (FragmentSizeInBits < VarSize) {
	if (auto E = DIExpression::createFragmentExpression(
	Expr, FragmentOffsetInBits, FragmentSizeInBits))
	Expr = *E;
	else
	// No fragment expression could be created: stop, leaving any remaining
	// expressions untransferred.
	return;
	}
	auto *NGVE = DIGlobalVariableExpression::get(GVE->getContext(), Var, Expr);
	NGV->addDebugInfo(NGVE);
	}
	}

	/// Perform scalar replacement of aggregates on the specified global variable.
	/// This opens the door for other optimizations by exposing the behavior of the
	/// program in a more fine-grained way. We have determined that this
	/// transformation is safe already. We return the first global variable we
	/// insert so that the caller can reprocess it.
	///
	/// Returns nullptr when CanDoGlobalSRA rejects GV or when no replacement
	/// global was created; otherwise GV is erased and the replacement with the
	/// smallest element index is returned.
	///
	/// This block is shown as a diff: "+" lines add the VarSize computation and
	/// pass it to transferSRADebugInfo, "-" lines are the calls they replace.
	///
	/// NOTE(review): GV is dereferenced with "->" throughout and nullptr is
	/// returned, so the return type and first parameter on the next line appear
	/// to have lost their '*' in extraction.
	static GlobalVariable SRAGlobal(GlobalVariable GV, const DataLayout &DL) {
	// Make sure this global only has simple uses that we can SRA.
	if (!CanDoGlobalSRA(GV))
	return nullptr;

	assert(GV->hasLocalLinkage());
	Constant *Init = GV->getInitializer();
	Type *Ty = Init->getType();
	+ uint64_t VarSize = DL.getTypeSizeInBits(Ty);

	// Replacement globals, keyed by the element index they stand for.
	std::map<unsigned, GlobalVariable *> NewGlobals;

	// Get the alignment of the global, either explicit or target-specific.
	Align StartAlignment =
	DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getType());

	// Loop over all users and create replacement variables for used aggregate
	// elements.
	for (User *GEP : GV->users()) {
	assert(((isa<ConstantExpr>(GEP) && cast<ConstantExpr>(GEP)->getOpcode() ==
	Instruction::GetElementPtr) \|\|
	isa<GetElementPtrInst>(GEP)) &&
	"NonGEP CE's are not SRAable!");

	// Ignore the 1st operand, which has to be zero or else the program is quite
	// broken (undefined). Get the 2nd operand, which is the structure or array
	// index.
	unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
	if (NewGlobals.count(ElementIdx) == 1)
	continue; // we've already created replacement variable
	assert(NewGlobals.count(ElementIdx) == 0);

	Type *ElTy = nullptr;
	if (StructType *STy = dyn_cast<StructType>(Ty))
	ElTy = STy->getElementType(ElementIdx);
	else
	ElTy = GetSRASequentialElementType(Ty);
	assert(ElTy);

	Constant *In = Init->getAggregateElement(ElementIdx);
	assert(In && "Couldn't get element of initializer?");

	// The replacement is created detached from any module; it is added to the
	// module's global list further down, once all replacements exist.
	GlobalVariable *NGV = new GlobalVariable(
	ElTy, false, GlobalVariable::InternalLinkage, In,
	GV->getName() + "." + Twine(ElementIdx), GV->getThreadLocalMode(),
	GV->getType()->getAddressSpace());
	NGV->setExternallyInitialized(GV->isExternallyInitialized());
	NGV->copyAttributesFrom(GV);
	NewGlobals.insert(std::make_pair(ElementIdx, NGV));

	if (StructType *STy = dyn_cast<StructType>(Ty)) {
	const StructLayout &Layout = *DL.getStructLayout(STy);

	// Calculate the known alignment of the field. If the original aggregate
	// had 256 byte alignment for example, something might depend on that:
	// propagate info to each field.
	uint64_t FieldOffset = Layout.getElementOffset(ElementIdx);
	Align NewAlign = commonAlignment(StartAlignment, FieldOffset);
	if (NewAlign > DL.getABITypeAlign(STy->getElementType(ElementIdx)))
	NGV->setAlignment(NewAlign);

	// Copy over the debug info for the variable.
	uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType());
	uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(ElementIdx);
	- transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size);
	+ transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, VarSize);
	} else {
	uint64_t EltSize = DL.getTypeAllocSize(ElTy);
	Align EltAlign = DL.getABITypeAlign(ElTy);
	uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy);

	// Calculate the known alignment of the field. If the original aggregate
	// had 256 byte alignment for example, something might depend on that:
	// propagate info to each field.
	Align NewAlign = commonAlignment(StartAlignment, EltSize * ElementIdx);
	if (NewAlign > EltAlign)
	NGV->setAlignment(NewAlign);
	// Element ElementIdx starts ElementIdx alloc-sized elements into the
	// aggregate, which gives the fragment offset passed here.
	transferSRADebugInfo(GV, NGV, FragmentSizeInBits * ElementIdx,
	- FragmentSizeInBits);
	+ FragmentSizeInBits, VarSize);
	}
	}

	if (NewGlobals.empty())
	return nullptr;

	// Add the replacements to the module, in increasing element-index order.
	Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
	for (auto NewGlobalVar : NewGlobals)
	Globals.push_back(NewGlobalVar.second);

	LLVM_DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n");

	Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext()));

	// Loop over all of the uses of the global, replacing the constantexpr geps,
	// with smaller constantexpr geps or direct references.
	while (!GV->use_empty()) {
	User *GEP = GV->user_back();
	assert(((isa<ConstantExpr>(GEP) &&
	cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)\|\|
	isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!");

	// Ignore the 1st operand, which has to be zero or else the program is quite
	// broken (undefined). Get the 2nd operand, which is the structure or array
	// index.
	unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
	assert(NewGlobals.count(ElementIdx) == 1);

	Value *NewPtr = NewGlobals[ElementIdx];
	Type *NewTy = NewGlobals[ElementIdx]->getValueType();

	// Form a shorter GEP if needed.
	if (GEP->getNumOperands() > 3) {
	if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEP)) {
	// New index list: a leading zero, then the original indices after the
	// element index (operands 3 and up).
	SmallVector<Constant*, 8> Idxs;
	Idxs.push_back(NullInt);
	for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i)
	Idxs.push_back(CE->getOperand(i));
	NewPtr =
	ConstantExpr::getGetElementPtr(NewTy, cast<Constant>(NewPtr), Idxs);
	} else {
	GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP);
	SmallVector<Value*, 8> Idxs;
	Idxs.push_back(NullInt);
	for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
	Idxs.push_back(GEPI->getOperand(i));
	NewPtr = GetElementPtrInst::Create(
	NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(ElementIdx),
	GEPI);
	}
	}
	GEP->replaceAllUsesWith(NewPtr);

	// Removing the old GEP drops its use of GV, which is what lets the
	// enclosing while loop make progress.
	if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(GEP))
	GEPI->eraseFromParent();
	else
	cast<ConstantExpr>(GEP)->destroyConstant();
	}

	// Delete the old global, now that it is dead.
	Globals.erase(GV);
	++NumSRA;

	assert(NewGlobals.size() > 0);
	return NewGlobals.begin()->second;
	}

	/// Return true if every user of \p V would trap were \p V dynamically null.
	///
	/// Trapping users are: loads, stores *through* \p V, calls/invokes whose
	/// callee is \p V, and bitcasts, GEPs and PHIs whose own users all trap.
	/// \p PHIs records the phi nodes already visited so that each one is only
	/// examined once.
	static bool AllUsesOfValueWillTrapIfNull(const Value *V,
	                                         SmallPtrSetImpl<const PHINode*> &PHIs) {
	  for (const User *U : V->users()) {
	    // If null pointer is considered valid, then all uses are non-trapping.
	    // Non address-space 0 globals have already been pruned by the caller.
	    if (const auto *UserInst = dyn_cast<Instruction>(U))
	      if (NullPointerIsDefined(UserInst->getFunction()))
	        return false;

	    // A load through the pointer will trap.
	    if (isa<LoadInst>(U))
	      continue;

	    // Storing through the pointer traps; storing the pointer itself does not.
	    if (const auto *Store = dyn_cast<StoreInst>(U)) {
	      if (Store->getOperand(0) == V)
	        return false;
	      continue;
	    }

	    // Calling through the pointer traps; any other call use does not.
	    if (const auto *Call = dyn_cast<CallInst>(U)) {
	      if (Call->getCalledOperand() != V)
	        return false;
	      continue;
	    }
	    if (const auto *Invoke = dyn_cast<InvokeInst>(U)) {
	      if (Invoke->getCalledOperand() != V)
	        return false;
	      continue;
	    }

	    // Casts and address computations are judged by their own users.
	    if (isa<BitCastInst>(U) || isa<GetElementPtrInst>(U)) {
	      if (!AllUsesOfValueWillTrapIfNull(U, PHIs))
	        return false;
	      continue;
	    }

	    // A phi seen before has already been checked; skip it.
	    if (const auto *Phi = dyn_cast<PHINode>(U)) {
	      if (PHIs.insert(Phi).second && !AllUsesOfValueWillTrapIfNull(Phi, PHIs))
	        return false;
	      continue;
	    }

	    // Any other kind of user is treated as non-trapping.
	    return false;
	  }
	  return true;
	}

	/// Return true if all uses of any loads from GV will trap if the loaded value
	/// is null. Note that this also permits comparisons of the loaded value
	/// against null, as a special case.
	static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) {
	for (const User *U : GV->users())
	if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
	SmallPtrSet<const PHINode*, 8> PHIs;
	if (!AllUsesOfValueWillTrapIfNull(LI, PHIs))
	return false;
	} else if (isa<StoreInst>(U)) {
	// Ignore stores to the global.
	} else {
	// We don't know or understand this user, bail out.
	//cerr << "UNKNOWN USER OF GLOBAL!: " << *U;
	return false;
	}
	return true;
	}

	static bool OptimizeAwayTrappingUsesOfValue(Value V, Constant NewV) {
	bool Changed = false;
	for (auto UI = V->user_begin(), E = V->user_end(); UI != E; ) {
	Instruction I = cast<Instruction>(UI++);
	// Uses are non-trapping if null pointer is considered valid.
	// Non address-space 0 globals are already pruned by the caller.
	if (NullPointerIsDefined(I->getFunction()))
	return false;
	if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
	LI->setOperand(0, NewV);
	Changed = true;
	} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
	if (SI->getOperand(1) == V) {
	SI->setOperand(1, NewV);
	Changed = true;
	}
	} else if (isa<CallInst>(I) \|\| isa<InvokeInst>(I)) {
	CallBase *CB = cast<CallBase>(I);
	if (CB->getCalledOperand() == V) {
	// Calling through the pointer! Turn into a direct call, but be careful
	// that the pointer is not also being passed as an argument.
	CB->setCalledOperand(NewV);
	Changed = true;
	bool PassedAsArg = false;
	for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
	if (CB->getArgOperand(i) == V) {
	PassedAsArg = true;
	CB->setArgOperand(i, NewV);
	}

	if (PassedAsArg) {
	// Being passed as an argument also. Be careful to not invalidate UI!
	UI = V->user_begin();
	}
	}
	} else if (CastInst *CI = dyn_cast<CastInst>(I)) {
	Changed \|= OptimizeAwayTrappingUsesOfValue(CI,
	ConstantExpr::getCast(CI->getOpcode(),
	NewV, CI->getType()));
	if (CI->use_empty()) {
	Changed = true;
	CI->eraseFromParent();
	}
	} else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
	// Should handle GEP here.
	SmallVector<Constant*, 8> Idxs;
	Idxs.reserve(GEPI->getNumOperands()-1);
	for (User::op_iterator i = GEPI->op_begin() + 1, e = GEPI->op_end();
	i != e; ++i)
	if (Constant C = dyn_cast<Constant>(i))
	Idxs.push_back(C);
	else
	break;
	if (Idxs.size() == GEPI->getNumOperands()-1)
	Changed \|= OptimizeAwayTrappingUsesOfValue(
	GEPI, ConstantExpr::getGetElementPtr(GEPI->getSourceElementType(),
	NewV, Idxs));
	if (GEPI->use_empty()) {
	Changed = true;
	GEPI->eraseFromParent();
	}
	}
	}

	return Changed;
	}

	/// The specified global has only one non-null value stored into it. If there
	/// are uses of the loaded value that would trap if the loaded value is
	/// dynamically null, then we know that they cannot be reachable with a null
	/// optimize away the load.
	static bool OptimizeAwayTrappingUsesOfLoads(
	GlobalVariable GV, Constant LV, const DataLayout &DL,
	function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
	bool Changed = false;

	// Keep track of whether we are able to remove all the uses of the global
	// other than the store that defines it.
	bool AllNonStoreUsesGone = true;

	// Replace all uses of loads with uses of uses of the stored value.
	for (Value::user_iterator GUI = GV->user_begin(), E = GV->user_end(); GUI != E;){
	User GlobalUser = GUI++;
	if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) {
	Changed \|= OptimizeAwayTrappingUsesOfValue(LI, LV);
	// If we were able to delete all uses of the loads
	if (LI->use_empty()) {
	LI->eraseFromParent();
	Changed = true;
	} else {
	AllNonStoreUsesGone = false;
	}
	} else if (isa<StoreInst>(GlobalUser)) {
	// Ignore the store that stores "LV" to the global.
	assert(GlobalUser->getOperand(1) == GV &&
	"Must be storing to the global");
	} else {
	AllNonStoreUsesGone = false;

	// If we get here we could have other crazy uses that are transitively
	// loaded.
	assert((isa<PHINode>(GlobalUser) \|\| isa<SelectInst>(GlobalUser) \|\|
	isa<ConstantExpr>(GlobalUser) \|\| isa<CmpInst>(GlobalUser) \|\|
	isa<BitCastInst>(GlobalUser) \|\|
	isa<GetElementPtrInst>(GlobalUser)) &&
	"Only expect load and stores!");
	}
	}

	if (Changed) {
	LLVM_DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV
	<< "\n");
	++NumGlobUses;
	}

	// If we nuked all of the loads, then none of the stores are needed either,
	// nor is the global.
	if (AllNonStoreUsesGone) {
	if (isLeakCheckerRoot(GV)) {
	Changed \|= CleanupPointerRootUsers(GV, GetTLI);
	} else {
	Changed = true;
	CleanupConstantGlobalUsers(GV, nullptr, DL, GetTLI);
	}
	if (GV->use_empty()) {
	LLVM_DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n");
	Changed = true;
	GV->eraseFromParent();
	++NumDeleted;
	}
	}
	return Changed;
	}

	/// Walk the use list of V, constant folding all of the instructions that are
	/// foldable. Each instruction user that ConstantFoldInstruction can fold has
	/// its uses replaced by the folded constant and is erased if that leaves it
	/// trivially dead.
	static void ConstantPropUsersOf(Value *V, const DataLayout &DL,
	                                TargetLibraryInfo *TLI) {
	  for (Value::user_iterator UI = V->user_begin(), E = V->user_end(); UI != E; )
	    if (Instruction *I = dyn_cast<Instruction>(*UI++))
	      if (Constant *NewC = ConstantFoldInstruction(I, DL, TLI)) {
	        I->replaceAllUsesWith(NewC);

	        // Advance UI to the next non-I use to avoid invalidating it!
	        // Instructions could multiply use V.
	        while (UI != E && *UI == I)
	          ++UI;
	        if (isInstructionTriviallyDead(I, TLI))
	          I->eraseFromParent();
	      }
	}

	/// This function takes the specified global variable, and transforms the
	/// program as if it always contained the result of the specified malloc.
	/// Because it is always the result of the specified malloc, there is no reason
	/// to actually DO the malloc. Instead, turn the malloc into a global, and any
	/// loads of GV as uses of the new global.
	///
	/// \param GV        Global holding the malloc result; erased by this call.
	///                  Every user of it must be a load or a store.
	/// \param CI        The malloc call; erased by this call.
	/// \param AllocTy   Element type of the allocation.
	/// \param NElements Number of elements allocated; when it is not 1 the new
	///                  global is an array of that many \p AllocTy.
	/// \param DL, TLI   Forwarded to ConstantPropUsersOf.
	/// \returns the new global that stands in for the allocated memory.
	static GlobalVariable *
	OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
	                              ConstantInt *NElements, const DataLayout &DL,
	                              TargetLibraryInfo *TLI) {
	  LLVM_DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI
	                    << '\n');

	  Type *GlobalType;
	  if (NElements->getZExtValue() == 1)
	    GlobalType = AllocTy;
	  else
	    // If we have an array allocation, the global variable is of an array.
	    GlobalType = ArrayType::get(AllocTy, NElements->getZExtValue());

	  // Create the new global variable. The contents of the malloc'd memory is
	  // undefined, so initialize with an undef value.
	  GlobalVariable *NewGV = new GlobalVariable(
	      *GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage,
	      UndefValue::get(GlobalType), GV->getName() + ".body", nullptr,
	      GV->getThreadLocalMode());

	  // If there are bitcast users of the malloc (which is typical, usually we have
	  // a malloc + bitcast) then replace them with uses of the new global. Update
	  // other users to use the global as well.
	  BitCastInst *TheBC = nullptr;
	  while (!CI->use_empty()) {
	    Instruction *User = cast<Instruction>(CI->user_back());
	    if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
	      if (BCI->getType() == NewGV->getType()) {
	        BCI->replaceAllUsesWith(NewGV);
	        BCI->eraseFromParent();
	      } else {
	        BCI->setOperand(0, NewGV);
	      }
	    } else {
	      // All non-bitcast users share one cast of the new global, created
	      // lazily right before the malloc call.
	      if (!TheBC)
	        TheBC = new BitCastInst(NewGV, CI->getType(), "newgv", CI);
	      User->replaceUsesOfWith(CI, TheBC);
	    }
	  }

	  Constant *RepValue = NewGV;
	  if (NewGV->getType() != GV->getValueType())
	    RepValue = ConstantExpr::getBitCast(RepValue, GV->getValueType());

	  // If there is a comparison against null, we will insert a global bool to
	  // keep track of whether the global was initialized yet or not.
	  GlobalVariable *InitBool =
	    new GlobalVariable(Type::getInt1Ty(GV->getContext()), false,
	                       GlobalValue::InternalLinkage,
	                       ConstantInt::getFalse(GV->getContext()),
	                       GV->getName()+".init", GV->getThreadLocalMode());
	  bool InitBoolUsed = false;

	  // Loop over all uses of GV, processing them in turn.
	  while (!GV->use_empty()) {
	    if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) {
	      // The global is initialized when the store to it occurs.
	      new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false,
	                    Align(1), SI->getOrdering(), SI->getSyncScopeID(), SI);
	      SI->eraseFromParent();
	      continue;
	    }

	    LoadInst *LI = cast<LoadInst>(GV->user_back());
	    while (!LI->use_empty()) {
	      Use &LoadUse = *LI->use_begin();
	      ICmpInst *ICI = dyn_cast<ICmpInst>(LoadUse.getUser());
	      if (!ICI) {
	        // Not a comparison: the loaded pointer is simply the new global.
	        LoadUse = RepValue;
	        continue;
	      }

	      // Replace the cmp X, 0 with a use of the bool value.
	      // Sink the load to where the compare was, if atomic rules allow us to.
	      Value *LV = new LoadInst(InitBool->getValueType(), InitBool,
	                               InitBool->getName() + ".val", false, Align(1),
	                               LI->getOrdering(), LI->getSyncScopeID(),
	                               LI->isUnordered() ? (Instruction *)ICI : LI);
	      InitBoolUsed = true;
	      switch (ICI->getPredicate()) {
	      default: llvm_unreachable("Unknown ICmp Predicate!");
	      case ICmpInst::ICMP_ULT:
	      case ICmpInst::ICMP_SLT:   // X < null -> always false
	        LV = ConstantInt::getFalse(GV->getContext());
	        break;
	      case ICmpInst::ICMP_ULE:
	      case ICmpInst::ICMP_SLE:
	      case ICmpInst::ICMP_EQ:
	        LV = BinaryOperator::CreateNot(LV, "notinit", ICI);
	        break;
	      case ICmpInst::ICMP_NE:
	      case ICmpInst::ICMP_UGE:
	      case ICmpInst::ICMP_SGE:
	      case ICmpInst::ICMP_UGT:
	      case ICmpInst::ICMP_SGT:
	        break;  // no change.
	      }
	      ICI->replaceAllUsesWith(LV);
	      ICI->eraseFromParent();
	    }
	    LI->eraseFromParent();
	  }

	  // If the initialization boolean was used, insert it, otherwise delete it.
	  if (!InitBoolUsed) {
	    while (!InitBool->use_empty())  // Delete initializations
	      cast<StoreInst>(InitBool->user_back())->eraseFromParent();
	    delete InitBool;
	  } else
	    GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool);

	  // Now the GV is dead, nuke it and the malloc..
	  GV->eraseFromParent();
	  CI->eraseFromParent();

	  // To further other optimizations, loop over all users of NewGV and try to
	  // constant prop them. This will promote GEP instructions with constant
	  // indices into GEP constant-exprs, which will allow global-opt to hack on it.
	  ConstantPropUsersOf(NewGV, DL, TLI);
	  if (RepValue != NewGV)
	    ConstantPropUsersOf(RepValue, DL, TLI);

	  return NewGV;
	}

	/// Scan the use-list of V checking to make sure that there are no complex uses
	/// of V. We permit simple things like dereferencing the pointer, but not
	/// storing through the address, unless it is to the specified global.
	///
	/// Accepted users are loads, compares, stores (unless they store \p V itself
	/// somewhere other than \p GV), and GEPs with at least three operands, PHIs
	/// and bitcasts whose own users are all accepted. Every user of \p V must be
	/// an instruction. \p PHIs records visited phi nodes so that cycles
	/// terminate.
	static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V,
	                                                      const GlobalVariable *GV,
	                                        SmallPtrSetImpl<const PHINode*> &PHIs) {
	  for (const User *U : V->users()) {
	    const Instruction *Inst = cast<Instruction>(U);

	    if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) {
	      continue; // Fine, ignore.
	    }

	    if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
	      if (SI->getOperand(0) == V && SI->getOperand(1) != GV)
	        return false;  // Storing the pointer itself... bad.
	      continue; // Otherwise, storing through it, or storing into GV... fine.
	    }

	    // Must index into the array and into the struct.
	    if (isa<GetElementPtrInst>(Inst) && Inst->getNumOperands() >= 3) {
	      if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Inst, GV, PHIs))
	        return false;
	      continue;
	    }

	    if (const PHINode *PN = dyn_cast<PHINode>(Inst)) {
	      // PHIs are ok if all uses are ok. Don't infinitely recurse through PHI
	      // cycles.
	      if (PHIs.insert(PN).second)
	        if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(PN, GV, PHIs))
	          return false;
	      continue;
	    }

	    if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) {
	      if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV, PHIs))
	        return false;
	      continue;
	    }

	    // Anything else (including a GEP with fewer than three operands).
	    return false;
	  }
	  return true;
	}

	/// The Alloc pointer is stored into GV somewhere. Transform all uses of the
	/// allocation into loads from the global and uses of the resultant pointer.
	/// Further, delete the store into GV. This assumes that these value pass the
	/// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate.
	///
	/// On return \p Alloc has no users left; \p Alloc itself is not erased here.
	static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc,
	                                          GlobalVariable *GV) {
	  while (!Alloc->use_empty()) {
	    Instruction *U = cast<Instruction>(*Alloc->user_begin());
	    // By default the replacement load goes right before the user.
	    Instruction *InsertPt = U;
	    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
	      // If this is the store of the allocation into the global, remove it.
	      if (SI->getOperand(1) == GV) {
	        SI->eraseFromParent();
	        continue;
	      }
	    } else if (PHINode *PN = dyn_cast<PHINode>(U)) {
	      // Insert the load in the corresponding predecessor, not right before the
	      // PHI.
	      InsertPt = PN->getIncomingBlock(*Alloc->use_begin())->getTerminator();
	    } else if (isa<BitCastInst>(U)) {
	      // Must be bitcast between the malloc and store to initialize the global.
	      ReplaceUsesOfMallocWithGlobal(U, GV);
	      U->eraseFromParent();
	      continue;
	    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
	      // If this is a "GEP bitcast" and the user is a store to the global, then
	      // just process it as a bitcast.
	      if (GEPI->hasAllZeroIndices() && GEPI->hasOneUse())
	        if (StoreInst *SI = dyn_cast<StoreInst>(GEPI->user_back()))
	          if (SI->getOperand(1) == GV) {
	            // Must be bitcast GEP between the malloc and store to initialize
	            // the global.
	            ReplaceUsesOfMallocWithGlobal(GEPI, GV);
	            GEPI->eraseFromParent();
	            continue;
	          }
	    }

	    // Insert a load from the global, and use it instead of the malloc.
	    Value *NL =
	        new LoadInst(GV->getValueType(), GV, GV->getName() + ".val", InsertPt);
	    U->replaceUsesOfWith(Alloc, NL);
	  }
	}

	/// Check that every user of \p V (a load, or a phi of a load) is simple enough
	/// for heap SRA: an icmp whose second operand is the null pointer constant, a
	/// GEP with at least three operands, or a PHI whose own users qualify.
	///
	/// \param LoadUsingPHIs        All phi nodes accepted so far, across loads.
	/// \param LoadUsingPHIsPerLoad Phi nodes reached for the current load only;
	///                             reaching one a second time is a failure.
	static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V,
	                        SmallPtrSetImpl<const PHINode*> &LoadUsingPHIs,
	                        SmallPtrSetImpl<const PHINode*> &LoadUsingPHIsPerLoad) {
	  for (const User *U : V->users()) {
	    const Instruction *UserInst = cast<Instruction>(U);

	    // Comparison against null is ok; against anything else it is not.
	    if (const auto *Cmp = dyn_cast<ICmpInst>(UserInst)) {
	      if (isa<ConstantPointerNull>(Cmp->getOperand(1)))
	        continue;
	      return false;
	    }

	    // getelementptr is ok as long as it indexes into the array and into the
	    // struct.
	    if (const auto *GEP = dyn_cast<GetElementPtrInst>(UserInst)) {
	      if (GEP->getNumOperands() >= 3)
	        continue;
	      return false;
	    }

	    // Whatever is left must be a phi node.
	    const auto *Phi = dyn_cast<PHINode>(UserInst);
	    if (!Phi)
	      return false;

	    // Seeing the same phi twice for one load means some phi nodes depend on
	    // each other. Avoid infinite looping!
	    if (!LoadUsingPHIsPerLoad.insert(Phi).second)
	      return false;

	    // A phi already analyzed for an earlier load is known to be safe.
	    if (!LoadUsingPHIs.insert(Phi).second)
	      continue;

	    // Make sure all uses of the PHI are simple enough to transform.
	    if (!LoadUsesSimpleEnoughForHeapSRA(Phi, LoadUsingPHIs,
	                                        LoadUsingPHIsPerLoad))
	      return false;
	  }

	  return true;
	}

	/// Return true if all users of values loaded from \p GV are simple enough to
	/// perform HeapSRA, and every phi node reached from those loads only merges
	/// \p StoredVal, other such phi nodes, or loads from \p GV.
	static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV,
	                                                    Instruction *StoredVal) {
	  SmallPtrSet<const PHINode*, 32> LoadUsingPHIs;
	  SmallPtrSet<const PHINode*, 32> LoadUsingPHIsPerLoad;

	  // Phase 1: check the users of every load of GV; non-load users are skipped.
	  for (const User *U : GV->users()) {
	    const auto *Load = dyn_cast<LoadInst>(U);
	    if (!Load)
	      continue;
	    if (!LoadUsesSimpleEnoughForHeapSRA(Load, LoadUsingPHIs,
	                                        LoadUsingPHIsPerLoad))
	      return false;
	    // The per-load set starts empty again for the next load.
	    LoadUsingPHIsPerLoad.clear();
	  }

	  // Phase 2: all uses of the loads and transitive uses (through PHI nodes) are
	  // simple enough to transform, but we don't yet know that all inputs to the
	  // PHI nodes are in the same equivalence sets. Verify that every operand of
	  // each PHI is a PHI that can be transformed, a load from GV, or StoredVal.
	  for (const PHINode *PN : LoadUsingPHIs) {
	    for (unsigned Idx = 0, NumIn = PN->getNumIncomingValues(); Idx != NumIn;
	         ++Idx) {
	      Value *Incoming = PN->getIncomingValue(Idx);

	      // PHI of the stored value itself is ok.
	      if (Incoming == StoredVal)
	        continue;

	      // One of the PHIs in our set is (optimistically) ok; any other is not.
	      if (const auto *InPhi = dyn_cast<PHINode>(Incoming)) {
	        if (!LoadUsingPHIs.count(InPhi))
	          return false;
	        continue;
	      }

	      // Load from GV is ok. Anything else (UNDEF? NULL?) is rejected.
	      const auto *InLoad = dyn_cast<LoadInst>(Incoming);
	      if (!InLoad || InLoad->getOperand(0) != GV)
	        return false;
	    }
	  }

	  return true;
	}

	/// Return the scalarized stand-in for field \p FieldNo of \p V, creating and
	/// caching it on first request.
	///
	/// \param V        A load, or otherwise a phi node of pointer-to-struct type.
	///                 Any other value must already have a cached entry.
	/// \param FieldNo  Index of the struct field wanted.
	/// \param InsertedScalarizedValues Cache from original value to its per-field
	///                 replacements; grown as needed.
	/// \param PHIsToRewrite Receives (phi, field) for every new phi created here.
	///                 The new phi is created without incoming values.
	static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
	              DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
	              std::vector<std::pair<PHINode *, unsigned>> &PHIsToRewrite) {
	  {
	    std::vector<Value *> &FieldVals = InsertedScalarizedValues[V];

	    if (FieldNo >= FieldVals.size())
	      FieldVals.resize(FieldNo+1);

	    // If we already have this value, just reuse the previously scalarized
	    // version.
	    if (Value *FieldVal = FieldVals[FieldNo])
	      return FieldVal;
	  }

	  // Depending on what instruction this is, we have several cases.
	  Value *Result;
	  if (LoadInst *LI = dyn_cast<LoadInst>(V)) {
	    // This is a scalarized version of the load from the global. Just create
	    // a new Load of the scalarized global.
	    Value *FieldPtr = GetHeapSROAValue(LI->getOperand(0), FieldNo,
	                                       InsertedScalarizedValues, PHIsToRewrite);
	    Result = new LoadInst(FieldPtr->getType()->getPointerElementType(),
	                          FieldPtr, LI->getName() + ".f" + Twine(FieldNo), LI);
	  } else {
	    PHINode *PN = cast<PHINode>(V);
	    // PN's type is pointer to struct. Make a new PHI of pointer to struct
	    // field.

	    PointerType *PTy = cast<PointerType>(PN->getType());
	    StructType *ST = cast<StructType>(PTy->getElementType());

	    unsigned AS = PTy->getAddressSpace();
	    PHINode *NewPN =
	      PHINode::Create(PointerType::get(ST->getElementType(FieldNo), AS),
	                     PN->getNumIncomingValues(),
	                     PN->getName()+".f"+Twine(FieldNo), PN);
	    Result = NewPN;
	    PHIsToRewrite.push_back(std::make_pair(PN, FieldNo));
	  }

	  // Look the slot up again rather than holding a reference across the
	  // recursive call above: inserting into the DenseMap may move its buckets.
	  return InsertedScalarizedValues[V][FieldNo] = Result;
	}

	/// Given a load instruction and a value derived from the load, rewrite the
	/// derived value to use the HeapSRoA'd load.
	static void RewriteHeapSROALoadUser(Instruction *LoadUser,
	DenseMap<Value , std::vector<Value >> &InsertedScalarizedValues,
	std::vector<std::pair<PHINode *, unsigned>> &PHIsToRewrite) {
	// If this is a comparison against null, handle it.
	if (ICmpInst *SCI = dyn_cast<ICmpInst>(LoadUser)) {
	assert(isa<ConstantPointerNull>(SCI->getOperand(1)));
	// If we have a setcc of the loaded pointer, we can use a setcc of any
	// field.
	Value *NPtr = GetHeapSROAValue(SCI->getOperand(0), 0,
	InsertedScalarizedValues, PHIsToRewrite);

	Value *New = new ICmpInst(SCI, SCI->getPredicate(), NPtr,
	Constant::getNullValue(NPtr->getType()),
	SCI->getName());
	SCI->replaceAllUsesWith(New);
	SCI->eraseFromParent();
	return;
	}

	// Handle 'getelementptr Ptr, Idx, i32 FieldNo ...'
	if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(LoadUser)) {
	assert(GEPI->getNumOperands() >= 3 && isa<ConstantInt>(GEPI->getOperand(2))
	&& "Unexpected GEPI!");

	// Load the pointer for this field.
	unsigned FieldNo = cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue();
	Value *NewPtr = GetHeapSROAValue(GEPI->getOperand(0), FieldNo,
	InsertedScalarizedValues, PHIsToRewrite);

	// Create the new GEP idx vector.
	SmallVector<Value*, 8> GEPIdx;
	GEPIdx.push_back(GEPI->getOperand(1));
	GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end());

	Value *NGEPI = GetElementPtrInst::Create(GEPI->getResultElementType(), NewPtr, GEPIdx,
	GEPI->getName(), GEPI);
	GEPI->replaceAllUsesWith(NGEPI);
	GEPI->eraseFromParent();
	return;
	}

	// Recursively transform the users of PHI nodes. This will lazily create the
	// PHIs that are needed for individual elements. Keep track of what PHIs we
	// see in InsertedScalarizedValues so that we don't get infinite loops (very
	// antisocial). If the PHI is already in InsertedScalarizedValues, it has
	// already been seen first by another load, so its uses have already been
	// processed.
	PHINode *PN = cast<PHINode>(LoadUser);
	if (!InsertedScalarizedValues.insert(std::make_pair(PN,
	std::vector<Value *>())).second)
	return;

	// If this is the first time we've seen this PHI, recursively process all
	// users.
	for (auto UI = PN->user_begin(), E = PN->user_end(); UI != E;) {
	Instruction User = cast<Instruction>(UI++);
	RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
	}
	}

	/// We are performing Heap SRoA on a global. Ptr is a value loaded from the
	/// global. Eliminate all uses of Ptr, making them use FieldGlobals instead.
	/// All uses of loaded values satisfy AllGlobalLoadUsesSimpleEnoughForHeapSRA.
	static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
	DenseMap<Value , std::vector<Value >> &InsertedScalarizedValues,
	std::vector<std::pair<PHINode *, unsigned> > &PHIsToRewrite) {
	for (auto UI = Load->user_begin(), E = Load->user_end(); UI != E;) {
	Instruction User = cast<Instruction>(UI++);
	RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
	}

	if (Load->use_empty()) {
	Load->eraseFromParent();
	InsertedScalarizedValues.erase(Load);
	}
	}

	/// CI is an allocation of an array of structures. Break it up into multiple
	/// allocations of arrays of the fields.
	///
	/// For every field of the allocated struct type this creates one internal
	/// global (named "<GV>.f<N>") and one malloc of \p NElems elements of that
	/// field type, stored into the new global. Code is then emitted that frees
	/// and nulls every field allocation if any of them failed. Finally all loads
	/// of and null-stores to \p GV are rewritten in terms of the per-field
	/// globals, and both \p CI and \p GV are erased.
	///
	/// \returns the global created for field 0.
	static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
	                                            Value *NElems, const DataLayout &DL,
	                                            const TargetLibraryInfo *TLI) {
	  LLVM_DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI
	                    << '\n');
	  Type *MAT = getMallocAllocatedType(CI, TLI);
	  StructType *STy = cast<StructType>(MAT);

	  // There is guaranteed to be at least one use of the malloc (storing
	  // it into GV). If there are other uses, change them to be uses of
	  // the global to simplify later code. This also deletes the store
	  // into GV.
	  ReplaceUsesOfMallocWithGlobal(CI, GV);

	  // Okay, at this point, there are no users of the malloc. Insert N
	  // new mallocs at the same place as CI, and N globals.
	  std::vector<Value *> FieldGlobals;
	  std::vector<Value *> FieldMallocs;

	  SmallVector<OperandBundleDef, 1> OpBundles;
	  CI->getOperandBundlesAsDefs(OpBundles);

	  unsigned AS = GV->getType()->getPointerAddressSpace();
	  for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;
	       ++FieldNo) {
	    Type *FieldTy = STy->getElementType(FieldNo);
	    PointerType *PFieldTy = PointerType::get(FieldTy, AS);

	    GlobalVariable *NGV = new GlobalVariable(
	        *GV->getParent(), PFieldTy, false, GlobalValue::InternalLinkage,
	        Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo),
	        nullptr, GV->getThreadLocalMode());
	    NGV->copyAttributesFrom(GV);
	    FieldGlobals.push_back(NGV);

	    unsigned TypeSize = DL.getTypeAllocSize(FieldTy);
	    if (StructType *ST = dyn_cast<StructType>(FieldTy))
	      TypeSize = DL.getStructLayout(ST)->getSizeInBytes();
	    Type *IntPtrTy = DL.getIntPtrType(CI->getType());
	    Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
	                                        ConstantInt::get(IntPtrTy, TypeSize),
	                                        NElems, OpBundles, nullptr,
	                                        CI->getName() + ".f" + Twine(FieldNo));
	    FieldMallocs.push_back(NMI);
	    new StoreInst(NMI, NGV, CI);
	  }

	  // The tricky aspect of this transformation is handling the case when malloc
	  // fails. In the original code, malloc failing would set the result pointer
	  // of malloc to null. In this case, some mallocs could succeed and others
	  // could fail. As such, we emit code that looks like this:
	  //    F0 = malloc(field0)
	  //    F1 = malloc(field1)
	  //    F2 = malloc(field2)
	  //    if (F0 == 0 || F1 == 0 || F2 == 0) {
	  //      if (F0) { free(F0); F0 = 0; }
	  //      if (F1) { free(F1); F1 = 0; }
	  //      if (F2) { free(F2); F2 = 0; }
	  //    }
	  // The malloc can also fail if its argument is too large.
	  Constant *ConstantZero = ConstantInt::get(CI->getArgOperand(0)->getType(), 0);
	  Value *RunningOr = new ICmpInst(CI, ICmpInst::ICMP_SLT, CI->getArgOperand(0),
	                                  ConstantZero, "isneg");
	  for (unsigned i = 0, e = FieldMallocs.size(); i != e; ++i) {
	    Value *Cond = new ICmpInst(CI, ICmpInst::ICMP_EQ, FieldMallocs[i],
	                               Constant::getNullValue(FieldMallocs[i]->getType()),
	                               "isnull");
	    RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", CI);
	  }

	  // Split the basic block at the old malloc.
	  BasicBlock *OrigBB = CI->getParent();
	  BasicBlock *ContBB =
	      OrigBB->splitBasicBlock(CI->getIterator(), "malloc_cont");

	  // Create the block to check the first condition. Put all these blocks at the
	  // end of the function as they are unlikely to be executed.
	  BasicBlock *NullPtrBlock = BasicBlock::Create(OrigBB->getContext(),
	                                                "malloc_ret_null",
	                                                OrigBB->getParent());

	  // Remove the uncond branch from OrigBB to ContBB, turning it into a cond
	  // branch on RunningOr.
	  OrigBB->getTerminator()->eraseFromParent();
	  BranchInst::Create(NullPtrBlock, ContBB, RunningOr, OrigBB);

	  // Within the NullPtrBlock, we need to emit a comparison and branch for each
	  // pointer, because some may be null while others are not.
	  for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
	    Value *GVVal =
	        new LoadInst(cast<GlobalVariable>(FieldGlobals[i])->getValueType(),
	                     FieldGlobals[i], "tmp", NullPtrBlock);
	    Value *Cmp = new ICmpInst(*NullPtrBlock, ICmpInst::ICMP_NE, GVVal,
	                              Constant::getNullValue(GVVal->getType()));
	    BasicBlock *FreeBlock = BasicBlock::Create(Cmp->getContext(), "free_it",
	                                               OrigBB->getParent());
	    BasicBlock *NextBlock = BasicBlock::Create(Cmp->getContext(), "next",
	                                               OrigBB->getParent());
	    Instruction *BI = BranchInst::Create(FreeBlock, NextBlock,
	                                         Cmp, NullPtrBlock);

	    // Fill in FreeBlock.
	    CallInst::CreateFree(GVVal, OpBundles, BI);
	    new StoreInst(Constant::getNullValue(GVVal->getType()), FieldGlobals[i],
	                  FreeBlock);
	    BranchInst::Create(NextBlock, FreeBlock);

	    // The next field's check is chained onto this field's "next" block.
	    NullPtrBlock = NextBlock;
	  }

	  BranchInst::Create(ContBB, NullPtrBlock);

	  // CI is no longer needed, remove it.
	  CI->eraseFromParent();

	  /// As we process loads, if we can't immediately update all uses of the load,
	  /// keep track of what scalarized loads are inserted for a given load.
	  DenseMap<Value *, std::vector<Value *>> InsertedScalarizedValues;
	  InsertedScalarizedValues[GV] = FieldGlobals;

	  std::vector<std::pair<PHINode *, unsigned>> PHIsToRewrite;

	  // Okay, the malloc site is completely handled. All of the uses of GV are now
	  // loads, and all uses of those loads are simple. Rewrite them to use loads
	  // of the per-field globals instead.
	  for (auto UI = GV->user_begin(), E = GV->user_end(); UI != E;) {
	    Instruction *User = cast<Instruction>(*UI++);

	    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
	      RewriteUsesOfLoadForHeapSRoA(LI, InsertedScalarizedValues, PHIsToRewrite);
	      continue;
	    }

	    // Must be a store of null.
	    StoreInst *SI = cast<StoreInst>(User);
	    assert(isa<ConstantPointerNull>(SI->getOperand(0)) &&
	           "Unexpected heap-sra user!");

	    // Insert a store of null into each global.
	    for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
	      Type *ValTy = cast<GlobalValue>(FieldGlobals[i])->getValueType();
	      Constant *Null = Constant::getNullValue(ValTy);
	      new StoreInst(Null, FieldGlobals[i], SI);
	    }
	    // Erase the original store.
	    SI->eraseFromParent();
	  }

	  // While we have PHIs that are interesting to rewrite, do it.
	  while (!PHIsToRewrite.empty()) {
	    PHINode *PN = PHIsToRewrite.back().first;
	    unsigned FieldNo = PHIsToRewrite.back().second;
	    PHIsToRewrite.pop_back();
	    PHINode *FieldPN = cast<PHINode>(InsertedScalarizedValues[PN][FieldNo]);
	    assert(FieldPN->getNumIncomingValues() == 0 &&"Already processed this phi");

	    // Add all the incoming values. This can materialize more phis.
	    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
	      Value *InVal = PN->getIncomingValue(i);
	      InVal = GetHeapSROAValue(InVal, FieldNo, InsertedScalarizedValues,
	                               PHIsToRewrite);
	      FieldPN->addIncoming(InVal, PN->getIncomingBlock(i));
	    }
	  }

	  // Drop all inter-phi links and any loads that made it this far.
	  for (auto &Entry : InsertedScalarizedValues) {
	    if (PHINode *PN = dyn_cast<PHINode>(Entry.first))
	      PN->dropAllReferences();
	    else if (LoadInst *LI = dyn_cast<LoadInst>(Entry.first))
	      LI->dropAllReferences();
	  }

	  // Delete all the phis and loads now that inter-references are dead.
	  for (auto &Entry : InsertedScalarizedValues) {
	    if (PHINode *PN = dyn_cast<PHINode>(Entry.first))
	      PN->eraseFromParent();
	    else if (LoadInst *LI = dyn_cast<LoadInst>(Entry.first))
	      LI->eraseFromParent();
	  }

	  // The old global is now dead, remove it.
	  GV->eraseFromParent();

	  ++NumHeapSRA;
	  return cast<GlobalVariable>(FieldGlobals[0]);
	}

	/// This function is called when we see a pointer global variable with a single
	/// value stored it that is a malloc or cast of malloc.
	static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable GV, CallInst CI,
	Type *AllocTy,
	AtomicOrdering Ordering,
	const DataLayout &DL,
	TargetLibraryInfo *TLI) {
	// If this is a malloc of an abstract type, don't touch it.
	if (!AllocTy->isSized())
	return false;

	// We can't optimize this global unless all uses of it are known to be
	// of the malloc value, not of the null initializer value (consider a use
	// that compares the global's value against zero to see if the malloc has
	// been reached). To do this, we check to see if all uses of the global
	// would trap if the global were null: this proves that they must all
	// happen after the malloc.
	if (!AllUsesOfLoadedValueWillTrapIfNull(GV))
	return false;

	// We can't optimize this if the malloc itself is used in a complex way,
	// for example, being stored into multiple globals. This allows the
	// malloc to be stored into the specified global, loaded icmp'd, and
	// GEP'd. These are all things we could transform to using the global
	// for.
	SmallPtrSet<const PHINode*, 8> PHIs;
	if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV, PHIs))
	return false;

	// If we have a global that is only initialized with a fixed size malloc,
	// transform the program to use global memory instead of malloc'd memory.
	// This eliminates dynamic allocation, avoids an indirection accessing the
	// data, and exposes the resultant global to further GlobalOpt.
	// We cannot optimize the malloc if we cannot determine malloc array size.
	Value *NElems = getMallocArraySize(CI, DL, TLI, true);
	if (!NElems)
	return false;

	if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems))
	// Restrict this transformation to only working on small allocations
	// (2048 bytes currently), as we don't want to introduce a 16M global or
	// something.
	if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) {
	OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI);
	return true;
	}

	// If the allocation is an array of structures, consider transforming this
	// into multiple malloc'd arrays, one for each field. This is basically
	// SRoA for malloc'd memory.

	if (Ordering != AtomicOrdering::NotAtomic)
	return false;

	// If this is an allocation of a fixed size array of structs, analyze as a
	// variable size array. malloc [100 x struct],1 -> malloc struct, 100
	if (NElems == ConstantInt::get(CI->getArgOperand(0)->getType(), 1))
	if (ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
	AllocTy = AT->getElementType();

	StructType *AllocSTy = dyn_cast<StructType>(AllocTy);
	if (!AllocSTy)
	return false;

	// This the structure has an unreasonable number of fields, leave it
	// alone.
	if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 &&
	AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, CI)) {

	// If this is a fixed size array, transform the Malloc to be an alloc of
	// structs. malloc [100 x struct],1 -> malloc struct, 100
	if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) {
	Type *IntPtrTy = DL.getIntPtrType(CI->getType());
	unsigned TypeSize = DL.getStructLayout(AllocSTy)->getSizeInBytes();
	Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
	Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
	SmallVector<OperandBundleDef, 1> OpBundles;
	CI->getOperandBundlesAsDefs(OpBundles);
	Instruction *Malloc =
	CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, AllocSize, NumElements,
	OpBundles, nullptr, CI->getName());
	Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI);
	CI->replaceAllUsesWith(Cast);
	CI->eraseFromParent();
	if (BitCastInst *BCI = dyn_cast<BitCastInst>(Malloc))
	CI = cast<CallInst>(BCI->getOperand(0));
	else
	CI = cast<CallInst>(Malloc);
	}

	PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), DL,
	TLI);
	return true;
	}

	return false;
	}

	// Try to optimize globals based on the knowledge that only one value (besides
	// its initializer) is ever stored to the global.
	static bool
	optimizeOnceStoredGlobal(GlobalVariable GV, Value StoredOnceVal,
	AtomicOrdering Ordering, const DataLayout &DL,
	function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
	// Ignore no-op GEPs and bitcasts.
	StoredOnceVal = StoredOnceVal->stripPointerCasts();

	// If we are dealing with a pointer global that is initialized to null and
	// only has one (non-null) value stored into it, then we can optimize any
	// users of the loaded value (often calls and loads) that would trap if the
	// value was null.
	if (GV->getInitializer()->getType()->isPointerTy() &&
	GV->getInitializer()->isNullValue() &&
	!NullPointerIsDefined(
	nullptr /* F */,
	GV->getInitializer()->getType()->getPointerAddressSpace())) {
	if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) {
	if (GV->getInitializer()->getType() != SOVC->getType())
	SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());

	// Optimize away any trapping uses of the loaded value.
	if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, GetTLI))
	return true;
	} else if (CallInst *CI = extractMallocCall(StoredOnceVal, GetTLI)) {
	auto TLI = &GetTLI(CI->getFunction());
	Type *MallocType = getMallocAllocatedType(CI, TLI);
	if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType,
	Ordering, DL, TLI))
	return true;
	}
	}

	return false;
	}

	/// At this point, we have learned that the only two values ever stored into GV
	/// are its initializer and OtherVal. See if we can shrink the global into a
	/// boolean and select between the two values whenever it is used. This exposes
	/// the values to other scalar optimizations.
	static bool TryToShrinkGlobalToBoolean(GlobalVariable GV, Constant OtherVal) {
	Type *GVElType = GV->getValueType();

	// If GVElType is already i1, it is already shrunk. If the type of the GV is
	// an FP value, pointer or vector, don't do this optimization because a select
	// between them is very expensive and unlikely to lead to later
	// simplification. In these cases, we typically end up with "cond ? v1 : v2"
	// where v1 and v2 both require constant pool loads, a big loss.
	if (GVElType == Type::getInt1Ty(GV->getContext()) \|\|
	GVElType->isFloatingPointTy() \|\|
	GVElType->isPointerTy() \|\| GVElType->isVectorTy())
	return false;

	// Walk the use list of the global seeing if all the uses are load or store.
	// If there is anything else, bail out.
	for (User *U : GV->users())
	if (!isa<LoadInst>(U) && !isa<StoreInst>(U))
	return false;

	LLVM_DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n");

	// Create the new global, initializing it to false.
	GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()),
	false,
	GlobalValue::InternalLinkage,
	ConstantInt::getFalse(GV->getContext()),
	GV->getName()+".b",
	GV->getThreadLocalMode(),
	GV->getType()->getAddressSpace());
	NewGV->copyAttributesFrom(GV);
	GV->getParent()->getGlobalList().insert(GV->getIterator(), NewGV);

	Constant *InitVal = GV->getInitializer();
	assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) &&
	"No reason to shrink to bool!");

	SmallVector<DIGlobalVariableExpression *, 1> GVs;
	GV->getDebugInfo(GVs);

	// If initialized to zero and storing one into the global, we can use a cast
	// instead of a select to synthesize the desired value.
	bool IsOneZero = false;
	bool EmitOneOrZero = true;
	auto *CI = dyn_cast<ConstantInt>(OtherVal);
	if (CI && CI->getValue().getActiveBits() <= 64) {
	IsOneZero = InitVal->isNullValue() && CI->isOne();

	auto *CIInit = dyn_cast<ConstantInt>(GV->getInitializer());
	if (CIInit && CIInit->getValue().getActiveBits() <= 64) {
	uint64_t ValInit = CIInit->getZExtValue();
	uint64_t ValOther = CI->getZExtValue();
	uint64_t ValMinus = ValOther - ValInit;

	for(auto *GVe : GVs){
	DIGlobalVariable *DGV = GVe->getVariable();
	DIExpression *E = GVe->getExpression();
	const DataLayout &DL = GV->getParent()->getDataLayout();
	unsigned SizeInOctets =
	DL.getTypeAllocSizeInBits(NewGV->getType()->getElementType()) / 8;

	// It is expected that the address of global optimized variable is on
	// top of the stack. After optimization, value of that variable will
	// be ether 0 for initial value or 1 for other value. The following
	// expression should return constant integer value depending on the
	// value at global object address:
	// val * (ValOther - ValInit) + ValInit:
	// DW_OP_deref DW_OP_constu <ValMinus>
	// DW_OP_mul DW_OP_constu <ValInit> DW_OP_plus DW_OP_stack_value
	SmallVector<uint64_t, 12> Ops = {
	dwarf::DW_OP_deref_size, SizeInOctets,
	dwarf::DW_OP_constu, ValMinus,
	dwarf::DW_OP_mul, dwarf::DW_OP_constu, ValInit,
	dwarf::DW_OP_plus};
	bool WithStackValue = true;
	E = DIExpression::prependOpcodes(E, Ops, WithStackValue);
	DIGlobalVariableExpression *DGVE =
	DIGlobalVariableExpression::get(NewGV->getContext(), DGV, E);
	NewGV->addDebugInfo(DGVE);
	}
	EmitOneOrZero = false;
	}
	}

	if (EmitOneOrZero) {
	// FIXME: This will only emit address for debugger on which will
	// be written only 0 or 1.
	for(auto *GV : GVs)
	NewGV->addDebugInfo(GV);
	}

	while (!GV->use_empty()) {
	Instruction *UI = cast<Instruction>(GV->user_back());
	if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
	// Change the store into a boolean store.
	bool StoringOther = SI->getOperand(0) == OtherVal;
	// Only do this if we weren't storing a loaded value.
	Value *StoreVal;
	if (StoringOther \|\| SI->getOperand(0) == InitVal) {
	StoreVal = ConstantInt::get(Type::getInt1Ty(GV->getContext()),
	StoringOther);
	} else {
	// Otherwise, we are storing a previously loaded copy. To do this,
	// change the copy from copying the original value to just copying the
	// bool.
	Instruction *StoredVal = cast<Instruction>(SI->getOperand(0));

	// If we've already replaced the input, StoredVal will be a cast or
	// select instruction. If not, it will be a load of the original
	// global.
	if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
	assert(LI->getOperand(0) == GV && "Not a copy!");
	// Insert a new load, to preserve the saved value.
	StoreVal = new LoadInst(NewGV->getValueType(), NewGV,
	LI->getName() + ".b", false, Align(1),
	LI->getOrdering(), LI->getSyncScopeID(), LI);
	} else {
	assert((isa<CastInst>(StoredVal) \|\| isa<SelectInst>(StoredVal)) &&
	"This is not a form that we understand!");
	StoreVal = StoredVal->getOperand(0);
	assert(isa<LoadInst>(StoreVal) && "Not a load of NewGV!");
	}
	}
	StoreInst *NSI =
	new StoreInst(StoreVal, NewGV, false, Align(1), SI->getOrdering(),
	SI->getSyncScopeID(), SI);
	NSI->setDebugLoc(SI->getDebugLoc());
	} else {
	// Change the load into a load of bool then a select.
	LoadInst *LI = cast<LoadInst>(UI);
	LoadInst *NLI = new LoadInst(NewGV->getValueType(), NewGV,
	LI->getName() + ".b", false, Align(1),
	LI->getOrdering(), LI->getSyncScopeID(), LI);
	Instruction *NSI;
	if (IsOneZero)
	NSI = new ZExtInst(NLI, LI->getType(), "", LI);
	else
	NSI = SelectInst::Create(NLI, OtherVal, InitVal, "", LI);
	NSI->takeName(LI);
	// Since LI is split into two instructions, NLI and NSI both inherit the
	// same DebugLoc
	NLI->setDebugLoc(LI->getDebugLoc());
	NSI->setDebugLoc(LI->getDebugLoc());
	LI->replaceAllUsesWith(NSI);
	}
	UI->eraseFromParent();
	}

	// Retain the name of the old global variable. People who are debugging their
	// programs may expect these variables to be named the same.
	NewGV->takeName(GV);
	GV->eraseFromParent();
	return true;
	}

	/// Erase \p GV from its module if it is dead, returning true if it was erased.
	///
	/// Dead constant users are stripped first. The global is kept if it is a
	/// definition that is not discardable-if-unused, or if it is non-local and
	/// belongs to a comdat listed in \p NotDiscardableComdats. A function counts
	/// as dead when it is an unused declaration or its definition is trivially
	/// dead; any other global counts as dead when it has no uses.
	static bool deleteIfDead(
	    GlobalValue &GV, SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
	  GV.removeDeadConstantUsers();

	  if (!GV.isDiscardableIfUnused() && !GV.isDeclaration())
	    return false;

	  if (const Comdat *C = GV.getComdat())
	    if (!GV.hasLocalLinkage() && NotDiscardableComdats.count(C))
	      return false;

	  bool Dead;
	  if (auto *F = dyn_cast<Function>(&GV))
	    Dead = (F->isDeclaration() && F->use_empty()) || F->isDefTriviallyDead();
	  else
	    Dead = GV.use_empty();
	  if (!Dead)
	    return false;

	  LLVM_DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n");
	  GV.eraseFromParent();
	  ++NumDeleted;
	  return true;
	}

	static bool isPointerValueDeadOnEntryToFunction(
	const Function F, GlobalValue GV,
	function_ref<DominatorTree &(Function &)> LookupDomTree) {
	// Find all uses of GV. We expect them all to be in F, and if we can't
	// identify any of the uses we bail out.
	//
	// On each of these uses, identify if the memory that GV points to is
	// used/required/live at the start of the function. If it is not, for example
	// if the first thing the function does is store to the GV, the GV can
	// possibly be demoted.
	//
	// We don't do an exhaustive search for memory operations - simply look
	// through bitcasts as they're quite common and benign.
	const DataLayout &DL = GV->getParent()->getDataLayout();
	SmallVector<LoadInst *, 4> Loads;
	SmallVector<StoreInst *, 4> Stores;
	for (auto *U : GV->users()) {
	if (Operator::getOpcode(U) == Instruction::BitCast) {
	for (auto *UU : U->users()) {
	if (auto *LI = dyn_cast<LoadInst>(UU))
	Loads.push_back(LI);
	else if (auto *SI = dyn_cast<StoreInst>(UU))
	Stores.push_back(SI);
	else
	return false;
	}
	continue;
	}

	Instruction *I = dyn_cast<Instruction>(U);
	if (!I)
	return false;
	assert(I->getParent()->getParent() == F);

	if (auto *LI = dyn_cast<LoadInst>(I))
	Loads.push_back(LI);
	else if (auto *SI = dyn_cast<StoreInst>(I))
	Stores.push_back(SI);
	else
	return false;
	}

	// We have identified all uses of GV into loads and stores. Now check if all
	// of them are known not to depend on the value of the global at the function
	// entry point. We do this by ensuring that every load is dominated by at
	// least one store.
	auto &DT = LookupDomTree(const_cast<Function >(F));

	// The below check is quadratic. Check we're not going to do too many tests.
	// FIXME: Even though this will always have worst-case quadratic time, we
	// could put effort into minimizing the average time by putting stores that
	// have been shown to dominate at least one load at the beginning of the
	// Stores array, making subsequent dominance checks more likely to succeed
	// early.
	//
	// The threshold here is fairly large because global->local demotion is a
	// very powerful optimization should it fire.
	const unsigned Threshold = 100;
	if (Loads.size() * Stores.size() > Threshold)
	return false;

	for (auto *L : Loads) {
	auto *LTy = L->getType();
	if (none_of(Stores, [&](const StoreInst *S) {
	auto *STy = S->getValueOperand()->getType();
	// The load is only dominated by the store if DomTree says so
	// and the number of bits loaded in L is less than or equal to
	// the number of bits stored in S.
	return DT.dominates(S, L) &&
	DL.getTypeStoreSize(LTy) <= DL.getTypeStoreSize(STy);
	}))
	return false;
	}
	// All loads have known dependences inside F, so the global can be localized.
	return true;
	}

	/// Decide whether every non-instruction user of \p C could be rewritten as an
	/// instruction.
	///
	/// This is deliberately shallow: only a single level of ConstantExpr (the
	/// common constant GEP / constant bitcast case) is looked through. Whatever
	/// this accepts must also be handled by makeAllConstantUsesInstructions.
	static bool allNonInstructionUsersCanBeMadeInstructions(Constant *C) {
	  for (User *DirectUser : C->users()) {
	    // Instruction users are already in the desired form.
	    if (isa<Instruction>(DirectUser))
	      continue;

	    // Anything that is neither an instruction nor a constant expression
	    // cannot be converted.
	    auto *CE = dyn_cast<ConstantExpr>(DirectUser);
	    if (!CE)
	      return false;

	    // A constant expression that feeds another constant is rejected rather
	    // than recursed into.
	    for (User *IndirectUser : CE->users()) {
	      if (!isa<Instruction>(IndirectUser))
	        return false;
	    }
	  }

	  return true;
	}

	/// Rewrite each ConstantExpr user of \p C as an instruction, materialized
	/// immediately before each instruction that uses it.
	///
	/// Precondition: allNonInstructionUsersCanBeMadeInstructions(C) returned
	/// true, so every user of \p C is either an instruction or a ConstantExpr
	/// whose own users are all instructions.
	static void makeAllConstantUsesInstructions(Constant *C) {
	  // Snapshot the constant-expression users first; the rewriting below
	  // modifies use lists.
	  SmallVector<ConstantExpr*,4> ExprUsers;
	  for (auto *DirectUser : C->users()) {
	    if (auto *CE = dyn_cast<ConstantExpr>(DirectUser)) {
	      ExprUsers.push_back(CE);
	      continue;
	    }
	    // We should never see anything but an instruction here;
	    // allNonInstructionUsersCanBeMadeInstructions should not have returned
	    // true for C otherwise.
	    assert(
	        isa<Instruction>(DirectUser) &&
	        "Can't transform non-constantexpr non-instruction to instruction!");
	  }

	  for (ConstantExpr *CE : ExprUsers) {
	    // Copy the user list before touching it, since replacing uses changes it.
	    SmallVector<Value*,4> ExprUses(CE->user_begin(), CE->user_end());
	    for (Value *Use : ExprUses) {
	      Instruction *UserInst = cast<Instruction>(Use);
	      Instruction *Materialized = CE->getAsInstruction();
	      Materialized->insertBefore(UserInst);
	      UserInst->replaceUsesOfWith(CE, Materialized);
	    }
	    // Every use has been replaced, so destroy the constant. (destroyConstant
	    // will update value handles and metadata.)
	    CE->destroyConstant();
	  }
	}

	/// Analyze the specified global variable and optimize
	/// it if possible. If we make a change, return true.
	///
	/// The following are tried in order, using the facts recorded in \p GS:
	///  1. localize the global into an alloca in its single accessing function;
	///  2. delete stores to (and possibly the whole of) a never-loaded global;
	///  3. mark a global that only ever holds its initializer as constant;
	///  4. split a non-single-value global with SRAGlobal;
	///  5. for a global stored once: substitute an undef initializer, run
	///     optimizeOnceStoredGlobal, or shrink the global to a boolean.
	static bool
	processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
	                      function_ref<TargetLibraryInfo &(Function &)> GetTLI,
	                      function_ref<DominatorTree &(Function &)> LookupDomTree) {
	  // Fetched once and shared by every transformation below.
	  auto &DL = GV->getParent()->getDataLayout();
	  // If this is a first class global and has only one accessing function and
	  // this function is non-recursive, we replace the global with a local alloca
	  // in this function.
	  //
	  // NOTE: It doesn't make sense to promote non-single-value types since we
	  // are just replacing static memory to stack memory.
	  //
	  // If the global is in different address space, don't bring it to stack.
	  if (!GS.HasMultipleAccessingFunctions &&
	      GS.AccessingFunction &&
	      GV->getValueType()->isSingleValueType() &&
	      GV->getType()->getAddressSpace() == 0 &&
	      !GV->isExternallyInitialized() &&
	      allNonInstructionUsersCanBeMadeInstructions(GV) &&
	      GS.AccessingFunction->doesNotRecurse() &&
	      isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV,
	                                          LookupDomTree)) {
	    LLVM_DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
	    Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
	                                                   ->getEntryBlock().begin());
	    Type *ElemTy = GV->getValueType();
	    // FIXME: Pass Global's alignment when globals have alignment
	    AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(), nullptr,
	                                        GV->getName(), &FirstI);
	    if (!isa<UndefValue>(GV->getInitializer()))
	      new StoreInst(GV->getInitializer(), Alloca, &FirstI);

	    makeAllConstantUsesInstructions(GV);

	    GV->replaceAllUsesWith(Alloca);
	    GV->eraseFromParent();
	    ++NumLocalized;
	    return true;
	  }

	  // If the global is never loaded (but may be stored to), it is dead.
	  // Delete it now.
	  if (!GS.IsLoaded) {
	    LLVM_DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n");

	    bool Changed;
	    if (isLeakCheckerRoot(GV)) {
	      // Delete any constant stores to the global.
	      Changed = CleanupPointerRootUsers(GV, GetTLI);
	    } else {
	      // Delete any stores we can find to the global. We may not be able to
	      // make it completely dead though.
	      Changed =
	          CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
	    }

	    // If the global is dead now, delete it.
	    if (GV->use_empty()) {
	      GV->eraseFromParent();
	      ++NumDeleted;
	      Changed = true;
	    }
	    return Changed;

	  }
	  if (GS.StoredType <= GlobalStatus::InitializerStored) {
	    LLVM_DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");

	    // Don't actually mark a global constant if it's atomic because atomic loads
	    // are implemented by a trivial cmpxchg in some edge-cases and that usually
	    // requires write access to the variable even if it's not actually changed.
	    if (GS.Ordering == AtomicOrdering::NotAtomic)
	      GV->setConstant(true);

	    // Clean up any obviously simplifiable users now.
	    CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);

	    // If the global is dead now, just nuke it.
	    if (GV->use_empty()) {
	      LLVM_DEBUG(dbgs() << " *** Marking constant allowed us to simplify "
	                        << "all users and delete global!\n");
	      GV->eraseFromParent();
	      ++NumDeleted;
	      return true;
	    }

	    // Fall through to the next check; see if we can optimize further.
	    ++NumMarked;
	  }
	  if (!GV->getInitializer()->getType()->isSingleValueType()) {
	    if (SRAGlobal(GV, DL))
	      return true;
	  }
	  if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) {
	    // If the initial value for the global was an undef value, and if only
	    // one other value was stored into it, we can just change the
	    // initializer to be the stored value, then delete all stores to the
	    // global. This allows us to mark it constant.
	    if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
	      if (isa<UndefValue>(GV->getInitializer())) {
	        // Change the initial value here.
	        GV->setInitializer(SOVConstant);

	        // Clean up any obviously simplifiable users now.
	        CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);

	        if (GV->use_empty()) {
	          LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to "
	                            << "simplify all users and delete global!\n");
	          GV->eraseFromParent();
	          ++NumDeleted;
	        }
	        ++NumSubstitute;
	        return true;
	      }

	    // Try to optimize globals based on the knowledge that only one value
	    // (besides its initializer) is ever stored to the global.
	    if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL,
	                                 GetTLI))
	      return true;

	    // Otherwise, if the global was not a boolean, we can shrink it to be a
	    // boolean.
	    if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) {
	      if (GS.Ordering == AtomicOrdering::NotAtomic) {
	        if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
	          ++NumShrunkToBool;
	          return true;
	        }
	      }
	    }
	  }

	  return false;
	}

	/// Analyze the specified global value and optimize it if possible.
	///
	/// Values whose name starts with "llvm." are left alone, and nothing is done
	/// when GlobalStatus::analyzeGlobal returns true for the value. Otherwise the
	/// unnamed_addr marking is updated when the address is never compared, and
	/// local-linkage, non-constant global variables that have an initializer are
	/// handed to processInternalGlobal for the more involved transforms.
	///
	/// \returns true if any change was made.
	static bool
	processGlobal(GlobalValue &GV,
	              function_ref<TargetLibraryInfo &(Function &)> GetTLI,
	              function_ref<DominatorTree &(Function &)> LookupDomTree) {
	  if (GV.getName().startswith("llvm."))
	    return false;

	  GlobalStatus GS;

	  if (GlobalStatus::analyzeGlobal(&GV, GS))
	    return false;

	  bool Changed = false;
	  if (!GS.IsCompared && !GV.hasGlobalUnnamedAddr()) {
	    auto NewUnnamedAddr = GV.hasLocalLinkage() ? GlobalValue::UnnamedAddr::Global
	                                               : GlobalValue::UnnamedAddr::Local;
	    if (NewUnnamedAddr != GV.getUnnamedAddr()) {
	      GV.setUnnamedAddr(NewUnnamedAddr);
	      NumUnnamed++;
	      Changed = true;
	    }
	  }

	  // Do more involved optimizations if the global is internal.
	  if (!GV.hasLocalLinkage())
	    return Changed;

	  // Only global variables (not functions or aliases) go any further.
	  auto *GVar = dyn_cast<GlobalVariable>(&GV);
	  if (!GVar)
	    return Changed;

	  if (GVar->isConstant() || !GVar->hasInitializer())
	    return Changed;

	  return processInternalGlobal(GVar, GS, GetTLI, LookupDomTree) || Changed;
	}

	/// Switch every call of \p F over to the Fast calling convention.
	/// BlockAddress users are skipped; every other user is cast to CallBase.
	static void ChangeCalleesToFastCall(Function *F) {
	  for (User *FnUser : F->users())
	    if (!isa<BlockAddress>(FnUser))
	      cast<CallBase>(FnUser)->setCallingConv(CallingConv::Fast);
	}

	/// Return \p Attrs with attribute kind \p A removed at the index where
	/// hasAttrSomewhere reports it, or \p Attrs unchanged when it is not present.
	static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs,
	                               Attribute::AttrKind A) {
	  unsigned FoundAt;
	  if (!Attrs.hasAttrSomewhere(A, &FoundAt))
	    return Attrs;
	  return Attrs.removeAttribute(C, FoundAt, A);
	}

	/// Strip attribute kind \p A from \p F itself and from every call site that
	/// uses \p F. BlockAddress users are skipped; all others are cast to CallBase.
	static void RemoveAttribute(Function *F, Attribute::AttrKind A) {
	  LLVMContext &Ctx = F->getContext();
	  F->setAttributes(StripAttr(Ctx, F->getAttributes(), A));
	  for (User *FnUser : F->users()) {
	    if (isa<BlockAddress>(FnUser))
	      continue;
	    auto *Call = cast<CallBase>(FnUser);
	    Call->setAttributes(StripAttr(Ctx, Call->getAttributes(), A));
	  }
	}

	/// Return true if \p F uses a calling convention that we are willing to
	/// replace. Conventions the user asked for explicitly because of their
	/// performance implications (coldcc, GHC, anyregcc, ...) are left alone: only
	/// the C and X86_ThisCall conventions qualify.
	static bool hasChangeableCC(Function *F) {
	  const CallingConv::ID CC = F->getCallingConv();

	  // FIXME: Is it worth transforming x86_stdcallcc and x86_fastcallcc?
	  const bool IsCandidateCC =
	      CC == CallingConv::C || CC == CallingConv::X86_ThisCall;
	  if (!IsCandidateCC)
	    return false;

	  // FIXME: Change CC for the whole chain of musttail calls when possible.
	  //
	  // A function that is the callee of a musttail call cannot have its CC
	  // changed...
	  for (User *FnUser : F->users()) {
	    if (isa<BlockAddress>(FnUser))
	      continue;
	    if (auto *Call = dyn_cast<CallInst>(FnUser))
	      if (Call->isMustTailCall())
	        return false;
	  }

	  // ...and neither can a function that itself ends a block in a musttail call.
	  for (BasicBlock &BB : *F)
	    if (BB.getTerminatingMustTailCall())
	      return false;

	  return true;
	}

	/// Return true if the block holding call site \p CB has a block frequency
	/// below ColdCCRelFreq% of the frequency of the caller's entry block.
	static bool isColdCallSite(CallBase &CB, BlockFrequencyInfo &CallerBFI) {
	  const BranchProbability ColdProb(ColdCCRelFreq, 100);
	  auto SiteFreq = CallerBFI.getBlockFreq(CB.getParent());
	  auto EntryFreq = CallerBFI.getBlockFreq(&(CB.getCaller()->getEntryBlock()));
	  return SiteFreq < EntryFreq * ColdProb;
	}

	// Decide whether F may be switched to coldcc: F must have at least one user,
	// every call site of F must sit in a cold block, and the function containing
	// each call site must appear in AllCallsCold (the list of functions whose own
	// calls are all in cold blocks). BlockAddress users are ignored.
	static bool
	isValidCandidateForColdCC(Function &F,
	                          function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
	                          const std::vector<Function *> &AllCallsCold) {
	  if (F.user_empty())
	    return false;

	  for (User *FnUser : F.users()) {
	    if (isa<BlockAddress>(FnUser))
	      continue;

	    CallBase &Call = cast<CallBase>(*FnUser);
	    Function *Caller = Call.getParent()->getParent();
	    if (!isColdCallSite(Call, GetBFI(*Caller)))
	      return false;

	    const bool CallerOnlyHasColdCalls =
	        std::find(AllCallsCold.begin(), AllCallsCold.end(), Caller) !=
	        AllCallsCold.end();
	    if (!CallerOnlyHasColdCalls)
	      return false;
	  }
	  return true;
	}

	/// Switch every call of \p F over to the Cold calling convention.
	/// BlockAddress users are skipped; every other user is cast to CallBase.
	static void changeCallSitesToColdCC(Function *F) {
	  for (User *FnUser : F->users())
	    if (!isa<BlockAddress>(FnUser))
	      cast<CallBase>(FnUser)->setCallingConv(CallingConv::Cold);
	}

	// This function iterates over all the call instructions in the input Function
	// and checks that all call sites are in cold blocks and are allowed to use the
	// coldcc calling convention.
	//
	// Inline asm and intrinsic calls are ignored. Any other call makes the
	// function fail the check if it is indirect, if its callee lacks local
	// linkage, if its callee cannot take coldcc (unchangeable CC, varargs, or
	// address taken), or if the call site is not cold.
	static bool
	hasOnlyColdCalls(Function &F,
	                 function_ref<BlockFrequencyInfo &(Function &)> GetBFI) {
	  for (BasicBlock &BB : F) {
	    for (Instruction &I : BB) {
	      CallInst *CI = dyn_cast<CallInst>(&I);
	      if (!CI)
	        continue;
	      // Skip over inline asm instructions since they aren't function calls.
	      if (CI->isInlineAsm())
	        continue;
	      Function *CalledFn = CI->getCalledFunction();
	      if (!CalledFn)
	        return false;
	      // Skip over intrinsics since they won't remain as function calls.
	      // This has to happen before the linkage check below: an intrinsic
	      // callee is a declaration without local linkage, so with the checks in
	      // the other order we would bail out on every intrinsic call (debug
	      // intrinsics included) and the result would depend on the presence of
	      // debug info.
	      if (CalledFn->getIntrinsicID() != Intrinsic::not_intrinsic)
	        continue;
	      if (!CalledFn->hasLocalLinkage())
	        return false;
	      // Check if it's valid to use coldcc calling convention.
	      if (!hasChangeableCC(CalledFn) || CalledFn->isVarArg() ||
	          CalledFn->hasAddressTaken())
	        return false;
	      BlockFrequencyInfo &CallerBFI = GetBFI(F);
	      if (!isColdCallSite(*CI, CallerBFI))
	        return false;
	    }
	  }
	  return true;
	}

	/// Return true if any call site using \p F is a musttail call. Users that are
	/// not call sites are expected to be BlockAddress values (asserted).
	static bool hasMustTailCallers(Function *F) {
	  for (User *FnUser : F->users()) {
	    if (auto *Call = dyn_cast<CallBase>(FnUser)) {
	      if (Call->isMustTailCall())
	        return true;
	      continue;
	    }
	    assert(isa<BlockAddress>(FnUser) &&
	           "Expected either CallBase or BlockAddress");
	  }
	  return false;
	}

	/// Return true if at least one user of \p F is an invoke instruction.
	static bool hasInvokeCallers(Function *F) {
	  for (User *FnUser : F->users()) {
	    if (!isa<InvokeInst>(FnUser))
	      continue;
	    return true;
	  }
	  return false;
	}

	/// Remove the preallocated attribute from \p F and rewrite each of its call
	/// sites so that it no longer depends on the preallocated intrinsics.
	///
	/// For every call site the "preallocated" operand bundle is dropped (by
	/// re-creating the call), the call is bracketed with stacksave/stackrestore,
	/// each llvm.call.preallocated.arg use of the setup token is replaced by an
	/// alloca + bitcast placed right after the setup call, and the setup call is
	/// erased. Callers must make sure no call site is a musttail call (asserted).
	static void RemovePreallocated(Function *F) {
	  RemoveAttribute(F, Attribute::Preallocated);

	  auto *M = F->getParent();

	  IRBuilder<> Builder(M->getContext());

	  // Cannot modify users() while iterating over it, so make a copy.
	  SmallVector<User *, 4> PreallocatedCalls(F->users());
	  for (User *U : PreallocatedCalls) {
	    CallBase *CB = dyn_cast<CallBase>(U);
	    if (!CB)
	      continue;

	    assert(
	        !CB->isMustTailCall() &&
	        "Shouldn't call RemovePreallocated() on a musttail preallocated call");
	    // Create copy of call without "preallocated" operand bundle.
	    SmallVector<OperandBundleDef, 1> OpBundles;
	    CB->getOperandBundlesAsDefs(OpBundles);
	    CallBase *PreallocatedSetup = nullptr;
	    for (auto *It = OpBundles.begin(); It != OpBundles.end(); ++It) {
	      if (It->getTag() == "preallocated") {
	        // The bundle's first input is the setup call that produced the token.
	        PreallocatedSetup = cast<CallBase>(*It->input_begin());
	        OpBundles.erase(It);
	        break;
	      }
	    }
	    assert(PreallocatedSetup && "Did not find preallocated bundle");
	    uint64_t ArgCount =
	        cast<ConstantInt>(PreallocatedSetup->getArgOperand(0))->getZExtValue();

	    assert((isa<CallInst>(CB) || isa<InvokeInst>(CB)) &&
	           "Unknown indirect call type");
	    CallBase *NewCB = CallBase::Create(CB, OpBundles, CB);
	    CB->replaceAllUsesWith(NewCB);
	    NewCB->takeName(CB);
	    CB->eraseFromParent();

	    // Save the stack pointer before the setup call...
	    Builder.SetInsertPoint(PreallocatedSetup);
	    auto *StackSave =
	        Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stacksave));

	    // ...and restore it right after the rewritten call.
	    Builder.SetInsertPoint(NewCB->getNextNonDebugInstruction());
	    Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackrestore),
	                       StackSave);

	    // Replace @llvm.call.preallocated.arg() with alloca.
	    // Cannot modify users() while iterating over it, so make a copy.
	    // @llvm.call.preallocated.arg() can be called with the same index multiple
	    // times. So for each @llvm.call.preallocated.arg(), we see if we have
	    // already created a Value* for the index, and if not, create an alloca and
	    // bitcast right after the @llvm.call.preallocated.setup() so that it
	    // dominates all uses.
	    SmallVector<Value *, 2> ArgAllocas(ArgCount);
	    SmallVector<User *, 2> PreallocatedArgs(PreallocatedSetup->users());
	    for (auto *User : PreallocatedArgs) {
	      auto *UseCall = cast<CallBase>(User);
	      assert(UseCall->getCalledFunction()->getIntrinsicID() ==
	                 Intrinsic::call_preallocated_arg &&
	             "preallocated token use was not a llvm.call.preallocated.arg");
	      uint64_t AllocArgIndex =
	          cast<ConstantInt>(UseCall->getArgOperand(1))->getZExtValue();
	      Value *AllocaReplacement = ArgAllocas[AllocArgIndex];
	      if (!AllocaReplacement) {
	        auto AddressSpace = UseCall->getType()->getPointerAddressSpace();
	        auto *ArgType = UseCall
	                            ->getAttribute(AttributeList::FunctionIndex,
	                                           Attribute::Preallocated)
	                            .getValueAsType();
	        auto *InsertBefore = PreallocatedSetup->getNextNonDebugInstruction();
	        Builder.SetInsertPoint(InsertBefore);
	        auto *Alloca =
	            Builder.CreateAlloca(ArgType, AddressSpace, nullptr, "paarg");
	        auto *BitCast = Builder.CreateBitCast(
	            Alloca, Type::getInt8PtrTy(M->getContext()), UseCall->getName());
	        ArgAllocas[AllocArgIndex] = BitCast;
	        AllocaReplacement = BitCast;
	      }

	      UseCall->replaceAllUsesWith(AllocaReplacement);
	      UseCall->eraseFromParent();
	    }
	    // Remove @llvm.call.preallocated.setup().
	    PreallocatedSetup->eraseFromParent();
	  }
	}

	/// Run the function-level transforms over every function in \p M: delete dead
	/// functions, remove unreachable blocks, run processGlobal, strip inalloca /
	/// preallocated / nest attributes where the visible checks allow it, and
	/// switch suitable internal functions to the coldcc or fastcc calling
	/// convention.
	///
	/// \returns true if the module was changed.
	static bool
	OptimizeFunctions(Module &M,
	                  function_ref<TargetLibraryInfo &(Function &)> GetTLI,
	                  function_ref<TargetTransformInfo &(Function &)> GetTTI,
	                  function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
	                  function_ref<DominatorTree &(Function &)> LookupDomTree,
	                  SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {

	  bool Changed = false;

	  // Collect the functions whose own call sites are all cold. Nothing is
	  // erased in this pass over the module, so a plain range-for is fine.
	  std::vector<Function *> AllCallsCold;
	  for (Function &F : M)
	    if (hasOnlyColdCalls(F, GetBFI))
	      AllCallsCold.push_back(&F);

	  // Optimize functions.
	  for (Module::iterator FI = M.begin(), E = M.end(); FI != E;) {
	    // Advance before touching F: deleteIfDead below may erase it.
	    Function *F = &*FI++;

	    // Don't perform global opt pass on naked functions; we don't want fast
	    // calling conventions for naked functions.
	    if (F->hasFnAttribute(Attribute::Naked))
	      continue;

	    // Functions without names cannot be referenced outside this module.
	    if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage())
	      F->setLinkage(GlobalValue::InternalLinkage);

	    if (deleteIfDead(*F, NotDiscardableComdats)) {
	      Changed = true;
	      continue;
	    }

	    // LLVM's definition of dominance allows instructions that are cyclic
	    // in unreachable blocks, e.g.:
	    // %pat = select i1 %condition, @global, i16* %pat
	    // because any instruction dominates an instruction in a block that's
	    // not reachable from entry.
	    // So, remove unreachable blocks from the function, because a) there's
	    // no point in analyzing them and b) GlobalOpt should otherwise grow
	    // some more complicated logic to break these cycles.
	    // Removing unreachable blocks might invalidate the dominator so we
	    // recalculate it.
	    if (!F->isDeclaration()) {
	      if (removeUnreachableBlocks(*F)) {
	        auto &DT = LookupDomTree(*F);
	        DT.recalculate(*F);
	        Changed = true;
	      }
	    }

	    Changed |= processGlobal(*F, GetTLI, LookupDomTree);

	    if (!F->hasLocalLinkage())
	      continue;

	    // If we have an inalloca parameter that we can safely remove the
	    // inalloca attribute from, do so. This unlocks optimizations that
	    // wouldn't be safe in the presence of inalloca.
	    // FIXME: We should also hoist alloca affected by this to the entry
	    // block if possible.
	    if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) &&
	        !F->hasAddressTaken() && !hasMustTailCallers(F)) {
	      RemoveAttribute(F, Attribute::InAlloca);
	      Changed = true;
	    }

	    // FIXME: handle invokes
	    // FIXME: handle musttail
	    if (F->getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
	      if (!F->hasAddressTaken() && !hasMustTailCallers(F) &&
	          !hasInvokeCallers(F)) {
	        RemovePreallocated(F);
	        Changed = true;
	      }
	      continue;
	    }

	    if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) {
	      NumInternalFunc++;
	      TargetTransformInfo &TTI = GetTTI(*F);
	      // Change the calling convention to coldcc if either stress testing is
	      // enabled or the target would like to use coldcc on functions which are
	      // cold at all call sites and the callers contain no other non coldcc
	      // calls.
	      if (EnableColdCCStressTest ||
	          (TTI.useColdCCForColdCall(*F) &&
	           isValidCandidateForColdCC(*F, GetBFI, AllCallsCold))) {
	        F->setCallingConv(CallingConv::Cold);
	        changeCallSitesToColdCC(F);
	        Changed = true;
	        NumColdCC++;
	      }
	    }

	    // The condition is deliberately evaluated again: if F was just switched
	    // to coldcc, hasChangeableCC no longer accepts it and fastcc is skipped.
	    if (hasChangeableCC(F) && !F->isVarArg() &&
	        !F->hasAddressTaken()) {
	      // If this function has a calling convention worth changing, is not a
	      // varargs function, and is only called directly, promote it to use the
	      // Fast calling convention.
	      F->setCallingConv(CallingConv::Fast);
	      ChangeCalleesToFastCall(F);
	      ++NumFastCallFns;
	      Changed = true;
	    }

	    if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) &&
	        !F->hasAddressTaken()) {
	      // The function is not used by a trampoline intrinsic, so it is safe
	      // to remove the 'nest' attribute.
	      RemoveAttribute(F, Attribute::Nest);
	      ++NumNestRemoved;
	      Changed = true;
	    }
	  }
	  return Changed;
	}

	/// Run the global-variable transforms over every global variable in \p M:
	/// internalize unnamed definitions, constant-fold initializers, delete dead
	/// globals, and run processGlobal on the rest.
	///
	/// \returns true if a global was deleted or processGlobal reported a change.
	static bool
	OptimizeGlobalVars(Module &M,
	                   function_ref<TargetLibraryInfo &(Function &)> GetTLI,
	                   function_ref<DominatorTree &(Function &)> LookupDomTree,
	                   SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
	  bool Changed = false;

	  for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
	       GVI != E;) {
	    // Advance before touching GV: deleteIfDead below may erase it.
	    GlobalVariable *GV = &*GVI++;
	    // Global variables without names cannot be referenced outside this module.
	    if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage())
	      GV->setLinkage(GlobalValue::InternalLinkage);
	    // Simplify the initializer.
	    if (GV->hasInitializer())
	      if (auto *C = dyn_cast<Constant>(GV->getInitializer())) {
	        auto &DL = M.getDataLayout();
	        // TLI is not used in the case of a Constant, so use default nullptr
	        // for that optional parameter, since we don't have a Function to
	        // provide GetTLI anyway.
	        Constant *New = ConstantFoldConstant(C, DL, /*TLI*/ nullptr);
	        if (New != C)
	          GV->setInitializer(New);
	      }

	    if (deleteIfDead(*GV, NotDiscardableComdats)) {
	      Changed = true;
	      continue;
	    }

	    Changed |= processGlobal(*GV, GetTLI, LookupDomTree);
	  }
	  return Changed;
	}

	/// Evaluate a piece of a constantexpr store into a global initializer. This
	/// returns 'Init' modified to reflect 'Val' stored into it. At this point, the
	/// GEP operands of Addr [0, OpNo) have been stepped into.
	///
	/// The function recurses one aggregate level per GEP operand: it splits Init
	/// into its elements, rebuilds the element selected by operand OpNo, and
	/// reassembles a struct, array or vector constant of the same type.
	static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
	                                   ConstantExpr *Addr, unsigned OpNo) {
	  // Base case of the recursion.
	  if (OpNo == Addr->getNumOperands()) {
	    assert(Val->getType() == Init->getType() && "Type mismatch!");
	    return Val;
	  }

	  SmallVector<Constant *, 32> Elts;
	  if (StructType *STy = dyn_cast<StructType>(Init->getType())) {
	    // Break up the constant into its elements.
	    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
	      Elts.push_back(Init->getAggregateElement(i));

	    // Replace the element that we are supposed to.
	    ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo));
	    unsigned Idx = CU->getZExtValue();
	    assert(Idx < STy->getNumElements() && "Struct index out of range!");
	    Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo + 1);

	    // Return the modified struct.
	    return ConstantStruct::get(STy, Elts);
	  }

	  // Not a struct: Init is either an array or a fixed vector.
	  ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
	  uint64_t NumElts;
	  if (ArrayType *ATy = dyn_cast<ArrayType>(Init->getType()))
	    NumElts = ATy->getNumElements();
	  else
	    NumElts = cast<FixedVectorType>(Init->getType())->getNumElements();

	  // Break up the array into elements.
	  for (uint64_t i = 0, e = NumElts; i != e; ++i)
	    Elts.push_back(Init->getAggregateElement(i));

	  assert(CI->getZExtValue() < NumElts);
	  Elts[CI->getZExtValue()] =
	      EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo + 1);

	  if (Init->getType()->isArrayTy())
	    return ConstantArray::get(cast<ArrayType>(Init->getType()), Elts);
	  return ConstantVector::get(Elts);
	}

	/// We have decided that Addr (which satisfies the predicate
	/// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen.
	static void CommitValueTo(Constant Val, Constant Addr) {
	if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
	assert(GV->hasInitializer());
	GV->setInitializer(Val);
	return;
	}

	ConstantExpr *CE = cast<ConstantExpr>(Addr);
	GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
	GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2));
	}

	/// Given a map of address -> value, where addresses are expected to be some form
	/// of either a global or a constant GEP, set the initializer for the address to
	/// be the value. This performs mostly the same function as CommitValueTo()
	/// and EvaluateStoreInto() but is optimized to be more efficient for the common
	/// case where the set of addresses are GEPs sharing the same underlying global,
	/// processing the GEPs in batches rather than individually.
	///
	/// To give an example, consider the following C++ code adapted from the clang
	/// regression tests:
	///  struct S {
	///   int n = 10;
	///   int m = 2 * n;
	///   S(int a) : n(a) {}
	///  };
	///
	///  template<typename T>
	///  struct U {
	///   T *r = &q;
	///   T q = 42;
	///   U *p = this;
	///  };
	///
	///  U<S> e;
	///
	/// The global static constructor for 'e' will need to initialize 'r' and 'p' of
	/// the outer struct, while also initializing the inner 'q' structs 'n' and 'm'
	/// members. This batch algorithm will simply use general CommitValueTo() method
	/// to handle the complex nested S struct initialization of 'q', before
	/// processing the outermost members in a single batch. Using CommitValueTo() to
	/// handle member in the outer struct is inefficient when the struct/array is
	/// very large as we end up creating and destroy constant arrays for each
	/// initialization.
	/// For the above case, we expect the following IR to be generated:
	///
	/// %struct.U = type { %struct.S*, %struct.S, %struct.U* }
	/// %struct.S = type { i32, i32 }
	/// @e = global %struct.U { %struct.S* gep inbounds (%struct.U, %struct.U* @e,
	///                                                  i64 0, i32 1),
	///                         %struct.S { i32 42, i32 84 }, %struct.U* @e }
	/// The %struct.S { i32 42, i32 84 } inner initializer is treated as a complex
	/// constant expression, while the other two elements of @e are "simple".
	static void BatchCommitValueTo(const DenseMap<Constant *, Constant *> &Mem) {
	  SmallVector<std::pair<GlobalVariable *, Constant *>, 32> GVs;
	  SmallVector<std::pair<ConstantExpr *, Constant *>, 32> ComplexCEs;
	  SmallVector<std::pair<ConstantExpr *, Constant *>, 32> SimpleCEs;
	  SimpleCEs.reserve(Mem.size());

	  // Sort the stores into three buckets: whole globals, GEPs with more than
	  // three operands ("complex"), and the remaining GEPs ("simple").
	  for (const auto &I : Mem) {
	    if (auto *GV = dyn_cast<GlobalVariable>(I.first)) {
	      GVs.push_back(std::make_pair(GV, I.second));
	    } else {
	      ConstantExpr *GEP = cast<ConstantExpr>(I.first);
	      // We don't handle the deeply recursive case using the batch method.
	      if (GEP->getNumOperands() > 3)
	        ComplexCEs.push_back(std::make_pair(GEP, I.second));
	      else
	        SimpleCEs.push_back(std::make_pair(GEP, I.second));
	    }
	  }

	  // The algorithm below doesn't handle cases like nested structs, so use the
	  // slower fully general method if we have to.
	  for (const auto &ComplexCE : ComplexCEs)
	    CommitValueTo(ComplexCE.second, ComplexCE.first);

	  for (const auto &GVPair : GVs) {
	    assert(GVPair.first->hasInitializer());
	    GVPair.first->setInitializer(GVPair.second);
	  }

	  if (SimpleCEs.empty())
	    return;

	  // We cache a single global's initializer elements in the case where the
	  // subsequent address/val pair uses the same one. This avoids throwing away and
	  // rebuilding the constant struct/vector/array just because one element is
	  // modified at a time.
	  SmallVector<Constant *, 32> Elts;
	  Elts.reserve(SimpleCEs.size());
	  GlobalVariable *CurrentGV = nullptr;

	  // When Update is set: write the cached elements back to CurrentGV (if there
	  // is one), then, unless GV is that same global, reload the cache from GV.
	  auto commitAndSetupCache = [&](GlobalVariable *GV, bool Update) {
	    Constant *Init = GV->getInitializer();
	    Type *Ty = Init->getType();
	    if (Update) {
	      if (CurrentGV) {
	        Type *CurrentInitTy = CurrentGV->getInitializer()->getType();
	        // We have a valid cache that needs to be committed.
	        if (StructType *STy = dyn_cast<StructType>(CurrentInitTy))
	          CurrentGV->setInitializer(ConstantStruct::get(STy, Elts));
	        else if (ArrayType *ArrTy = dyn_cast<ArrayType>(CurrentInitTy))
	          CurrentGV->setInitializer(ConstantArray::get(ArrTy, Elts));
	        else
	          CurrentGV->setInitializer(ConstantVector::get(Elts));
	      }
	      if (CurrentGV == GV)
	        return;
	      // Need to clear and set up cache for new initializer.
	      CurrentGV = GV;
	      Elts.clear();
	      unsigned NumElts;
	      if (auto *STy = dyn_cast<StructType>(Ty))
	        NumElts = STy->getNumElements();
	      else if (auto *ATy = dyn_cast<ArrayType>(Ty))
	        NumElts = ATy->getNumElements();
	      else
	        NumElts = cast<FixedVectorType>(Ty)->getNumElements();
	      for (unsigned i = 0, e = NumElts; i != e; ++i)
	        Elts.push_back(Init->getAggregateElement(i));
	    }
	  };

	  for (const auto &CEPair : SimpleCEs) {
	    ConstantExpr *GEP = CEPair.first;
	    Constant *Val = CEPair.second;

	    GlobalVariable *GV = cast<GlobalVariable>(GEP->getOperand(0));
	    commitAndSetupCache(GV, GV != CurrentGV);
	    // Operand 2 of the GEP selects the element being overwritten.
	    ConstantInt *CI = cast<ConstantInt>(GEP->getOperand(2));
	    Elts[CI->getZExtValue()] = Val;
	  }
	  // The last initializer in the list needs to be committed, others
	  // will be committed on a new initializer being processed.
	  commitAndSetupCache(CurrentGV, true);
	}

	/// Try to evaluate the static constructor \p F at compile time. On success the
	/// memory mutated by the evaluation is committed to the global initializers
	/// and every global the evaluator reports as invariant is marked constant.
	///
	/// \returns true if the evaluation succeeded, false otherwise.
	static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
	                                      TargetLibraryInfo *TLI) {
	  // Call the function.
	  Evaluator Eval(DL, TLI);
	  Constant *RetValDummy;
	  if (!Eval.EvaluateFunction(F, RetValDummy, SmallVector<Constant *, 0>()))
	    return false;

	  ++NumCtorsEvaluated;

	  // We succeeded at evaluation: commit the result.
	  LLVM_DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
	                    << F->getName() << "' to "
	                    << Eval.getMutatedMemory().size() << " stores.\n");
	  BatchCommitValueTo(Eval.getMutatedMemory());
	  for (GlobalVariable *Invariant : Eval.getInvariants())
	    Invariant->setConstant(true);

	  return true;
	}

	static int compareNames(Constant const A, Constant const B) {
	Value AStripped = (A)->stripPointerCasts();
	Value BStripped = (B)->stripPointerCasts();
	return AStripped->getName().compare(BStripped->getName());
	}

	/// Replace the variable \p V with a fresh appending-linkage array (in section
	/// "llvm.metadata") that lists the members of \p Init as i8* casts, sorted by
	/// name. \p V is destroyed in all cases; when \p Init is empty it is simply
	/// erased and no replacement is created.
	static void setUsedInitializer(GlobalVariable &V,
	                               const SmallPtrSetImpl<GlobalValue *> &Init) {
	  if (Init.empty()) {
	    V.eraseFromParent();
	    return;
	  }

	  // Element type of the array: i8* in address space 0.
	  PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext(), 0);

	  SmallVector<Constant *, 8> Entries;
	  for (GlobalValue *Member : Init)
	    Entries.push_back(
	        ConstantExpr::getPointerBitCastOrAddrSpaceCast(Member, Int8PtrTy));

	  // Sort to get deterministic order.
	  array_pod_sort(Entries.begin(), Entries.end(), compareNames);
	  ArrayType *ATy = ArrayType::get(Int8PtrTy, Entries.size());

	  // Unlink the old variable first so the replacement can take over its name.
	  Module *M = V.getParent();
	  V.removeFromParent();
	  GlobalVariable *NV =
	      new GlobalVariable(*M, ATy, false, GlobalValue::AppendingLinkage,
	                         ConstantArray::get(ATy, Entries), "");
	  NV->takeName(&V);
	  NV->setSection("llvm.metadata");
	  delete &V;
	}

	namespace {

	/// Set-based view of the module's llvm.used and llvm.compiler.used lists.
	/// The sets are filled by collectUsedGlobalVariables at construction, can be
	/// queried and edited freely, and are written back to the module by
	/// syncVariablesAndSets().
	class LLVMUsed {
	  SmallPtrSet<GlobalValue *, 8> Used;
	  SmallPtrSet<GlobalValue *, 8> CompilerUsed;
	  GlobalVariable *UsedV;
	  GlobalVariable *CompilerUsedV;

	public:
	  // The sets are declared before the variable pointers, so they are already
	  // constructed when the initializers below fill them.
	  LLVMUsed(Module &M)
	      : UsedV(collectUsedGlobalVariables(M, Used, false)),
	        CompilerUsedV(collectUsedGlobalVariables(M, CompilerUsed, true)) {}

	  using iterator = SmallPtrSet<GlobalValue *, 8>::iterator;
	  using used_iterator_range = iterator_range<iterator>;

	  // Iteration over the llvm.used set.
	  iterator usedBegin() { return Used.begin(); }
	  iterator usedEnd() { return Used.end(); }
	  used_iterator_range used() {
	    return used_iterator_range(usedBegin(), usedEnd());
	  }

	  // Iteration over the llvm.compiler.used set.
	  iterator compilerUsedBegin() { return CompilerUsed.begin(); }
	  iterator compilerUsedEnd() { return CompilerUsed.end(); }
	  used_iterator_range compilerUsed() {
	    return used_iterator_range(compilerUsedBegin(), compilerUsedEnd());
	  }

	  // Membership tests.
	  bool usedCount(GlobalValue *GV) const { return Used.count(GV); }
	  bool compilerUsedCount(GlobalValue *GV) const {
	    return CompilerUsed.count(GV);
	  }

	  // Removal; the result is whatever the set's erase() returns.
	  bool usedErase(GlobalValue *GV) { return Used.erase(GV); }
	  bool compilerUsedErase(GlobalValue *GV) { return CompilerUsed.erase(GV); }

	  // Insertion; true when the value was newly inserted.
	  bool usedInsert(GlobalValue *GV) { return Used.insert(GV).second; }
	  bool compilerUsedInsert(GlobalValue *GV) {
	    return CompilerUsed.insert(GV).second;
	  }

	  /// Rewrite each list variable that was found at construction so that it
	  /// matches the current contents of its set.
	  void syncVariablesAndSets() {
	    if (UsedV)
	      setUsedInitializer(*UsedV, Used);
	    if (CompilerUsedV)
	      setUsedInitializer(*CompilerUsedV, CompilerUsed);
	  }
	};

	} // end anonymous namespace

	/// Return true if \p GA has at least one use that is not its entry in
	/// llvm.used or llvm.compiler.used. \p GA is expected to be in at most one of
	/// the two sets (asserted).
	static bool hasUseOtherThanLLVMUsed(GlobalAlias &GA, const LLVMUsed &U) {
	  if (GA.use_empty()) // No use at all.
	    return false;

	  assert((!U.usedCount(&GA) || !U.compilerUsedCount(&GA)) &&
	         "We should have removed the duplicated "
	         "element from llvm.compiler.used");
	  if (!GA.hasOneUse())
	    // Strictly more than one use. So at least one is not in llvm.used and
	    // llvm.compiler.used.
	    return true;

	  // Exactly one use. Check if it is in llvm.used or llvm.compiler.used.
	  return !U.usedCount(&GA) && !U.compilerUsedCount(&GA);
	}

	/// Return true if \p V has two or more uses once a possible entry in
	/// llvm.used or llvm.compiler.used is discounted. \p V is expected to be in at
	/// most one of the two sets (asserted).
	static bool hasMoreThanOneUseOtherThanLLVMUsed(GlobalValue &V,
	                                               const LLVMUsed &U) {
	  unsigned N = 2;
	  assert((!U.usedCount(&V) || !U.compilerUsedCount(&V)) &&
	         "We should have removed the duplicated "
	         "element from llvm.compiler.used");
	  // Membership in one of the lists accounts for one extra use.
	  if (U.usedCount(&V) || U.compilerUsedCount(&V))
	    ++N;
	  return V.hasNUsesOrMore(N);
	}

	/// Return true if \p GA does not have local linkage, or if it is listed in
	/// llvm.used or llvm.compiler.used.
	static bool mayHaveOtherReferences(GlobalAlias &GA, const LLVMUsed &U) {
	  if (!GA.hasLocalLinkage())
	    return true;

	  return U.usedCount(&GA) || U.compilerUsedCount(&GA);
	}

	/// Decide whether the uses of alias \p GA should be replaced by its aliasee.
	///
	/// \p RenameTarget is always written: it is set to true only when the aliasee
	/// should additionally take over the alias's name and linkage, in which case
	/// the function returns true regardless of the alias's uses.
	static bool hasUsesToReplace(GlobalAlias &GA, const LLVMUsed &U,
	                             bool &RenameTarget) {
	  RenameTarget = false;
	  const bool HasRealUses = hasUseOtherThanLLVMUsed(GA, U);

	  // If the alias is externally visible, we may still be able to simplify it.
	  if (!mayHaveOtherReferences(GA, U))
	    return HasRealUses;

	  // If the aliasee has internal linkage, give it the name and linkage
	  // of the alias, and delete the alias. This turns:
	  //   define internal ... @f(...)
	  //   @a = alias ... @f
	  // into:
	  //   define ... @a(...)
	  GlobalValue *Target =
	      cast<GlobalValue>(GA.getAliasee()->stripPointerCasts());
	  if (!Target->hasLocalLinkage())
	    return HasRealUses;

	  // Do not perform the transform if multiple aliases potentially target the
	  // aliasee. This check also ensures that it is safe to replace the section
	  // and other attributes of the aliasee with those of the alias.
	  if (hasMoreThanOneUseOtherThanLLVMUsed(*Target, U))
	    return HasRealUses;

	  RenameTarget = true;
	  return true;
	}

	/// Resolve aliases in \p M where possible: delete dead aliases, redirect the
	/// users of an alias to its aliasee, optionally let the aliasee take over the
	/// alias's name and linkage, and erase aliases that are no longer needed. The
	/// llvm.used / llvm.compiler.used lists are kept in sync with the renames.
	///
	/// \returns true if the module was changed.
	static bool
	OptimizeGlobalAliases(Module &M,
	                      SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
	  bool Changed = false;
	  LLVMUsed Used(M);

	  // Anything in llvm.used does not need to be in llvm.compiler.used as well.
	  for (GlobalValue *GV : Used.used())
	    Used.compilerUsedErase(GV);

	  for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
	       I != E;) {
	    // Advance before touching J: the alias may be erased below.
	    GlobalAlias *J = &*I++;

	    // Aliases without names cannot be referenced outside this module.
	    if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage())
	      J->setLinkage(GlobalValue::InternalLinkage);

	    if (deleteIfDead(*J, NotDiscardableComdats)) {
	      Changed = true;
	      continue;
	    }

	    // If the alias can change at link time, nothing can be done - bail out.
	    if (J->isInterposable())
	      continue;

	    Constant *Aliasee = J->getAliasee();
	    GlobalValue *Target = dyn_cast<GlobalValue>(Aliasee->stripPointerCasts());
	    // We can't trivially replace the alias with the aliasee if the aliasee is
	    // non-trivial in some way.
	    // TODO: Try to handle non-zero GEPs of local aliasees.
	    if (!Target)
	      continue;
	    Target->removeDeadConstantUsers();

	    // Make all users of the alias use the aliasee instead.
	    bool RenameTarget;
	    if (!hasUsesToReplace(*J, Used, RenameTarget))
	      continue;

	    J->replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J->getType()));
	    ++NumAliasesResolved;
	    Changed = true;

	    if (RenameTarget) {
	      // Give the aliasee the name, linkage and other attributes of the alias.
	      Target->takeName(&*J);
	      Target->setLinkage(J->getLinkage());
	      Target->setDSOLocal(J->isDSOLocal());
	      Target->setVisibility(J->getVisibility());
	      Target->setDLLStorageClass(J->getDLLStorageClass());

	      if (Used.usedErase(&*J))
	        Used.usedInsert(Target);

	      if (Used.compilerUsedErase(&*J))
	        Used.compilerUsedInsert(Target);
	    } else if (mayHaveOtherReferences(*J, Used))
	      continue;

	    // Delete the alias.
	    M.getAliasList().erase(J);
	    ++NumAliasesRemoved;
	    Changed = true;
	  }

	  Used.syncVariablesAndSets();

	  return Changed;
	}

	/// Look up the __cxa_atexit library function in \p M.
	///
	/// \returns the function when the target library info knows the libfunc, the
	/// module contains a function of that name, and getLibFunc recognizes that
	/// function as LibFunc_cxa_atexit; nullptr otherwise (including for a module
	/// without any function).
	static Function *
	FindCXAAtExit(Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
	  // Hack to get a default TLI before we have actual Function.
	  auto FuncIter = M.begin();
	  if (FuncIter == M.end())
	    return nullptr;
	  auto *TLI = &GetTLI(*FuncIter);

	  LibFunc F = LibFunc_cxa_atexit;
	  if (!TLI->has(F))
	    return nullptr;

	  Function *Fn = M.getFunction(TLI->getName(F));
	  if (!Fn)
	    return nullptr;

	  // Now get the actual TLI for Fn.
	  TLI = &GetTLI(*Fn);

	  // Make sure that the function has the correct prototype.
	  if (!TLI->getLibFunc(*Fn, F) || F != LibFunc_cxa_atexit)
	    return nullptr;

	  return Fn;
	}

	/// Returns whether the given function is an empty C++ destructor and can
	/// therefore be eliminated.
	/// Other optimization passes are assumed to have simplified the code already,
	/// so the check is just: the first instruction of the entry block that is not
	/// a debug intrinsic must be a 'ret'.
	static bool cxxDtorIsEmpty(const Function &Fn) {
	  // FIXME: We could eliminate C++ destructors if they're readonly/readnone and
	  // nounwind, but that doesn't seem worth doing.
	  if (Fn.isDeclaration())
	    return false;

	  for (auto &Inst : Fn.getEntryBlock()) {
	    if (isa<DbgInfoIntrinsic>(Inst))
	      continue;
	    // The first real instruction decides the answer.
	    return isa<ReturnInst>(Inst);
	  }
	  return false;
	}

	/// Remove calls to \p CXAAtExitFn (__cxa_atexit) whose registered destructor
	/// is empty according to cxxDtorIsEmpty. Each removed call is replaced by a
	/// null value of its type, i.e. the "registration succeeded" result.
	///
	/// \returns true if at least one call was removed.
	static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
	  /// Itanium C++ ABI p3.3.5:
	  ///
	  ///   After constructing a global (or local static) object, that will require
	  ///   destruction on exit, a termination function is registered as follows:
	  ///
	  ///   extern "C" int __cxa_atexit ( void (*f)(void *), void *p, void *d );
	  ///
	  ///   This registration, e.g. __cxa_atexit(f,p,d), is intended to cause the
	  ///   call f(p) when DSO d is unloaded, before all such termination calls
	  ///   registered before this one. It returns zero if registration is
	  ///   successful, nonzero on failure.

	  // This pass will look for calls to __cxa_atexit where the function is trivial
	  // and remove them.
	  bool Changed = false;

	  for (auto I = CXAAtExitFn->user_begin(), E = CXAAtExitFn->user_end();
	       I != E;) {
	    // We're only interested in calls. Theoretically, we could handle invoke
	    // instructions as well, but neither llvm-gcc nor clang generate invokes
	    // to __cxa_atexit.
	    // The iterator is advanced here because the call may be erased below.
	    CallInst *CI = dyn_cast<CallInst>(*I++);
	    if (!CI)
	      continue;

	    Function *DtorFn =
	        dyn_cast<Function>(CI->getArgOperand(0)->stripPointerCasts());
	    if (!DtorFn || !cxxDtorIsEmpty(*DtorFn))
	      continue;

	    // Just remove the call.
	    CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
	    CI->eraseFromParent();

	    ++NumCXXDtorsRemoved;

	    Changed = true;
	  }

	  return Changed;
	}

	/// Driver for the whole optimization: repeatedly runs the function, global
	/// constructor, global variable, alias and empty-destructor transforms over
	/// \p M until one full round makes no change.
	///
	/// \returns true if any round changed the module.
	static bool optimizeGlobalsInModule(
	    Module &M, const DataLayout &DL,
	    function_ref<TargetLibraryInfo &(Function &)> GetTLI,
	    function_ref<TargetTransformInfo &(Function &)> GetTTI,
	    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
	    function_ref<DominatorTree &(Function &)> LookupDomTree) {
	  SmallPtrSet<const Comdat *, 8> NotDiscardableComdats;
	  bool Changed = false;
	  bool LocalChange = true;
	  while (LocalChange) {
	    LocalChange = false;

	    // Recompute, for this round, the comdats that still have a member which
	    // is in use or cannot be discarded.
	    NotDiscardableComdats.clear();
	    for (const GlobalVariable &GV : M.globals())
	      if (const Comdat *C = GV.getComdat())
	        if (!GV.isDiscardableIfUnused() || !GV.use_empty())
	          NotDiscardableComdats.insert(C);
	    for (Function &F : M)
	      if (const Comdat *C = F.getComdat())
	        if (!F.isDefTriviallyDead())
	          NotDiscardableComdats.insert(C);
	    for (GlobalAlias &GA : M.aliases())
	      if (const Comdat *C = GA.getComdat())
	        if (!GA.isDiscardableIfUnused() || !GA.use_empty())
	          NotDiscardableComdats.insert(C);

	    // Delete functions that are trivially dead, ccc -> fastcc
	    LocalChange |= OptimizeFunctions(M, GetTLI, GetTTI, GetBFI, LookupDomTree,
	                                     NotDiscardableComdats);

	    // Optimize global_ctors list.
	    LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) {
	      return EvaluateStaticConstructor(F, DL, &GetTLI(*F));
	    });

	    // Optimize non-address-taken globals.
	    LocalChange |=
	        OptimizeGlobalVars(M, GetTLI, LookupDomTree, NotDiscardableComdats);

	    // Resolve aliases, when possible.
	    LocalChange |= OptimizeGlobalAliases(M, NotDiscardableComdats);

	    // Try to remove trivial global destructors if they are not removed
	    // already.
	    Function *CXAAtExitFn = FindCXAAtExit(M, GetTLI);
	    if (CXAAtExitFn)
	      LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn);

	    Changed |= LocalChange;
	  }

	  // TODO: Move all global ctors functions to the end of the module for code
	  // layout.

	  return Changed;
	}

	/// New-pass-manager entry point for GlobalOpt.
	///
	/// Builds per-function analysis accessors on top of the function analysis
	/// manager obtained through the module proxy and hands them to
	/// optimizeGlobalsInModule(). All analyses are reported as preserved when
	/// nothing changed, none otherwise.
	PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) {
	  auto &FAM =
	      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

	  auto TLIFor = [&FAM](Function &F) -> TargetLibraryInfo & {
	    return FAM.getResult<TargetLibraryAnalysis>(F);
	  };
	  auto TTIFor = [&FAM](Function &F) -> TargetTransformInfo & {
	    return FAM.getResult<TargetIRAnalysis>(F);
	  };
	  auto BFIFor = [&FAM](Function &F) -> BlockFrequencyInfo & {
	    return FAM.getResult<BlockFrequencyAnalysis>(F);
	  };
	  auto DomTreeFor = [&FAM](Function &F) -> DominatorTree & {
	    return FAM.getResult<DominatorTreeAnalysis>(F);
	  };

	  const bool ModuleChanged = optimizeGlobalsInModule(
	      M, M.getDataLayout(), TLIFor, TTIFor, BFIFor, DomTreeFor);
	  return ModuleChanged ? PreservedAnalyses::none() : PreservedAnalyses::all();
	}

	namespace {

	/// Legacy-pass-manager wrapper around optimizeGlobalsInModule().
	struct GlobalOptLegacyPass : public ModulePass {
	  static char ID; // Pass identification, replacement for typeid

	  GlobalOptLegacyPass() : ModulePass(ID) {
	    initializeGlobalOptLegacyPassPass(*PassRegistry::getPassRegistry());
	  }

	  /// Declare the wrapper passes whose results runOnModule() queries.
	  void getAnalysisUsage(AnalysisUsage &AU) const override {
	    AU.addRequired<TargetLibraryInfoWrapperPass>();
	    AU.addRequired<TargetTransformInfoWrapperPass>();
	    AU.addRequired<DominatorTreeWrapperPass>();
	    AU.addRequired<BlockFrequencyInfoWrapperPass>();
	  }

	  /// Run the optimization on \p M unless skipModule() asks to skip it.
	  /// \returns true if the module was changed.
	  bool runOnModule(Module &M) override {
	    if (skipModule(M))
	      return false;

	    // Accessors that fetch each analysis from the corresponding wrapper pass.
	    auto TLIFor = [this](Function &F) -> TargetLibraryInfo & {
	      return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
	    };
	    auto TTIFor = [this](Function &F) -> TargetTransformInfo & {
	      return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
	    };
	    auto BFIFor = [this](Function &F) -> BlockFrequencyInfo & {
	      return this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
	    };
	    auto DomTreeFor = [this](Function &F) -> DominatorTree & {
	      return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
	    };

	    return optimizeGlobalsInModule(M, M.getDataLayout(), TLIFor, TTIFor,
	                                   BFIFor, DomTreeFor);
	  }
	};

	} // end anonymous namespace

	// Definition of the static pass identifier declared in GlobalOptLegacyPass.
	char GlobalOptLegacyPass::ID = 0;

	// Registration of the legacy pass under the argument string "globalopt" with
	// the description "Global Variable Optimizer". The dependencies listed here
	// are the same four wrapper passes that getAnalysisUsage() marks as required.
	// NOTE(review): the meaning of the two trailing `false` arguments is defined
	// by the INITIALIZE_PASS_* macros, which are not visible here -- confirm there.
	INITIALIZE_PASS_BEGIN(GlobalOptLegacyPass, "globalopt",
	"Global Variable Optimizer", false, false)
	INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
	INITIALIZE_PASS_END(GlobalOptLegacyPass, "globalopt",
	"Global Variable Optimizer", false, false)

	/// Factory for the legacy GlobalOpt pass: returns a freshly allocated
	/// GlobalOptLegacyPass as a ModulePass pointer.
	ModulePass *llvm::createGlobalOptimizerPass() {
	  ModulePass *NewPass = new GlobalOptLegacyPass();
	  return NewPass;
	}
	diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
	index c6233a68847d..2f1325e80d2f 100644
	--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
	+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
	@@ -1,1539 +1,1543 @@
	//===- InstCombineMulDivRem.cpp -------------------------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the visit functions for mul, fmul, sdiv, udiv, fdiv,
	// srem, urem, frem.
	//
	//===----------------------------------------------------------------------===//

	#include "InstCombineInternal.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/Analysis/InstructionSimplify.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/Operator.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Value.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
	#include "llvm/Transforms/Utils/BuildLibCalls.h"
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <utility>

	using namespace llvm;
	using namespace PatternMatch;

	#define DEBUG_TYPE "instcombine"

	/// The specific integer value is used in a context where it is known to be
	/// non-zero.  If this allows us to simplify the computation, do so and return
	/// the new operand, otherwise return null.
	///
	/// \param V    the value known to be non-zero at \p CxtI.
	/// \param IC   combiner providing the IR builder, operand replacement and the
	///             power-of-two query.
	/// \param CxtI context instruction passed to the power-of-two query.
	/// \returns a newly built value, \p V itself when it (or its operand chain)
	///          was updated in place, or nullptr when nothing changed.
	static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,
	                                        Instruction &CxtI) {
	  // If V has multiple uses, then we would have to do more analysis to determine
	  // if this is safe.  For example, the use could be in dynamically unreached
	  // code.
	  if (!V->hasOneUse()) return nullptr;

	  bool MadeChange = false;

	  // ((1 << A) >>u B) --> (1 << (A-B))
	  // Because V cannot be zero, we know that B is less than A.
	  Value *A = nullptr, *B = nullptr, *One = nullptr;
	  if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(One), m_Value(A))), m_Value(B))) &&
	      match(One, m_One())) {
	    A = IC.Builder.CreateSub(A, B);
	    return IC.Builder.CreateShl(One, A);
	  }

	  // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
	  // inexact.  Similarly for <<.
	  BinaryOperator *I = dyn_cast<BinaryOperator>(V);
	  if (I && I->isLogicalShift() &&
	      IC.isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0, &CxtI)) {
	    // We know that this is an exact/nuw shift and that the input is a
	    // non-zero context as well.
	    if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) {
	      IC.replaceOperand(*I, 0, V2);
	      MadeChange = true;
	    }

	    if (I->getOpcode() == Instruction::LShr && !I->isExact()) {
	      I->setIsExact();
	      MadeChange = true;
	    }

	    if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) {
	      I->setHasNoUnsignedWrap();
	      MadeChange = true;
	    }
	  }

	  // TODO: Lots more we could do here:
	  //    If V is a phi node, we can call this on each of its operands.
	  //    "select cond, X, 0" can simplify to "X".

	  return MadeChange ? V : nullptr;
	}

	/// A helper routine of InstCombiner::visitMul().
	///
	/// If C is a scalar/fixed width vector of known powers of 2, then this
	/// function returns a new scalar/fixed width vector obtained from logBase2
	/// of C.
	/// Return a null pointer otherwise.
	///
	/// \param Ty type of the constant to build; must be a FixedVectorType for the
	///           element-wise path to be taken.
	/// \param C  the constant to take the logarithm of. Undef vector elements are
	///           carried over as undef of the scalar type.
	static Constant *getLogBase2(Type *Ty, Constant *C) {
	  const APInt *IVal;
	  if (match(C, m_APInt(IVal)) && IVal->isPowerOf2())
	    return ConstantInt::get(Ty, IVal->logBase2());

	  // FIXME: We can extract pow of 2 of splat constant for scalable vectors.
	  if (!isa<FixedVectorType>(Ty))
	    return nullptr;

	  SmallVector<Constant *, 4> Elts;
	  for (unsigned I = 0, E = cast<FixedVectorType>(Ty)->getNumElements(); I != E;
	       ++I) {
	    Constant *Elt = C->getAggregateElement(I);
	    if (!Elt)
	      return nullptr;
	    if (isa<UndefValue>(Elt)) {
	      Elts.push_back(UndefValue::get(Ty->getScalarType()));
	      continue;
	    }
	    // Any element that is not a power of two rejects the whole vector.
	    if (!match(Elt, m_APInt(IVal)) || !IVal->isPowerOf2())
	      return nullptr;
	    Elts.push_back(ConstantInt::get(Ty->getScalarType(), IVal->logBase2()));
	  }

	  return ConstantVector::get(Elts);
	}

	/// Fold a multiply (integer or floating point) by a select of +1/-1 into a
	/// select between the other operand and its negation.
	///
	/// \returns the new select value, or nullptr if \p I matches none of the four
	///          patterns below.
	// TODO: This is a specific form of a much more general pattern.
	//       We could detect a select with any binop identity constant, or we
	//       could use SimplifyBinOp to see if either arm of the select reduces.
	//       But that needs to be done carefully and/or while removing potential
	//       reverse canonicalizations as in InstCombiner::foldSelectIntoOp().
	static Value *foldMulSelectToNegate(BinaryOperator &I,
	                                    InstCombiner::BuilderTy &Builder) {
	  Value *Cond, *OtherOp;

	  // mul (select Cond, 1, -1), OtherOp --> select Cond, OtherOp, -OtherOp
	  // mul OtherOp, (select Cond, 1, -1) --> select Cond, OtherOp, -OtherOp
	  if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_One(), m_AllOnes())),
	                        m_Value(OtherOp))))
	    return Builder.CreateSelect(Cond, OtherOp, Builder.CreateNeg(OtherOp));

	  // mul (select Cond, -1, 1), OtherOp --> select Cond, -OtherOp, OtherOp
	  // mul OtherOp, (select Cond, -1, 1) --> select Cond, -OtherOp, OtherOp
	  if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_AllOnes(), m_One())),
	                        m_Value(OtherOp))))
	    return Builder.CreateSelect(Cond, Builder.CreateNeg(OtherOp), OtherOp);

	  // fmul (select Cond, 1.0, -1.0), OtherOp --> select Cond, OtherOp, -OtherOp
	  // fmul OtherOp, (select Cond, 1.0, -1.0) --> select Cond, OtherOp, -OtherOp
	  if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(1.0),
	                                           m_SpecificFP(-1.0))),
	                         m_Value(OtherOp)))) {
	    // The guard restores the builder's fast-math flags on scope exit; while
	    // it is live the new instructions are built with I's flags.
	    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
	    Builder.setFastMathFlags(I.getFastMathFlags());
	    return Builder.CreateSelect(Cond, OtherOp, Builder.CreateFNeg(OtherOp));
	  }

	  // fmul (select Cond, -1.0, 1.0), OtherOp --> select Cond, -OtherOp, OtherOp
	  // fmul OtherOp, (select Cond, -1.0, 1.0) --> select Cond, -OtherOp, OtherOp
	  if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(-1.0),
	                                           m_SpecificFP(1.0))),
	                         m_Value(OtherOp)))) {
	    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
	    Builder.setFastMathFlags(I.getFastMathFlags());
	    return Builder.CreateSelect(Cond, Builder.CreateFNeg(OtherOp), OtherOp);
	  }

	  return nullptr;
	}

	/// Combine an integer 'mul' instruction.
	///
	/// Tries a sequence of simplifications and canonicalizations in order and
	/// returns on the first that applies.
	///
	/// \returns a new instruction to replace \p I, the result of
	///          replaceInstUsesWith() when I was replaced by an existing value,
	///          &I when I was modified in place, or nullptr if nothing changed.
	Instruction *InstCombiner::visitMul(BinaryOperator &I) {
	  if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1),
	                                 SQ.getWithInstruction(&I)))
	    return replaceInstUsesWith(I, V);

	  if (SimplifyAssociativeOrCommutative(I))
	    return &I;

	  if (Instruction *X = foldVectorBinop(I))
	    return X;

	  if (Value *V = SimplifyUsingDistributiveLaws(I))
	    return replaceInstUsesWith(I, V);

	  // X * -1 == 0 - X
	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
	  if (match(Op1, m_AllOnes())) {
	    BinaryOperator *BO = BinaryOperator::CreateNeg(Op0, I.getName());
	    if (I.hasNoSignedWrap())
	      BO->setHasNoSignedWrap();
	    return BO;
	  }

	  // Also allow combining multiply instructions on vectors.
	  {
	    Value *NewOp;
	    Constant *C1, *C2;
	    const APInt *IVal;
	    if (match(&I, m_Mul(m_Shl(m_Value(NewOp), m_Constant(C2)),
	                        m_Constant(C1))) &&
	        match(C1, m_APInt(IVal))) {
	      // ((X << C2)*C1) == (X * (C1 << C2))
	      Constant *Shl = ConstantExpr::getShl(C1, C2);
	      BinaryOperator *Mul = cast<BinaryOperator>(I.getOperand(0));
	      BinaryOperator *BO = BinaryOperator::CreateMul(NewOp, Shl);
	      if (I.hasNoUnsignedWrap() && Mul->hasNoUnsignedWrap())
	        BO->setHasNoUnsignedWrap();
	      if (I.hasNoSignedWrap() && Mul->hasNoSignedWrap() &&
	          Shl->isNotMinSignedValue())
	        BO->setHasNoSignedWrap();
	      return BO;
	    }

	    if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) {
	      // Replace X*(2^C) with X << C, where C is either a scalar or a vector.
	-     if (Constant *NewCst = getLogBase2(NewOp->getType(), C1)) {
	+     // Note that we need to sanitize undef multipliers to 1,
	+     // to avoid introducing poison.
	+     Constant *SafeC1 = Constant::replaceUndefsWith(
	+         C1, ConstantInt::get(C1->getType()->getScalarType(), 1));
	+     if (Constant *NewCst = getLogBase2(NewOp->getType(), SafeC1)) {
	        BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst);

	        if (I.hasNoUnsignedWrap())
	          Shl->setHasNoUnsignedWrap();
	        if (I.hasNoSignedWrap()) {
	          // nsw is only carried over when the shift amount is a (splat)
	          // constant different from bitwidth - 1.
	          const APInt *V;
	          if (match(NewCst, m_APInt(V)) && *V != V->getBitWidth() - 1)
	            Shl->setHasNoSignedWrap();
	        }

	        return Shl;
	      }
	    }
	  }

	  if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
	    // (Y - X) * (-(2**n)) -> (X - Y) * (2**n), for positive nonzero n
	    // (Y + const) * (-(2**n)) -> (-constY) * (2**n), for positive nonzero n
	    // The "* (2**n)" thus becomes a potential shifting opportunity.
	    {
	      const APInt &Val = CI->getValue();
	      const APInt &PosVal = Val.abs();
	      if (Val.isNegative() && PosVal.isPowerOf2()) {
	        Value *X = nullptr, *Y = nullptr;
	        if (Op0->hasOneUse()) {
	          ConstantInt *C1;
	          Value *Sub = nullptr;
	          if (match(Op0, m_Sub(m_Value(Y), m_Value(X))))
	            Sub = Builder.CreateSub(X, Y, "suba");
	          else if (match(Op0, m_Add(m_Value(Y), m_ConstantInt(C1))))
	            Sub = Builder.CreateSub(Builder.CreateNeg(C1), Y, "subc");
	          if (Sub)
	            return BinaryOperator::CreateMul(
	                Sub, ConstantInt::get(Y->getType(), PosVal));
	        }
	      }
	    }
	  }

	  if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
	    return FoldedMul;

	  if (Value *FoldedMul = foldMulSelectToNegate(I, Builder))
	    return replaceInstUsesWith(I, FoldedMul);

	  // Simplify mul instructions with a constant RHS.
	  if (isa<Constant>(Op1)) {
	    // Canonicalize (X+C1)*CI -> X*CI+C1*CI.
	    Value *X;
	    Constant *C1;
	    if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) {
	      Value *Mul = Builder.CreateMul(C1, Op1);
	      // Only go forward with the transform if C1*CI simplifies to a tidier
	      // constant.
	      if (!match(Mul, m_Mul(m_Value(), m_Value())))
	        return BinaryOperator::CreateAdd(Builder.CreateMul(X, Op1), Mul);
	    }
	  }

	  // abs(X) * abs(X) -> X * X
	  // nabs(X) * nabs(X) -> X * X
	  if (Op0 == Op1) {
	    Value *X, *Y;
	    SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
	    if (SPF == SPF_ABS || SPF == SPF_NABS)
	      return BinaryOperator::CreateMul(X, X);
	  }

	  // -X * C --> X * -C
	  Value *X, *Y;
	  Constant *Op1C;
	  if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Constant(Op1C)))
	    return BinaryOperator::CreateMul(X, ConstantExpr::getNeg(Op1C));

	  // -X * -Y --> X * Y
	  if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Neg(m_Value(Y)))) {
	    auto *NewMul = BinaryOperator::CreateMul(X, Y);
	    if (I.hasNoSignedWrap() &&
	        cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap() &&
	        cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap())
	      NewMul->setHasNoSignedWrap();
	    return NewMul;
	  }

	  // -X * Y --> -(X * Y)
	  // X * -Y --> -(X * Y)
	  if (match(&I, m_c_Mul(m_OneUse(m_Neg(m_Value(X))), m_Value(Y))))
	    return BinaryOperator::CreateNeg(Builder.CreateMul(X, Y));

	  // (X / Y) *  Y = X - (X % Y)
	  // (X / Y) * -Y = (X % Y) - X
	  {
	    Value *Y = Op1;
	    BinaryOperator *Div = dyn_cast<BinaryOperator>(Op0);
	    // If Op0 is not a udiv/sdiv, try the operands the other way round.
	    if (!Div || (Div->getOpcode() != Instruction::UDiv &&
	                 Div->getOpcode() != Instruction::SDiv)) {
	      Y = Op0;
	      Div = dyn_cast<BinaryOperator>(Op1);
	    }
	    Value *Neg = dyn_castNegVal(Y);
	    if (Div && Div->hasOneUse() &&
	        (Div->getOperand(1) == Y || Div->getOperand(1) == Neg) &&
	        (Div->getOpcode() == Instruction::UDiv ||
	         Div->getOpcode() == Instruction::SDiv)) {
	      Value *X = Div->getOperand(0), *DivOp1 = Div->getOperand(1);

	      // If the division is exact, X % Y is zero, so we end up with X or -X.
	      if (Div->isExact()) {
	        if (DivOp1 == Y)
	          return replaceInstUsesWith(I, X);
	        return BinaryOperator::CreateNeg(X);
	      }

	      auto RemOpc = Div->getOpcode() == Instruction::UDiv ? Instruction::URem
	                                                          : Instruction::SRem;
	      Value *Rem = Builder.CreateBinOp(RemOpc, X, DivOp1);
	      if (DivOp1 == Y)
	        return BinaryOperator::CreateSub(X, Rem);
	      return BinaryOperator::CreateSub(Rem, X);
	    }
	  }

	  /// i1 mul -> i1 and.
	  if (I.getType()->isIntOrIntVectorTy(1))
	    return BinaryOperator::CreateAnd(Op0, Op1);

	  // X*(1 << Y) --> X << Y
	  // (1 << Y)*X --> X << Y
	  {
	    Value *Y;
	    BinaryOperator *BO = nullptr;
	    bool ShlNSW = false;
	    if (match(Op0, m_Shl(m_One(), m_Value(Y)))) {
	      BO = BinaryOperator::CreateShl(Op1, Y);
	      ShlNSW = cast<ShlOperator>(Op0)->hasNoSignedWrap();
	    } else if (match(Op1, m_Shl(m_One(), m_Value(Y)))) {
	      BO = BinaryOperator::CreateShl(Op0, Y);
	      ShlNSW = cast<ShlOperator>(Op1)->hasNoSignedWrap();
	    }
	    if (BO) {
	      if (I.hasNoUnsignedWrap())
	        BO->setHasNoUnsignedWrap();
	      if (I.hasNoSignedWrap() && ShlNSW)
	        BO->setHasNoSignedWrap();
	      return BO;
	    }
	  }

	  // (zext bool X) * (zext bool Y) --> zext (and X, Y)
	  // (sext bool X) * (sext bool Y) --> zext (and X, Y)
	  // Note: -1 * -1 == 1 * 1 == 1 (if the extends match, the result is the same)
	  if (((match(Op0, m_ZExt(m_Value(X))) && match(Op1, m_ZExt(m_Value(Y)))) ||
	       (match(Op0, m_SExt(m_Value(X))) && match(Op1, m_SExt(m_Value(Y))))) &&
	      X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType() &&
	      (Op0->hasOneUse() || Op1->hasOneUse())) {
	    Value *And = Builder.CreateAnd(X, Y, "mulbool");
	    return CastInst::Create(Instruction::ZExt, And, I.getType());
	  }
	  // (sext bool X) * (zext bool Y) --> sext (and X, Y)
	  // (zext bool X) * (sext bool Y) --> sext (and X, Y)
	  // Note: -1 * 1 == 1 * -1  == -1
	  if (((match(Op0, m_SExt(m_Value(X))) && match(Op1, m_ZExt(m_Value(Y)))) ||
	       (match(Op0, m_ZExt(m_Value(X))) && match(Op1, m_SExt(m_Value(Y))))) &&
	      X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType() &&
	      (Op0->hasOneUse() || Op1->hasOneUse())) {
	    Value *And = Builder.CreateAnd(X, Y, "mulbool");
	    return CastInst::Create(Instruction::SExt, And, I.getType());
	  }

	  // (bool X) * Y --> X ? Y : 0
	  // Y * (bool X) --> X ? Y : 0
	  if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
	    return SelectInst::Create(X, Op1, ConstantInt::get(I.getType(), 0));
	  if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
	    return SelectInst::Create(X, Op0, ConstantInt::get(I.getType(), 0));

	  // (lshr X, 31) * Y --> (ashr X, 31) & Y
	  // Y * (lshr X, 31) --> (ashr X, 31) & Y
	  // TODO: We are not checking one-use because the elimination of the multiply
	  //       is better for analysis?
	  // TODO: Should we canonicalize to '(X < 0) ? Y : 0' instead? That would be
	  //       more similar to what we're doing above.
	  const APInt *C;
	  if (match(Op0, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1)
	    return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op1);
	  if (match(Op1, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1)
	    return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op0);

	  if (Instruction *Ext = narrowMathIfNoOverflow(I))
	    return Ext;

	  // Nothing folded: at least try to prove the no-wrap flags.
	  bool Changed = false;
	  if (!I.hasNoSignedWrap() && willNotOverflowSignedMul(Op0, Op1, I)) {
	    Changed = true;
	    I.setHasNoSignedWrap(true);
	  }

	  if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) {
	    Changed = true;
	    I.setHasNoUnsignedWrap(true);
	  }

	  return Changed ? &I : nullptr;
	}

	/// Fold sign-bit operations (fneg / fabs) on both operands of an fmul or fdiv.
	///
	/// \param I an fmul or fdiv instruction (asserted).
	/// \returns the replacement instruction, or nullptr if no pattern matched.
	Instruction *InstCombiner::foldFPSignBitOps(BinaryOperator &I) {
	  BinaryOperator::BinaryOps Opcode = I.getOpcode();
	  assert((Opcode == Instruction::FMul || Opcode == Instruction::FDiv) &&
	         "Expected fmul or fdiv");

	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
	  Value *X, *Y;

	  // -X * -Y --> X * Y
	  // -X / -Y --> X / Y
	  if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
	    return BinaryOperator::CreateWithCopiedFlags(Opcode, X, Y, &I);

	  // fabs(X) * fabs(X) -> X * X
	  // fabs(X) / fabs(X) -> X / X
	  if (Op0 == Op1 && match(Op0, m_Intrinsic<Intrinsic::fabs>(m_Value(X))))
	    return BinaryOperator::CreateWithCopiedFlags(Opcode, X, X, &I);

	  // fabs(X) * fabs(Y) --> fabs(X * Y)
	  // fabs(X) / fabs(Y) --> fabs(X / Y)
	  if (match(Op0, m_Intrinsic<Intrinsic::fabs>(m_Value(X))) &&
	      match(Op1, m_Intrinsic<Intrinsic::fabs>(m_Value(Y))) &&
	      (Op0->hasOneUse() || Op1->hasOneUse())) {
	    // Build the new instructions with I's fast-math flags; the guard puts
	    // the builder's previous flags back on scope exit.
	    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
	    Builder.setFastMathFlags(I.getFastMathFlags());
	    Value *XY = Builder.CreateBinOp(Opcode, X, Y);
	    Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, XY);
	    Fabs->takeName(&I);
	    return replaceInstUsesWith(I, Fabs);
	  }

	  return nullptr;
	}

	/// Combine an 'fmul' instruction.
	///
	/// Tries generic simplification first, then sign-bit folds, constant
	/// reassociation (only with the reassoc flag), and finally a log2 rewrite
	/// (only when the instruction is fully fast).
	///
	/// \returns a replacement instruction, the result of replaceInstUsesWith(),
	///          &I when I was modified in place, or nullptr if nothing changed.
	Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
	  if (Value *V = SimplifyFMulInst(I.getOperand(0), I.getOperand(1),
	                                  I.getFastMathFlags(),
	                                  SQ.getWithInstruction(&I)))
	    return replaceInstUsesWith(I, V);

	  if (SimplifyAssociativeOrCommutative(I))
	    return &I;

	  if (Instruction *X = foldVectorBinop(I))
	    return X;

	  if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
	    return FoldedMul;

	  if (Value *FoldedMul = foldMulSelectToNegate(I, Builder))
	    return replaceInstUsesWith(I, FoldedMul);

	  if (Instruction *R = foldFPSignBitOps(I))
	    return R;

	  // X * -1.0 --> -X
	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
	  if (match(Op1, m_SpecificFP(-1.0)))
	    return UnaryOperator::CreateFNegFMF(Op0, &I);

	  // -X * C --> X * -C
	  Value *X, *Y;
	  Constant *C;
	  if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_Constant(C)))
	    return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);

	  // (select A, B, C) * (select A, D, E) --> select A, (B*D), (C*E)
	  if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
	    return replaceInstUsesWith(I, V);

	  if (I.hasAllowReassoc()) {
	    // Reassociate constant RHS with another constant to form constant
	    // expression.
	    if (match(Op1, m_Constant(C)) && C->isFiniteNonZeroFP()) {
	      Constant *C1;
	      if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) {
	        // (C1 / X) * C --> (C * C1) / X
	        Constant *CC1 = ConstantExpr::getFMul(C, C1);
	        if (CC1->isNormalFP())
	          return BinaryOperator::CreateFDivFMF(CC1, X, &I);
	      }
	      if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) {
	        // (X / C1) * C --> X * (C / C1)
	        Constant *CDivC1 = ConstantExpr::getFDiv(C, C1);
	        if (CDivC1->isNormalFP())
	          return BinaryOperator::CreateFMulFMF(X, CDivC1, &I);

	        // If the constant was a denormal, try reassociating differently.
	        // (X / C1) * C --> X / (C1 / C)
	        Constant *C1DivC = ConstantExpr::getFDiv(C1, C);
	        if (Op0->hasOneUse() && C1DivC->isNormalFP())
	          return BinaryOperator::CreateFDivFMF(X, C1DivC, &I);
	      }

	      // We do not need to match 'fadd C, X' and 'fsub X, C' because they are
	      // canonicalized to 'fadd X, C'. Distributing the multiply may allow
	      // further folds and (X * C) + C2 is 'fma'.
	      if (match(Op0, m_OneUse(m_FAdd(m_Value(X), m_Constant(C1))))) {
	        // (X + C1) * C --> (X * C) + (C * C1)
	        Constant *CC1 = ConstantExpr::getFMul(C, C1);
	        Value *XC = Builder.CreateFMulFMF(X, C, &I);
	        return BinaryOperator::CreateFAddFMF(XC, CC1, &I);
	      }
	      if (match(Op0, m_OneUse(m_FSub(m_Constant(C1), m_Value(X))))) {
	        // (C1 - X) * C --> (C * C1) - (X * C)
	        Constant *CC1 = ConstantExpr::getFMul(C, C1);
	        Value *XC = Builder.CreateFMulFMF(X, C, &I);
	        return BinaryOperator::CreateFSubFMF(CC1, XC, &I);
	      }
	    }

	    Value *Z;
	    if (match(&I, m_c_FMul(m_OneUse(m_FDiv(m_Value(X), m_Value(Y))),
	                           m_Value(Z)))) {
	      // Sink division: (X / Y) * Z --> (X * Z) / Y
	      Value *NewFMul = Builder.CreateFMulFMF(X, Z, &I);
	      return BinaryOperator::CreateFDivFMF(NewFMul, Y, &I);
	    }

	    // sqrt(X) * sqrt(Y) -> sqrt(X * Y)
	    // nnan disallows the possibility of returning a number if both operands are
	    // negative (in that case, we should return NaN).
	    if (I.hasNoNaNs() &&
	        match(Op0, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(X)))) &&
	        match(Op1, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) {
	      Value *XY = Builder.CreateFMulFMF(X, Y, &I);
	      Value *Sqrt = Builder.CreateUnaryIntrinsic(Intrinsic::sqrt, XY, &I);
	      return replaceInstUsesWith(I, Sqrt);
	    }

	    // Like the similar transform in instsimplify, this requires 'nsz' because
	    // sqrt(-0.0) = -0.0, and -0.0 * -0.0 does not simplify to -0.0.
	    if (I.hasNoNaNs() && I.hasNoSignedZeros() && Op0 == Op1 &&
	        Op0->hasNUses(2)) {
	      // Peek through fdiv to find squaring of square root:
	      // (X / sqrt(Y)) * (X / sqrt(Y)) --> (X * X) / Y
	      if (match(Op0, m_FDiv(m_Value(X),
	                            m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) {
	        Value *XX = Builder.CreateFMulFMF(X, X, &I);
	        return BinaryOperator::CreateFDivFMF(XX, Y, &I);
	      }
	      // (sqrt(Y) / X) * (sqrt(Y) / X) --> Y / (X * X)
	      if (match(Op0, m_FDiv(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y)),
	                            m_Value(X)))) {
	        Value *XX = Builder.CreateFMulFMF(X, X, &I);
	        return BinaryOperator::CreateFDivFMF(Y, XX, &I);
	      }
	    }

	    // exp(X) * exp(Y) -> exp(X + Y)
	    // Match as long as at least one of exp has only one use.
	    if (match(Op0, m_Intrinsic<Intrinsic::exp>(m_Value(X))) &&
	        match(Op1, m_Intrinsic<Intrinsic::exp>(m_Value(Y))) &&
	        (Op0->hasOneUse() || Op1->hasOneUse())) {
	      Value *XY = Builder.CreateFAddFMF(X, Y, &I);
	      Value *Exp = Builder.CreateUnaryIntrinsic(Intrinsic::exp, XY, &I);
	      return replaceInstUsesWith(I, Exp);
	    }

	    // exp2(X) * exp2(Y) -> exp2(X + Y)
	    // Match as long as at least one of exp2 has only one use.
	    if (match(Op0, m_Intrinsic<Intrinsic::exp2>(m_Value(X))) &&
	        match(Op1, m_Intrinsic<Intrinsic::exp2>(m_Value(Y))) &&
	        (Op0->hasOneUse() || Op1->hasOneUse())) {
	      Value *XY = Builder.CreateFAddFMF(X, Y, &I);
	      Value *Exp2 = Builder.CreateUnaryIntrinsic(Intrinsic::exp2, XY, &I);
	      return replaceInstUsesWith(I, Exp2);
	    }

	    // (X*Y) * X => (X*X) * Y where Y != X
	    //  The purpose is two-fold:
	    //   1) to form a power expression (of X).
	    //   2) potentially shorten the critical path: After transformation, the
	    //  latency of the instruction Y is amortized by the expression of X*X,
	    //  and therefore Y is in a "less critical" position compared to what it
	    //  was before the transformation.
	    if (match(Op0, m_OneUse(m_c_FMul(m_Specific(Op1), m_Value(Y)))) &&
	        Op1 != Y) {
	      Value *XX = Builder.CreateFMulFMF(Op1, Op1, &I);
	      return BinaryOperator::CreateFMulFMF(XX, Y, &I);
	    }
	    if (match(Op1, m_OneUse(m_c_FMul(m_Specific(Op0), m_Value(Y)))) &&
	        Op0 != Y) {
	      Value *XX = Builder.CreateFMulFMF(Op0, Op0, &I);
	      return BinaryOperator::CreateFMulFMF(XX, Y, &I);
	    }
	  }

	  // log2(X * 0.5) * Y = log2(X) * Y - Y
	  if (I.isFast()) {
	    // Set when either operand is a one-use log2 of a one-use (X * 0.5);
	    // Y is then the other operand of the multiply.
	    IntrinsicInst *Log2 = nullptr;
	    if (match(Op0, m_OneUse(m_Intrinsic<Intrinsic::log2>(
	            m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) {
	      Log2 = cast<IntrinsicInst>(Op0);
	      Y = Op1;
	    }
	    if (match(Op1, m_OneUse(m_Intrinsic<Intrinsic::log2>(
	            m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) {
	      Log2 = cast<IntrinsicInst>(Op1);
	      Y = Op0;
	    }
	    if (Log2) {
	      Value *NewLog2 = Builder.CreateUnaryIntrinsic(Intrinsic::log2, X, &I);
	      Value *LogXTimesY = Builder.CreateFMulFMF(NewLog2, Y, &I);
	      return BinaryOperator::CreateFSubFMF(LogXTimesY, Y, &I);
	    }
	  }

	  return nullptr;
	}

	/// Simplify a div/rem whose divisor is a select with a zero arm.
	///
	/// Because div/rem by zero is undefined, the divisor must be the non-zero arm
	/// of the select, so that arm is used directly. The same knowledge is then
	/// pushed into earlier instructions of the block that use the select or its
	/// condition.
	///
	/// \returns true if the divisor of \p I was replaced, false if operand 1 is
	///          not a select or neither select arm is zero.
	bool InstCombiner::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) {
	  auto *SI = dyn_cast<SelectInst>(I.getOperand(1));
	  if (!SI)
	    return false;

	  // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y   (non-zero arm is operand 2)
	  // div/rem X, (Cond ? Y : 0) -> div/rem X, Y   (non-zero arm is operand 1)
	  const bool TrueArmIsZero = match(SI->getTrueValue(), m_Zero());
	  if (!TrueArmIsZero && !match(SI->getFalseValue(), m_Zero()))
	    return false;
	  const int NonNullOperand = TrueArmIsZero ? 2 : 1;
	  Value *NonZeroArm = SI->getOperand(NonNullOperand);

	  // Change the div/rem to use 'Y' instead of the select.
	  replaceOperand(I, 1, NonZeroArm);

	  // The divisor has been swapped without trouble, but the select and its
	  // condition may still have other users. Knowing the divisor is non-zero
	  // fixes both the value of the select and the value of the condition, so
	  // propagate those into the other uses found above I in this block.

	  // Nothing further to do when neither has remaining users to update.
	  Value *SelectCond = SI->getCondition();
	  if (SI->use_empty() && SelectCond->hasOneUse())
	    return true;

	  // The condition value implied by the non-zero arm having been selected.
	  Type *CondTy = SelectCond->getType();
	  Value *KnownCond = NonNullOperand == 1 ? ConstantInt::getTrue(CondTy)
	                                         : ConstantInt::getFalse(CondTy);

	  // Walk backwards from I to the start of its block.
	  BasicBlock::iterator BBFront = I.getParent()->begin();
	  for (BasicBlock::iterator BBI = I.getIterator(); BBI != BBFront;) {
	    --BBI;
	    // Facts established at I cannot be moved above an instruction that is
	    // not guaranteed to transfer execution to its successor.
	    if (!isGuaranteedToTransferExecutionToSuccessor(&*BBI))
	      break;

	    // Replace uses of the select or its condition with the known values.
	    for (Use &Operand : BBI->operands()) {
	      if (Operand == SI) {
	        replaceUse(Operand, NonZeroArm);
	        Worklist.push(&*BBI);
	      } else if (Operand == SelectCond) {
	        replaceUse(Operand, KnownCond);
	        Worklist.push(&*BBI);
	      }
	    }

	    // Once the definition itself has been passed, stop looking for it.
	    if (&*BBI == SI)
	      SI = nullptr;
	    if (&*BBI == SelectCond)
	      SelectCond = nullptr;

	    // Both definitions passed: nothing above here can use them.
	    if (!SelectCond && !SI)
	      break;
	  }
	  return true;
	}

	/// Multiply \p C1 by \p C2, signed or unsigned according to \p IsSigned,
	/// storing the result in \p Product.
	///
	/// \returns true if the multiplication overflowed, i.e. the product can not
	///          be expressed in an int of this size.
	static bool multiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product,
	                              bool IsSigned) {
	  bool Overflowed = false;
	  if (IsSigned)
	    Product = C1.smul_ov(C2, Overflowed);
	  else
	    Product = C1.umul_ov(C2, Overflowed);
	  return Overflowed;
	}

	/// True if C1 is a multiple of C2. Quotient contains C1/C2.
	///
	/// \param C1       dividend; must have the same bit width as \p C2 (asserted).
	/// \param C2       divisor.
	/// \param Quotient receives C1/C2 when a division is actually performed; it is
	///                 left untouched on the two early "bail" paths.
	/// \param IsSigned selects signed vs. unsigned division.
	/// \returns true only if the division was performed and its remainder is zero.
	static bool isMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
	                       bool IsSigned) {
	  assert(C1.getBitWidth() == C2.getBitWidth() && "Constant widths not equal");

	  // Bail if we will divide by zero.
	  if (C2.isNullValue())
	    return false;

	  // Bail if we would divide INT_MIN by -1.
	  if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue())
	    return false;

	  APInt Remainder(C1.getBitWidth(), /*val=*/0ULL, IsSigned);
	  if (IsSigned)
	    APInt::sdivrem(C1, C2, Quotient, Remainder);
	  else
	    APInt::udivrem(C1, C2, Quotient, Remainder);

	  return Remainder.isMinValue();
	}

	/// Common integer divide transforms.
	///
	/// This function implements the transforms common to both integer division
	/// instructions (udiv and sdiv). It is called by the visitors to those integer
	/// division instructions.
	///
	/// \param I the division being visited; it is treated as signed only when its
	///          opcode is Instruction::SDiv.
	/// \returns a non-null instruction when a transform fired (either a newly
	///          created instruction or \p I itself after it was modified in
	///          place), or nullptr when nothing applied.
	Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
	  bool IsSigned = I.getOpcode() == Instruction::SDiv;
	  Type *Ty = I.getType();

	  // The RHS is known non-zero.
	  if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I))
	    return replaceOperand(I, 1, V);

	  // Handle cases involving: [su]div X, (select Cond, Y, Z)
	  // This does not apply for fdiv.
	  if (simplifyDivRemOfSelectWithZeroOp(I))
	    return &I;

	  const APInt *C2;
	  if (match(Op1, m_APInt(C2))) {
	    Value *X;
	    const APInt *C1;

	    // (X / C1) / C2 -> X / (C1*C2), provided C1*C2 does not overflow.
	    if ((IsSigned && match(Op0, m_SDiv(m_Value(X), m_APInt(C1)))) ||
	        (!IsSigned && match(Op0, m_UDiv(m_Value(X), m_APInt(C1))))) {
	      APInt Product(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
	      if (!multiplyOverflows(*C1, *C2, Product, IsSigned))
	        return BinaryOperator::Create(I.getOpcode(), X,
	                                      ConstantInt::get(Ty, Product));
	    }

	    // The multiply must carry the no-wrap flag matching the signedness of the
	    // division (nsw for sdiv, nuw for udiv).
	    if ((IsSigned && match(Op0, m_NSWMul(m_Value(X), m_APInt(C1)))) ||
	        (!IsSigned && match(Op0, m_NUWMul(m_Value(X), m_APInt(C1))))) {
	      APInt Quotient(C1->getBitWidth(), /*val=*/0ULL, IsSigned);

	      // (X * C1) / C2 -> X / (C2 / C1) if C2 is a multiple of C1.
	      if (isMultiple(*C2, *C1, Quotient, IsSigned)) {
	        auto *NewDiv = BinaryOperator::Create(I.getOpcode(), X,
	                                              ConstantInt::get(Ty, Quotient));
	        NewDiv->setIsExact(I.isExact());
	        return NewDiv;
	      }

	      // (X * C1) / C2 -> X * (C1 / C2) if C1 is a multiple of C2.
	      if (isMultiple(*C1, *C2, Quotient, IsSigned)) {
	        auto *Mul = BinaryOperator::Create(Instruction::Mul, X,
	                                           ConstantInt::get(Ty, Quotient));
	        auto *OBO = cast<OverflowingBinaryOperator>(Op0);
	        // nuw is only carried over for the unsigned division; nsw is copied
	        // from the original multiply in both cases.
	        Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap());
	        Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap());
	        return Mul;
	      }
	    }

	    // Same idea for a left shift by a constant; the signed form excludes a
	    // shift amount of BitWidth - 1.
	    if ((IsSigned && match(Op0, m_NSWShl(m_Value(X), m_APInt(C1))) &&
	         *C1 != C1->getBitWidth() - 1) ||
	        (!IsSigned && match(Op0, m_NUWShl(m_Value(X), m_APInt(C1))))) {
	      APInt Quotient(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
	      APInt C1Shifted = APInt::getOneBitSet(
	          C1->getBitWidth(), static_cast<unsigned>(C1->getLimitedValue()));

	      // (X << C1) / C2 -> X / (C2 >> C1) if C2 is a multiple of 1 << C1.
	      if (isMultiple(*C2, C1Shifted, Quotient, IsSigned)) {
	        auto *BO = BinaryOperator::Create(I.getOpcode(), X,
	                                          ConstantInt::get(Ty, Quotient));
	        BO->setIsExact(I.isExact());
	        return BO;
	      }

	      // (X << C1) / C2 -> X * ((1 << C1) / C2) if 1 << C1 is a multiple of C2.
	      if (isMultiple(C1Shifted, *C2, Quotient, IsSigned)) {
	        auto *Mul = BinaryOperator::Create(Instruction::Mul, X,
	                                           ConstantInt::get(Ty, Quotient));
	        auto *OBO = cast<OverflowingBinaryOperator>(Op0);
	        Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap());
	        Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap());
	        return Mul;
	      }
	    }

	    if (!C2->isNullValue()) // avoid X udiv 0
	      if (Instruction *FoldedDiv = foldBinOpIntoSelectOrPhi(I))
	        return FoldedDiv;
	  }

	  if (match(Op0, m_One())) {
	    assert(!Ty->isIntOrIntVectorTy(1) && "i1 divide not removed?");
	    if (IsSigned) {
	      // If Op1 is 0 then it's undefined behaviour, if Op1 is 1 then the
	      // result is one, if Op1 is -1 then the result is minus one, otherwise
	      // it's zero.
	      Value *Inc = Builder.CreateAdd(Op1, Op0);
	      Value *Cmp = Builder.CreateICmpULT(Inc, ConstantInt::get(Ty, 3));
	      return SelectInst::Create(Cmp, Op1, ConstantInt::get(Ty, 0));
	    } else {
	      // If Op1 is 0 then it's undefined behaviour. If Op1 is 1 then the
	      // result is one, otherwise it's zero.
	      return new ZExtInst(Builder.CreateICmpEQ(Op1, Op0), Ty);
	    }
	  }

	  // See if we can fold away this div instruction.
	  if (SimplifyDemandedInstructionBits(I))
	    return &I;

	  // (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y
	  Value *X, *Z;
	  if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) // (X - Z) / Y; Y = Op1
	    if ((IsSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) ||
	        (!IsSigned && match(Z, m_URem(m_Specific(X), m_Specific(Op1)))))
	      return BinaryOperator::Create(I.getOpcode(), X, Op1);

	  // (X << Y) / X -> 1 << Y
	  Value *Y;
	  if (IsSigned && match(Op0, m_NSWShl(m_Specific(Op1), m_Value(Y))))
	    return BinaryOperator::CreateNSWShl(ConstantInt::get(Ty, 1), Y);
	  if (!IsSigned && match(Op0, m_NUWShl(m_Specific(Op1), m_Value(Y))))
	    return BinaryOperator::CreateNUWShl(ConstantInt::get(Ty, 1), Y);

	  // X / (X * Y) -> 1 / Y if the multiplication does not overflow.
	  if (match(Op1, m_c_Mul(m_Specific(Op0), m_Value(Y)))) {
	    bool HasNSW = cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap();
	    bool HasNUW = cast<OverflowingBinaryOperator>(Op1)->hasNoUnsignedWrap();
	    if ((IsSigned && HasNSW) || (!IsSigned && HasNUW)) {
	      // Rewrite the division in place rather than creating a new one.
	      replaceOperand(I, 0, ConstantInt::get(Ty, 1));
	      replaceOperand(I, 1, Y);
	      return &I;
	    }
	  }

	  return nullptr;
	}

	// Recursion limit checked by visitUDivOperand() before it descends into the
	// operands of a select.
	static const unsigned MaxDepth = 6;

	namespace {

	/// Signature of a callback that builds the replacement for a udiv whose
	/// right-hand operand is \p Op1 (see foldUDivPow2Cst() and foldUDivShl()).
	using FoldUDivOperandCb = Instruction *(*)(Value *Op0, Value *Op1,
	                                           const BinaryOperator &I,
	                                           InstCombiner &IC);

	/// Used to maintain state for visitUDivOperand().
	struct UDivFoldAction {
	  /// Informs visitUDiv() how to fold this operand.  This can be zero if this
	  /// action joins two actions together.
	  FoldUDivOperandCb FoldAction;

	  /// Which operand to fold.
	  Value *OperandToFold;

	  // The two members share storage: SelectLHSIdx is read from a joining action
	  // before FoldResult is written to that same action.
	  union {
	    /// The instruction returned when FoldAction is invoked.
	    Instruction *FoldResult;

	    /// Stores the LHS action index if this action joins two actions together.
	    size_t SelectLHSIdx;
	  };

	  /// Creates a leaf action; its result is not known yet.
	  UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand)
	      : FoldAction(FA), OperandToFold(InputOperand), FoldResult(nullptr) {}
	  /// Creates a joining action that remembers the index of its LHS action.
	  UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand, size_t SLHS)
	      : FoldAction(FA), OperandToFold(InputOperand), SelectLHSIdx(SLHS) {}
	};

	} // end anonymous namespace

	// X udiv 2^C -> X >> C
	//
	// Op1 must be a Constant for which getLogBase2() succeeds; anything else is
	// treated as unreachable. The new lshr is returned without being inserted and
	// is marked exact when the udiv was exact.
	static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1,
	                                    const BinaryOperator &I, InstCombiner &IC) {
	  Constant *C1 = getLogBase2(Op0->getType(), cast<Constant>(Op1));
	  if (!C1)
	    llvm_unreachable("Failed to constant fold udiv -> logbase2");
	  BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, C1);
	  if (I.isExact())
	    LShr->setIsExact();
	  return LShr;
	}

	// X udiv (C1 << N), where C1 is "1<<C2"  -->  X >> (N+C2)
	// X udiv (zext (C1 << N)), where C1 is "1<<C2"  -->  X >> (N+C2)
	//
	// Op1 must be a shl of a constant, optionally wrapped in a zext; any other
	// shape is treated as unreachable. The add (and zext, if one was looked
	// through) is emitted through IC.Builder; the final lshr is returned without
	// being inserted and is marked exact when the udiv was exact.
	static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
	                                InstCombiner &IC) {
	  // Look through an optional zext to find the shift itself.
	  Value *ShiftLeft;
	  if (!match(Op1, m_ZExt(m_Value(ShiftLeft))))
	    ShiftLeft = Op1;

	  Constant *CI;
	  Value *N;
	  if (!match(ShiftLeft, m_Shl(m_Constant(CI), m_Value(N))))
	    llvm_unreachable("match should never fail here!");
	  Constant *Log2Base = getLogBase2(N->getType(), CI);
	  if (!Log2Base)
	    llvm_unreachable("getLogBase2 should never fail here!");
	  N = IC.Builder.CreateAdd(N, Log2Base);
	  // If a zext was looked through, widen the shift amount to Op1's type.
	  if (Op1 != ShiftLeft)
	    N = IC.Builder.CreateZExt(N, Op1->getType());
	  BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N);
	  if (I.isExact())
	    LShr->setIsExact();
	  return LShr;
	}

	// Recursively visits the possible right hand operands of a udiv
	// instruction, seeing through select instructions, to determine if we can
	// replace the udiv with something simpler.  If we find that an operand is not
	// able to simplify the udiv, we abort the entire transformation.
	//
	// Every foldable operand appends one UDivFoldAction to Actions. The return
	// value is Actions.size() after the append, i.e. the one-based index of the
	// action just added, or 0 when the operand cannot be folded.
	static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I,
	                               SmallVectorImpl<UDivFoldAction> &Actions,
	                               unsigned Depth = 0) {
	  // Check to see if this is an unsigned division with an exact power of 2,
	  // if so, convert to a right shift.
	  if (match(Op1, m_Power2())) {
	    Actions.push_back(UDivFoldAction(foldUDivPow2Cst, Op1));
	    return Actions.size();
	  }

	  // X udiv (C1 << N), where C1 is "1<<C2"  -->  X >> (N+C2)
	  if (match(Op1, m_Shl(m_Power2(), m_Value())) ||
	      match(Op1, m_ZExt(m_Shl(m_Power2(), m_Value())))) {
	    Actions.push_back(UDivFoldAction(foldUDivShl, Op1));
	    return Actions.size();
	  }

	  // The remaining tests are all recursive, so bail out if we hit the limit.
	  if (Depth++ == MaxDepth)
	    return 0;

	  // A select is foldable only if both of its value operands are. The joining
	  // action stores the zero-based index of the LHS action; the RHS action is
	  // the one pushed immediately before the joining action.
	  if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
	    if (size_t LHSIdx =
	            visitUDivOperand(Op0, SI->getOperand(1), I, Actions, Depth))
	      if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions, Depth)) {
	        Actions.push_back(UDivFoldAction(nullptr, Op1, LHSIdx - 1));
	        return Actions.size();
	      }

	  return 0;
	}

	/// If we have zero-extended operands of an unsigned div or rem, we may be able
	/// to narrow the operation (sink the zext below the math).
	///
	/// \param I       the udiv/urem to narrow; its opcode is reused for the narrow
	///                operation.
	/// \param Builder used to emit the narrow operation.
	/// \returns a new zext of the narrow operation (not inserted), or nullptr when
	///          no narrowing applies.
	static Instruction *narrowUDivURem(BinaryOperator &I,
	                                   InstCombiner::BuilderTy &Builder) {
	  Instruction::BinaryOps Opcode = I.getOpcode();
	  Value *N = I.getOperand(0);
	  Value *D = I.getOperand(1);
	  Type *Ty = I.getType();
	  Value *X, *Y;
	  // Both operands are zexts from the same type and at least one of them has
	  // no other user.
	  if (match(N, m_ZExt(m_Value(X))) && match(D, m_ZExt(m_Value(Y))) &&
	      X->getType() == Y->getType() && (N->hasOneUse() || D->hasOneUse())) {
	    // udiv (zext X), (zext Y) --> zext (udiv X, Y)
	    // urem (zext X), (zext Y) --> zext (urem X, Y)
	    Value *NarrowOp = Builder.CreateBinOp(Opcode, X, Y);
	    return new ZExtInst(NarrowOp, Ty);
	  }

	  Constant *C;
	  if ((match(N, m_OneUse(m_ZExt(m_Value(X)))) && match(D, m_Constant(C))) ||
	      (match(D, m_OneUse(m_ZExt(m_Value(X)))) && match(N, m_Constant(C)))) {
	    // If the constant is the same in the smaller type, use the narrow version.
	    Constant *TruncC = ConstantExpr::getTrunc(C, X->getType());
	    if (ConstantExpr::getZExt(TruncC, Ty) != C)
	      return nullptr;

	    // udiv (zext X), C --> zext (udiv X, C')
	    // urem (zext X), C --> zext (urem X, C')
	    // udiv C, (zext X) --> zext (udiv C', X)
	    // urem C, (zext X) --> zext (urem C', X)
	    Value *NarrowOp = isa<Constant>(D) ? Builder.CreateBinOp(Opcode, X, TruncC)
	                                       : Builder.CreateBinOp(Opcode, TruncC, X);
	    return new ZExtInst(NarrowOp, Ty);
	  }

	  return nullptr;
	}

	/// Visitor for udiv. Tries simplification, the vector binop fold, the common
	/// integer-division transforms, and then the udiv-specific folds below.
	///
	/// \returns a non-null instruction when a transform fired, nullptr otherwise.
	Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
	  if (Value *V = SimplifyUDivInst(I.getOperand(0), I.getOperand(1),
	                                  SQ.getWithInstruction(&I)))
	    return replaceInstUsesWith(I, V);

	  if (Instruction *X = foldVectorBinop(I))
	    return X;

	  // Handle the integer div common cases
	  if (Instruction *Common = commonIDivTransforms(I))
	    return Common;

	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
	  Value *X;
	  const APInt *C1, *C2;
	  if (match(Op0, m_LShr(m_Value(X), m_APInt(C1))) && match(Op1, m_APInt(C2))) {
	    // (X lshr C1) udiv C2 --> X udiv (C2 << C1)
	    bool Overflow;
	    APInt C2ShlC1 = C2->ushl_ov(*C1, Overflow);
	    if (!Overflow) {
	      // The result is exact only if both the udiv and the lshr were exact.
	      bool IsExact = I.isExact() && match(Op0, m_Exact(m_Value()));
	      BinaryOperator *BO = BinaryOperator::CreateUDiv(
	          X, ConstantInt::get(X->getType(), C2ShlC1));
	      if (IsExact)
	        BO->setIsExact();
	      return BO;
	    }
	  }

	  // Op0 / C where C is large (negative) --> zext (Op0 >= C)
	  // TODO: Could use isKnownNegative() to handle non-constant values.
	  Type *Ty = I.getType();
	  if (match(Op1, m_Negative())) {
	    Value *Cmp = Builder.CreateICmpUGE(Op0, Op1);
	    return CastInst::CreateZExtOrBitCast(Cmp, Ty);
	  }
	  // Op0 / (sext i1 X) --> zext (Op0 == -1) (if X is 0, the div is undefined)
	  if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
	    Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty));
	    return CastInst::CreateZExtOrBitCast(Cmp, Ty);
	  }

	  if (Instruction *NarrowDiv = narrowUDivURem(I, Builder))
	    return NarrowDiv;

	  // If the udiv operands are non-overflowing multiplies with a common operand,
	  // then eliminate the common factor:
	  // (A * B) / (A * X) --> B / X (and commuted variants)
	  // TODO: The code would be reduced if we had m_c_NUWMul pattern matching.
	  // TODO: If -reassociation handled this generally, we could remove this.
	  Value *A, *B;
	  if (match(Op0, m_NUWMul(m_Value(A), m_Value(B)))) {
	    if (match(Op1, m_NUWMul(m_Specific(A), m_Value(X))) ||
	        match(Op1, m_NUWMul(m_Value(X), m_Specific(A))))
	      return BinaryOperator::CreateUDiv(B, X);
	    if (match(Op1, m_NUWMul(m_Specific(B), m_Value(X))) ||
	        match(Op1, m_NUWMul(m_Value(X), m_Specific(B))))
	      return BinaryOperator::CreateUDiv(A, X);
	  }

	  // (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...))))
	  SmallVector<UDivFoldAction, 6> UDivActions;
	  if (visitUDivOperand(Op0, Op1, I, UDivActions))
	    for (unsigned i = 0, e = UDivActions.size(); i != e; ++i) {
	      FoldUDivOperandCb Action = UDivActions[i].FoldAction;
	      Value *ActionOp1 = UDivActions[i].OperandToFold;
	      Instruction *Inst;
	      if (Action)
	        Inst = Action(Op0, ActionOp1, I, *this);
	      else {
	        // This action joins two actions together.  The RHS of this action is
	        // simply the last action we processed, we saved the LHS action index in
	        // the joining action.
	        size_t SelectRHSIdx = i - 1;
	        Value *SelectRHS = UDivActions[SelectRHSIdx].FoldResult;
	        size_t SelectLHSIdx = UDivActions[i].SelectLHSIdx;
	        Value *SelectLHS = UDivActions[SelectLHSIdx].FoldResult;
	        Inst = SelectInst::Create(cast<SelectInst>(ActionOp1)->getCondition(),
	                                  SelectLHS, SelectRHS);
	      }

	      // If this is the last action to process, return it to the InstCombiner.
	      // Otherwise, we insert it before the UDiv and record it so that we may
	      // use it as part of a joining action (i.e., a SelectInst).
	      if (e - i != 1) {
	        Inst->insertBefore(&I);
	        UDivActions[i].FoldResult = Inst;
	      } else
	        return Inst;
	    }

	  return nullptr;
	}

	/// Visitor for sdiv. Tries simplification, the vector binop fold, the common
	/// integer-division transforms, and then the sdiv-specific folds below.
	///
	/// \returns a non-null instruction when a transform fired, nullptr otherwise.
	Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
	  if (Value *V = SimplifySDivInst(I.getOperand(0), I.getOperand(1),
	                                  SQ.getWithInstruction(&I)))
	    return replaceInstUsesWith(I, V);

	  if (Instruction *X = foldVectorBinop(I))
	    return X;

	  // Handle the integer div common cases
	  if (Instruction *Common = commonIDivTransforms(I))
	    return Common;

	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
	  Value *X;
	  // sdiv Op0, -1 --> -Op0
	  // sdiv Op0, (sext i1 X) --> -Op0 (because if X is 0, the op is undefined)
	  if (match(Op1, m_AllOnes()) ||
	      (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)))
	    return BinaryOperator::CreateNeg(Op0);

	  // X / INT_MIN --> X == INT_MIN
	  if (match(Op1, m_SignMask()))
	    return new ZExtInst(Builder.CreateICmpEQ(Op0, Op1), I.getType());

	  const APInt *Op1C;
	  if (match(Op1, m_APInt(Op1C))) {
	    // sdiv exact X, C  -->  ashr exact X, log2(C)
	    if (I.isExact() && Op1C->isNonNegative() && Op1C->isPowerOf2()) {
	      Value *ShAmt = ConstantInt::get(Op1->getType(), Op1C->exactLogBase2());
	      return BinaryOperator::CreateExactAShr(Op0, ShAmt, I.getName());
	    }

	    // If the dividend is sign-extended and the constant divisor is small enough
	    // to fit in the source type, shrink the division to the narrower type:
	    // (sext X) sdiv C --> sext (X sdiv C)
	    Value *Op0Src;
	    if (match(Op0, m_OneUse(m_SExt(m_Value(Op0Src)))) &&
	        Op0Src->getType()->getScalarSizeInBits() >= Op1C->getMinSignedBits()) {

	      // In the general case, we need to make sure that the dividend is not the
	      // minimum signed value because dividing that by -1 is UB. But here, we
	      // know that the -1 divisor case is already handled above.

	      Constant *NarrowDivisor =
	          ConstantExpr::getTrunc(cast<Constant>(Op1), Op0Src->getType());
	      Value *NarrowOp = Builder.CreateSDiv(Op0Src, NarrowDivisor);
	      return new SExtInst(NarrowOp, Op0->getType());
	    }

	    // -X / C --> X / -C (if the negation doesn't overflow).
	    // TODO: This could be enhanced to handle arbitrary vector constants by
	    //       checking if all elements are not the min-signed-val.
	    if (!Op1C->isMinSignedValue() &&
	        match(Op0, m_NSWSub(m_Zero(), m_Value(X)))) {
	      Constant *NegC = ConstantInt::get(I.getType(), -(*Op1C));
	      Instruction *BO = BinaryOperator::CreateSDiv(X, NegC);
	      BO->setIsExact(I.isExact());
	      return BO;
	    }
	  }

	  // -X / Y --> -(X / Y)
	  Value *Y;
	  if (match(&I, m_SDiv(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y))))
	    return BinaryOperator::CreateNSWNeg(
	        Builder.CreateSDiv(X, Y, I.getName(), I.isExact()));

	  // If the sign bits of both operands are zero (i.e. we can prove they are
	  // unsigned inputs), turn this into a udiv.
	  APInt Mask(APInt::getSignMask(I.getType()->getScalarSizeInBits()));
	  if (MaskedValueIsZero(Op0, Mask, 0, &I)) {
	    if (MaskedValueIsZero(Op1, Mask, 0, &I)) {
	      // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
	      auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
	      BO->setIsExact(I.isExact());
	      return BO;
	    }

	    if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
	      // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y)
	      // Safe because the only negative value (1 << Y) can take on is
	      // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have
	      // the sign bit set.
	      auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
	      BO->setIsExact(I.isExact());
	      return BO;
	    }
	  }

	  return nullptr;
	}

	/// Remove negation and try to convert division into multiplication.
	///
	/// Only applies when the divisor (operand 1) of \p I is a Constant.
	/// \returns a new fdiv or fmul carrying the fast-math flags of \p I, or
	///          nullptr when no fold applies.
	static Instruction *foldFDivConstantDivisor(BinaryOperator &I) {
	  Constant *C;
	  if (!match(I.getOperand(1), m_Constant(C)))
	    return nullptr;

	  // -X / C --> X / -C
	  Value *X;
	  if (match(I.getOperand(0), m_FNeg(m_Value(X))))
	    return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I);

	  // If the constant divisor has an exact inverse, this is always safe. If not,
	  // then we can still create a reciprocal if fast-math-flags allow it and the
	  // constant is a regular number (not zero, infinite, or denormal).
	  if (!(C->hasExactInverseFP() || (I.hasAllowReciprocal() && C->isNormalFP())))
	    return nullptr;

	  // Disallow denormal constants because we don't know what would happen
	  // on all targets.
	  // TODO: Use Intrinsic::canonicalize or let function attributes tell us that
	  // denorms are flushed?
	  auto *RecipC = ConstantExpr::getFDiv(ConstantFP::get(I.getType(), 1.0), C);
	  if (!RecipC->isNormalFP())
	    return nullptr;

	  // X / C --> X * (1 / C)
	  return BinaryOperator::CreateFMulFMF(I.getOperand(0), RecipC, &I);
	}

	/// Remove negation and try to reassociate constant math.
	///
	/// Only applies when the dividend (operand 0) of \p I is a Constant.
	/// \returns a new fdiv carrying the fast-math flags of \p I, or nullptr when
	///          no fold applies.
	static Instruction *foldFDivConstantDividend(BinaryOperator &I) {
	  Constant *C;
	  if (!match(I.getOperand(0), m_Constant(C)))
	    return nullptr;

	  // C / -X --> -C / X
	  Value *X;
	  if (match(I.getOperand(1), m_FNeg(m_Value(X))))
	    return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);

	  // The reassociations below need both the reassoc and arcp flags.
	  if (!I.hasAllowReassoc() || !I.hasAllowReciprocal())
	    return nullptr;

	  // Try to reassociate C / X expressions where X includes another constant.
	  Constant *C2, *NewC = nullptr;
	  if (match(I.getOperand(1), m_FMul(m_Value(X), m_Constant(C2)))) {
	    // C / (X * C2) --> (C / C2) / X
	    NewC = ConstantExpr::getFDiv(C, C2);
	  } else if (match(I.getOperand(1), m_FDiv(m_Value(X), m_Constant(C2)))) {
	    // C / (X / C2) --> (C * C2) / X
	    NewC = ConstantExpr::getFMul(C, C2);
	  }
	  // Disallow denormal constants because we don't know what would happen
	  // on all targets.
	  // TODO: Use Intrinsic::canonicalize or let function attributes tell us that
	  // denorms are flushed?
	  if (!NewC || !NewC->isNormalFP())
	    return nullptr;

	  return BinaryOperator::CreateFDivFMF(NewC, X, &I);
	}

	/// Visitor for fdiv. Tries simplification, the vector binop fold, the
	/// constant-divisor and constant-dividend folds, sign-bit folds, folding into
	/// a select, and then the fast-math-flag-gated rewrites below.
	///
	/// \returns a non-null instruction when a transform fired, nullptr otherwise.
	Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
	  if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1),
	                                  I.getFastMathFlags(),
	                                  SQ.getWithInstruction(&I)))
	    return replaceInstUsesWith(I, V);

	  if (Instruction *X = foldVectorBinop(I))
	    return X;

	  if (Instruction *R = foldFDivConstantDivisor(I))
	    return R;

	  if (Instruction *R = foldFDivConstantDividend(I))
	    return R;

	  if (Instruction *R = foldFPSignBitOps(I))
	    return R;

	  // With one constant operand and a select as the other, try folding the
	  // fdiv into the select.
	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
	  if (isa<Constant>(Op0))
	    if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
	      if (Instruction *R = FoldOpIntoSelect(I, SI))
	        return R;

	  if (isa<Constant>(Op1))
	    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
	      if (Instruction *R = FoldOpIntoSelect(I, SI))
	        return R;

	  if (I.hasAllowReassoc() && I.hasAllowReciprocal()) {
	    Value *X, *Y;
	    if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) &&
	        (!isa<Constant>(Y) || !isa<Constant>(Op1))) {
	      // (X / Y) / Z => X / (Y * Z)
	      Value *YZ = Builder.CreateFMulFMF(Y, Op1, &I);
	      return BinaryOperator::CreateFDivFMF(X, YZ, &I);
	    }
	    if (match(Op1, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) &&
	        (!isa<Constant>(Y) || !isa<Constant>(Op0))) {
	      // Z / (X / Y) => (Y * Z) / X
	      Value *YZ = Builder.CreateFMulFMF(Y, Op0, &I);
	      return BinaryOperator::CreateFDivFMF(YZ, X, &I);
	    }
	    // Z / (1.0 / Y) => (Y * Z)
	    //
	    // This is a special case of Z / (X / Y) => (Y * Z) / X, with X = 1.0. The
	    // m_OneUse check is avoided because even in the case of the multiple uses
	    // for 1.0/Y, the number of instructions remain the same and a division is
	    // replaced by a multiplication.
	    if (match(Op1, m_FDiv(m_SpecificFP(1.0), m_Value(Y))))
	      return BinaryOperator::CreateFMulFMF(Y, Op0, &I);
	  }

	  if (I.hasAllowReassoc() && Op0->hasOneUse() && Op1->hasOneUse()) {
	    // sin(X) / cos(X) -> tan(X)
	    // cos(X) / sin(X) -> 1/tan(X) (cotangent)
	    Value *X;
	    bool IsTan = match(Op0, m_Intrinsic<Intrinsic::sin>(m_Value(X))) &&
	                 match(Op1, m_Intrinsic<Intrinsic::cos>(m_Specific(X)));
	    bool IsCot =
	        !IsTan && match(Op0, m_Intrinsic<Intrinsic::cos>(m_Value(X))) &&
	                  match(Op1, m_Intrinsic<Intrinsic::sin>(m_Specific(X)));

	    if ((IsTan || IsCot) &&
	        hasFloatFn(&TLI, I.getType(), LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) {
	      // Use a local builder positioned at I, with I's fast-math flags set for
	      // the instructions emitted here.
	      IRBuilder<> B(&I);
	      IRBuilder<>::FastMathFlagGuard FMFGuard(B);
	      B.setFastMathFlags(I.getFastMathFlags());
	      AttributeList Attrs =
	          cast<CallBase>(Op0)->getCalledFunction()->getAttributes();
	      Value *Res = emitUnaryFloatFnCall(X, &TLI, LibFunc_tan, LibFunc_tanf,
	                                        LibFunc_tanl, B, Attrs);
	      if (IsCot)
	        Res = B.CreateFDiv(ConstantFP::get(I.getType(), 1.0), Res);
	      return replaceInstUsesWith(I, Res);
	    }
	  }

	  // X / (X * Y) --> 1.0 / Y
	  // Reassociate to (X / X -> 1.0) is legal when NaNs are not allowed.
	  // We can ignore the possibility that X is infinity because INF/INF is NaN.
	  Value *X, *Y;
	  if (I.hasNoNaNs() && I.hasAllowReassoc() &&
	      match(Op1, m_c_FMul(m_Specific(Op0), m_Value(Y)))) {
	    // Rewrite the division in place rather than creating a new one.
	    replaceOperand(I, 0, ConstantFP::get(I.getType(), 1.0));
	    replaceOperand(I, 1, Y);
	    return &I;
	  }

	  // X / fabs(X) -> copysign(1.0, X)
	  // fabs(X) / X -> copysign(1.0, X)
	  if (I.hasNoNaNs() && I.hasNoInfs() &&
	      (match(&I,
	             m_FDiv(m_Value(X), m_Intrinsic<Intrinsic::fabs>(m_Deferred(X)))) ||
	       match(&I, m_FDiv(m_Intrinsic<Intrinsic::fabs>(m_Value(X)),
	                        m_Deferred(X))))) {
	    Value *V = Builder.CreateBinaryIntrinsic(
	        Intrinsic::copysign, ConstantFP::get(I.getType(), 1.0), X, &I);
	    return replaceInstUsesWith(I, V);
	  }
	  return nullptr;
	}

	/// Common integer remainder transforms.
	///
	/// This function implements the transforms common to both integer remainder
	/// instructions (urem and srem). It is called by the visitors to those integer
	/// remainder instructions.
	///
	/// \returns a non-null instruction when a transform fired (possibly \p I
	///          itself after in-place modification), nullptr otherwise.
	Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);

	  // The RHS is known non-zero.
	  if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I))
	    return replaceOperand(I, 1, V);

	  // Handle cases involving: rem X, (select Cond, Y, Z)
	  if (simplifyDivRemOfSelectWithZeroOp(I))
	    return &I;

	  if (isa<Constant>(Op1)) {
	    if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
	      if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
	        if (Instruction *R = FoldOpIntoSelect(I, SI))
	          return R;
	      } else if (auto *PN = dyn_cast<PHINode>(Op0I)) {
	        // Require a non-zero divisor, and for srem also a divisor other than
	        // the minimum signed value.
	        const APInt *Op1Int;
	        if (match(Op1, m_APInt(Op1Int)) && !Op1Int->isMinValue() &&
	            (I.getOpcode() == Instruction::URem ||
	             !Op1Int->isMinSignedValue())) {
	          // foldOpIntoPhi will speculate instructions to the end of the PHI's
	          // predecessor blocks, so do this only if we know the srem or urem
	          // will not fault.
	          if (Instruction *NV = foldOpIntoPhi(I, PN))
	            return NV;
	        }
	      }

	      // See if we can fold away this rem instruction.
	      if (SimplifyDemandedInstructionBits(I))
	        return &I;
	    }
	  }

	  return nullptr;
	}

	/// Visitor for urem. Tries simplification, the vector binop fold, the common
	/// integer-remainder transforms, narrowing, and then the urem-specific folds
	/// below.
	///
	/// \returns a non-null instruction when a transform fired, nullptr otherwise.
	Instruction *InstCombiner::visitURem(BinaryOperator &I) {
	  if (Value *V = SimplifyURemInst(I.getOperand(0), I.getOperand(1),
	                                  SQ.getWithInstruction(&I)))
	    return replaceInstUsesWith(I, V);

	  if (Instruction *X = foldVectorBinop(I))
	    return X;

	  if (Instruction *common = commonIRemTransforms(I))
	    return common;

	  if (Instruction *NarrowRem = narrowUDivURem(I, Builder))
	    return NarrowRem;

	  // X urem Y -> X and Y-1, where Y is a power of 2,
	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
	  Type *Ty = I.getType();
	  if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
	    // This may increase instruction count, we don't enforce that Y is a
	    // constant.
	    // Y-1 is formed by adding the all-ones value.
	    Constant *N1 = Constant::getAllOnesValue(Ty);
	    Value *Add = Builder.CreateAdd(Op1, N1);
	    return BinaryOperator::CreateAnd(Op0, Add);
	  }

	  // 1 urem X -> zext(X != 1)
	  if (match(Op0, m_One())) {
	    Value *Cmp = Builder.CreateICmpNE(Op1, ConstantInt::get(Ty, 1));
	    return CastInst::CreateZExtOrBitCast(Cmp, Ty);
	  }

	  // X urem C -> X < C ? X : X - C, where C >= signbit.
	  if (match(Op1, m_Negative())) {
	    Value *Cmp = Builder.CreateICmpULT(Op0, Op1);
	    Value *Sub = Builder.CreateSub(Op0, Op1);
	    return SelectInst::Create(Cmp, Op0, Sub);
	  }

	  // If the divisor is a sext of a boolean, then the divisor must be max
	  // unsigned value (-1). Therefore, the remainder is Op0 unless Op0 is also
	  // max unsigned value. In that case, the remainder is 0:
	  // urem Op0, (sext i1 X) --> (Op0 == -1) ? 0 : Op0
	  Value *X;
	  if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
	    Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty));
	    return SelectInst::Create(Cmp, ConstantInt::getNullValue(Ty), Op0);
	  }

	  return nullptr;
	}

	/// Visitor for srem. Tries simplification, the vector binop fold, the common
	/// integer-remainder transforms, and then the srem-specific folds below.
	///
	/// \returns a non-null instruction when a transform fired (possibly \p I
	///          itself after an operand was replaced), nullptr otherwise.
	Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
	  if (Value *V = SimplifySRemInst(I.getOperand(0), I.getOperand(1),
	                                  SQ.getWithInstruction(&I)))
	    return replaceInstUsesWith(I, V);

	  if (Instruction *X = foldVectorBinop(I))
	    return X;

	  // Handle the integer rem common cases
	  if (Instruction *Common = commonIRemTransforms(I))
	    return Common;

	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
	  {
	    const APInt *Y;
	    // X % -Y -> X % Y
	    if (match(Op1, m_Negative(Y)) && !Y->isMinSignedValue())
	      return replaceOperand(I, 1, ConstantInt::get(I.getType(), -*Y));
	  }

	  // -X srem Y --> -(X srem Y)
	  Value *X, *Y;
	  if (match(&I, m_SRem(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y))))
	    return BinaryOperator::CreateNSWNeg(Builder.CreateSRem(X, Y));

	  // If the sign bits of both operands are zero (i.e. we can prove they are
	  // unsigned inputs), turn this into a urem.
	  APInt Mask(APInt::getSignMask(I.getType()->getScalarSizeInBits()));
	  if (MaskedValueIsZero(Op1, Mask, 0, &I) &&
	      MaskedValueIsZero(Op0, Mask, 0, &I)) {
	    // X srem Y -> X urem Y, iff X and Y don't have sign bit set
	    return BinaryOperator::CreateURem(Op0, Op1, I.getName());
	  }

	  // If it's a constant vector, flip any negative values positive.
	  if (isa<ConstantVector>(Op1) || isa<ConstantDataVector>(Op1)) {
	    Constant *C = cast<Constant>(Op1);
	    unsigned VWidth = cast<VectorType>(C->getType())->getNumElements();

	    // First pass: look for a negative element, and give up as soon as an
	    // element cannot be retrieved.
	    bool hasNegative = false;
	    bool hasMissing = false;
	    for (unsigned i = 0; i != VWidth; ++i) {
	      Constant *Elt = C->getAggregateElement(i);
	      if (!Elt) {
	        hasMissing = true;
	        break;
	      }

	      if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elt))
	        if (RHS->isNegative())
	          hasNegative = true;
	    }

	    // Second pass: rebuild the vector with each negative ConstantInt negated.
	    if (hasNegative && !hasMissing) {
	      SmallVector<Constant *, 16> Elts(VWidth);
	      for (unsigned i = 0; i != VWidth; ++i) {
	        Elts[i] = C->getAggregateElement(i); // Handle undef, etc.
	        if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elts[i])) {
	          if (RHS->isNegative())
	            Elts[i] = cast<ConstantInt>(ConstantExpr::getNeg(RHS));
	        }
	      }

	      Constant *NewRHSV = ConstantVector::get(Elts);
	      if (NewRHSV != C) // Don't loop on -MININT
	        return replaceOperand(I, 1, NewRHSV);
	    }
	  }

	  return nullptr;
	}

	/// Visitor for frem: replace the instruction with its simplified value when
	/// one exists, otherwise fall back to the generic vector binop fold.
	///
	/// \returns the instruction produced by whichever step fired, or nullptr.
	Instruction *InstCombiner::visitFRem(BinaryOperator &I) {
	  Value *Dividend = I.getOperand(0);
	  Value *Divisor = I.getOperand(1);

	  Value *Simplified = SimplifyFRemInst(Dividend, Divisor, I.getFastMathFlags(),
	                                       SQ.getWithInstruction(&I));
	  if (Simplified)
	    return replaceInstUsesWith(I, Simplified);

	  // foldVectorBinop() yields nullptr when it does nothing, which is also this
	  // visitor's "no change" result.
	  return foldVectorBinop(I);
	}
	diff --git a/lib/clang/include/VCSVersion.inc b/lib/clang/include/VCSVersion.inc
	index 7e28d96de683..aa4d397f11b4 100644
	--- a/lib/clang/include/VCSVersion.inc
	+++ b/lib/clang/include/VCSVersion.inc
	@@ -1,14 +1,14 @@
	// $FreeBSD$

	-#define LLVM_REVISION "llvmorg-11.0.0-rc1-47-gff47911ddfc"
	+#define LLVM_REVISION "llvmorg-11.0.0-rc2-0-g414f32a9e86"
	#define LLVM_REPOSITORY "git@github.com:llvm/llvm-project.git"

	-#define CLANG_REVISION "llvmorg-11.0.0-rc1-47-gff47911ddfc"
	+#define CLANG_REVISION "llvmorg-11.0.0-rc2-0-g414f32a9e86"
	#define CLANG_REPOSITORY "git@github.com:llvm/llvm-project.git"

	// <Upstream revision at import>-<Local identifier in __FreeBSD_version style>
	-#define LLD_REVISION "llvmorg-11.0.0-rc1-47-gff47911ddfc-1200012"
	+#define LLD_REVISION "llvmorg-11.0.0-rc2-0-g414f32a9e86-1200012"
	#define LLD_REPOSITORY "FreeBSD"

	-#define LLDB_REVISION "llvmorg-11.0.0-rc1-47-gff47911ddfc"
	+#define LLDB_REVISION "llvmorg-11.0.0-rc2-0-g414f32a9e86"
	#define LLDB_REPOSITORY "git@github.com:llvm/llvm-project.git"
	diff --git a/lib/clang/include/llvm/Support/VCSRevision.h b/lib/clang/include/llvm/Support/VCSRevision.h
	index 28cef1ec5e77..48644d130a8d 100644
	--- a/lib/clang/include/llvm/Support/VCSRevision.h
	+++ b/lib/clang/include/llvm/Support/VCSRevision.h
	@@ -1,3 +1,3 @@
	/* $FreeBSD$ */
	-#define LLVM_REVISION "llvmorg-11.0.0-rc1-47-gff47911ddfc"
	+#define LLVM_REVISION "llvmorg-11.0.0-rc2-0-g414f32a9e86"
	#define LLVM_REPOSITORY "git@github.com:llvm/llvm-project.git"

File Metadata

Mime Type: application/octet-stream
Expires: Sat, Feb 22, 4:47 AM (2 d)
Storage Engine: chunks
Storage Format: Chunks
Storage Handle: rBQ_fNJJj3gx
Default Alt Text: (4 MB)

Offset	End	Complete
0	4194304	Yes
4194304	4847723	Yes

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions